diff --git a/docs/build/html/.buildinfo b/docs/build/html/.buildinfo
index b92778bcc..c20792bb3 100644
--- a/docs/build/html/.buildinfo
+++ b/docs/build/html/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: aa9c5e5c24909e63ee7546fee2f7474d
+config: 8880bb5f0a2c9a353db73959d72b9edf
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/build/html/_sources/dev/extensions.rst b/docs/build/html/_sources/dev/extensions.rst
index ecb418468..196f8bf65 100644
--- a/docs/build/html/_sources/dev/extensions.rst
+++ b/docs/build/html/_sources/dev/extensions.rst
@@ -494,7 +494,7 @@ below.
 
         // Prepare to encode kernel
         auto& compute_encoder = d.get_command_encoder(s.index);
-        compute_encoder->setComputePipelineState(kernel);
+        compute_encoder.set_compute_pipeline_state(kernel);
 
         // Kernel parameters are registered with buffer indices corresponding to
         // those in the kernel declaration at axpby.metal
@@ -509,14 +509,14 @@ below.
         compute_encoder.set_output_array(out, 2);
 
         // Encode alpha and beta
-        compute_encoder->setBytes(&alpha_, sizeof(float), 3);
-        compute_encoder->setBytes(&beta_, sizeof(float), 4);
+        compute_encoder.set_bytes(alpha_, 3);
+        compute_encoder.set_bytes(beta_, 4);
 
         // Encode shape, strides and ndim
-        compute_encoder->setBytes(x.shape().data(), ndim * sizeof(int), 5);
-        compute_encoder->setBytes(x.strides().data(), ndim * sizeof(size_t), 6);
-        compute_encoder->setBytes(y.strides().data(), ndim * sizeof(size_t), 7);
-        compute_encoder->setBytes(&ndim, sizeof(int), 8);
+        compute_encoder.set_vector_bytes(x.shape(), 5);
+        compute_encoder.set_vector_bytes(x.strides(), 6);
+        compute_encoder.set_vector_bytes(y.strides(), 7);
+        compute_encoder.set_bytes(ndim, 8);
 
         // We launch 1 thread for each input and make sure that the number of
         // threads in any given threadgroup is not higher than the max allowed
@@ -530,7 +530,7 @@ below.
 
         // Launch the grid with the given number of threads divided among
         // the given threadgroups
-        compute_encoder.dispatchThreads(grid_dims, group_dims);
+        compute_encoder.dispatch_threads(grid_dims, group_dims);
     }
 
 We can now call the :meth:`axpby` operation on both the CPU and the GPU!
diff --git a/docs/build/html/_sources/install.rst b/docs/build/html/_sources/install.rst
index 7c68942f4..1566d1f2c 100644
--- a/docs/build/html/_sources/install.rst
+++ b/docs/build/html/_sources/install.rst
@@ -209,7 +209,7 @@ Metal library by run-time compiling kernels the first time they are used in MLX
 on a given machine. Note run-time compilation incurs a cold-start cost which can
 be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
-Metal kernel cache persists accross reboots.
+Metal kernel cache persists across reboots.
 
 Troubleshooting
 ^^^^^^^^^^^^^^^
diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.fast.affine_quantize.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.fast.affine_quantize.rst
deleted file mode 100644
index 9daa5e111..000000000
--- a/docs/build/html/_sources/python/_autosummary/mlx.core.fast.affine_quantize.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-﻿mlx.core.fast.affine\_quantize
-==============================
-
-.. currentmodule:: mlx.core.fast
-
-.. autofunction:: affine_quantize
\ No newline at end of file
diff --git a/docs/build/html/_sources/python/fast.rst b/docs/build/html/_sources/python/fast.rst
index 30ade264e..f78f40563 100644
--- a/docs/build/html/_sources/python/fast.rst
+++ b/docs/build/html/_sources/python/fast.rst
@@ -12,5 +12,4 @@ Fast
   layer_norm
   rope
   scaled_dot_product_attention
-  affine_quantize
   metal_kernel
diff --git a/docs/build/html/_sources/python/nn/_autosummary/mlx.nn.AvgPool3d.rst b/docs/build/html/_sources/python/nn/_autosummary/mlx.nn.AvgPool3d.rst
new file mode 100644
index 000000000..71dc765d5
--- /dev/null
+++ b/docs/build/html/_sources/python/nn/_autosummary/mlx.nn.AvgPool3d.rst
@@ -0,0 +1,16 @@
+﻿mlx.nn.AvgPool3d
+================
+
+.. currentmodule:: mlx.nn
+
+.. autoclass:: AvgPool3d
+
+   
+
+   
+   .. rubric:: Methods
+
+   .. autosummary::
+   
+   
+   
diff --git a/docs/build/html/_sources/python/nn/_autosummary/mlx.nn.MaxPool3d.rst b/docs/build/html/_sources/python/nn/_autosummary/mlx.nn.MaxPool3d.rst
new file mode 100644
index 000000000..2656eef35
--- /dev/null
+++ b/docs/build/html/_sources/python/nn/_autosummary/mlx.nn.MaxPool3d.rst
@@ -0,0 +1,16 @@
+﻿mlx.nn.MaxPool3d
+================
+
+.. currentmodule:: mlx.nn
+
+.. autoclass:: MaxPool3d
+
+   
+
+   
+   .. rubric:: Methods
+
+   .. autosummary::
+   
+   
+   
diff --git a/docs/build/html/_sources/python/nn/layers.rst b/docs/build/html/_sources/python/nn/layers.rst
index fc24d410b..4eb14b088 100644
--- a/docs/build/html/_sources/python/nn/layers.rst
+++ b/docs/build/html/_sources/python/nn/layers.rst
@@ -12,6 +12,7 @@ Layers
    ALiBi
    AvgPool1d
    AvgPool2d
+   AvgPool3d
    BatchNorm
    CELU
    Conv1d
@@ -41,6 +42,7 @@ Layers
    LSTM
    MaxPool1d
    MaxPool2d
+   MaxPool3d
    Mish
    MultiHeadAttention
    PReLU
diff --git a/docs/build/html/_sources/usage/function_transforms.rst b/docs/build/html/_sources/usage/function_transforms.rst
index 9769fceaa..045c36c93 100644
--- a/docs/build/html/_sources/usage/function_transforms.rst
+++ b/docs/build/html/_sources/usage/function_transforms.rst
@@ -184,8 +184,8 @@ Let's time these two different versions:
   print(timeit.timeit(lambda: mx.eval(naive_add(xs, ys)), number=100))
   print(timeit.timeit(lambda: mx.eval(vmap_add(xs, ys)), number=100))
 
-On an M1 Max the naive version takes in total ``0.390`` seconds whereas the
-vectorized version takes only ``0.025`` seconds, more than ten times faster.
+On an M1 Max the naive version takes in total ``5.639`` seconds whereas the
+vectorized version takes only ``0.024`` seconds, more than 200 times faster.
 
 Of course, this operation is quite contrived. A better approach is to simply do
 ``xs + ys.T``, but for more complex functions :func:`vmap` can be quite handy.
diff --git a/docs/build/html/_static/documentation_options.js b/docs/build/html/_static/documentation_options.js
index a531c3d56..75c0fd73e 100644
--- a/docs/build/html/_static/documentation_options.js
+++ b/docs/build/html/_static/documentation_options.js
@@ -1,5 +1,5 @@
 const DOCUMENTATION_OPTIONS = {
-    VERSION: '0.20.0',
+    VERSION: '0.21.0',
     LANGUAGE: 'en',
     COLLAPSE_INDEX: false,
     BUILDER: 'html',
diff --git a/docs/build/html/annotated.html b/docs/build/html/annotated.html
index 87141e0a0..cbd46ef82 100644
--- a/docs/build/html/annotated.html
+++ b/docs/build/html/annotated.html
@@ -236,129 +236,135 @@ $(function(){ initResizable(false); });
 <tr id="row_1_0_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1complex64__t.html" target="_self">complex64_t</a></td><td class="desc"></td></tr>
 <tr id="row_1_0_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_concatenate.html" target="_self">Concatenate</a></td><td class="desc"></td></tr>
 <tr id="row_1_0_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_conjugate.html" target="_self">Conjugate</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html" target="_self">ContiguousIterator</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_convolution.html" target="_self">Convolution</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_copy.html" target="_self">Copy</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cos.html" target="_self">Cos</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cosh.html" target="_self">Cosh</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_42_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_custom_transforms.html" target="_self">CustomTransforms</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_43_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_contiguous_reduce.html" target="_self">DefaultContiguousReduce</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_44_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_strided_reduce.html" target="_self">DefaultStridedReduce</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_45_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_depends.html" target="_self">Depends</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_46_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_device.html" target="_self">Device</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_47_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_divide.html" target="_self">Divide</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_48_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_div_mod.html" target="_self">DivMod</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_49_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_dtype.html" target="_self">Dtype</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_50_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_eigh.html" target="_self">Eigh</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_51_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_equal.html" target="_self">Equal</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_52_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf.html" target="_self">Erf</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_53_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf_inv.html" target="_self">ErfInv</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_54_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_event.html" target="_self">Event</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_55_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_exp.html" target="_self">Exp</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_56_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_expm1.html" target="_self">Expm1</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_57_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_f_f_t.html" target="_self">FFT</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_58_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_floor.html" target="_self">Floor</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_59_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_full.html" target="_self">Full</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_60_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather.html" target="_self">Gather</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_61_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_m_m.html" target="_self">GatherMM</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_62_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html" target="_self">GatherQMM</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_63_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater.html" target="_self">Greater</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_64_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater_equal.html" target="_self">GreaterEqual</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_65_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_hadamard.html" target="_self">Hadamard</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_66_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_imag.html" target="_self">Imag</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_67_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_inverse.html" target="_self">Inverse</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_68_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less.html" target="_self">Less</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_69_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less_equal.html" target="_self">LessEqual</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_70_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_load.html" target="_self">Load</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_71_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log.html" target="_self">Log</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_72_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log1p.html" target="_self">Log1p</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_73_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log_add_exp.html" target="_self">LogAddExp</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_74_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_and.html" target="_self">LogicalAnd</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_75_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_not.html" target="_self">LogicalNot</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_76_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_or.html" target="_self">LogicalOr</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_77_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_matmul.html" target="_self">Matmul</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_78_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_maximum.html" target="_self">Maximum</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_79_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_minimum.html" target="_self">Minimum</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_80_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_multiply.html" target="_self">Multiply</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_81_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_negative.html" target="_self">Negative</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_82_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_node_namer.html" target="_self">NodeNamer</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_83_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_not_equal.html" target="_self">NotEqual</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_84_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_number_of_elements.html" target="_self">NumberOfElements</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_85_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_pad.html" target="_self">Pad</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_86_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_partition.html" target="_self">Partition</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_87_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_power.html" target="_self">Power</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_88_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_primitive.html" target="_self">Primitive</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_89_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_print_formatter.html" target="_self">PrintFormatter</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_90_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_q_r_f.html" target="_self">QRF</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_91_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html" target="_self">QuantizedMatmul</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_92_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_random_bits.html" target="_self">RandomBits</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_93_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_real.html" target="_self">Real</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_94_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reduce.html" target="_self">Reduce</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_95_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_reduction_plan.html" target="_self">ReductionPlan</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_96_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_remainder.html" target="_self">Remainder</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_97_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reshape.html" target="_self">Reshape</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_98_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_round.html" target="_self">Round</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_99_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scan.html" target="_self">Scan</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_100_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scatter.html" target="_self">Scatter</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_101_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_select.html" target="_self">Select</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_102_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sigmoid.html" target="_self">Sigmoid</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_103_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sign.html" target="_self">Sign</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_104_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sin.html" target="_self">Sin</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_105_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sinh.html" target="_self">Sinh</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_106_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice.html" target="_self">Slice</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_107_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice_update.html" target="_self">SliceUpdate</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_108_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_softmax.html" target="_self">Softmax</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_109_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sort.html" target="_self">Sort</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_110_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_split.html" target="_self">Split</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_111_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sqrt.html" target="_self">Sqrt</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_112_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_square.html" target="_self">Square</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_113_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_stop_gradient.html" target="_self">StopGradient</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_114_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream.html" target="_self">Stream</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_115_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream_context.html" target="_self">StreamContext</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_116_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_subtract.html" target="_self">Subtract</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_117_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_s_v_d.html" target="_self">SVD</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_118_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tan.html" target="_self">Tan</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_119_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tanh.html" target="_self">Tanh</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_120_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_transpose.html" target="_self">Transpose</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_121_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_type_to_dtype.html" target="_self">TypeToDtype</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_122_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html" target="_self">UnaryPrimitive</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_123_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_uniform.html" target="_self">Uniform</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_124_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_view.html" target="_self">View</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_contiguous.html" target="_self">Contiguous</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html" target="_self">ContiguousIterator</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_convolution.html" target="_self">Convolution</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_copy.html" target="_self">Copy</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cos.html" target="_self">Cos</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_42_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cosh.html" target="_self">Cosh</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_43_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_custom_transforms.html" target="_self">CustomTransforms</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_44_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_contiguous_reduce.html" target="_self">DefaultContiguousReduce</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_45_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_strided_reduce.html" target="_self">DefaultStridedReduce</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_46_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_depends.html" target="_self">Depends</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_47_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_device.html" target="_self">Device</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_48_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_divide.html" target="_self">Divide</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_49_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_div_mod.html" target="_self">DivMod</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_50_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_dtype.html" target="_self">Dtype</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_51_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_eigh.html" target="_self">Eigh</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_52_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_equal.html" target="_self">Equal</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_53_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf.html" target="_self">Erf</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_54_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf_inv.html" target="_self">ErfInv</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_55_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_event.html" target="_self">Event</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_56_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_exp.html" target="_self">Exp</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_57_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_expm1.html" target="_self">Expm1</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_58_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_f_f_t.html" target="_self">FFT</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_59_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_floor.html" target="_self">Floor</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_60_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_full.html" target="_self">Full</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_61_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather.html" target="_self">Gather</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_62_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_m_m.html" target="_self">GatherMM</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_63_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html" target="_self">GatherQMM</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_64_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater.html" target="_self">Greater</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_65_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater_equal.html" target="_self">GreaterEqual</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_66_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_hadamard.html" target="_self">Hadamard</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_67_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_imag.html" target="_self">Imag</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_68_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_inverse.html" target="_self">Inverse</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_69_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less.html" target="_self">Less</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_70_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less_equal.html" target="_self">LessEqual</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_71_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_load.html" target="_self">Load</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_72_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log.html" target="_self">Log</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_73_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log1p.html" target="_self">Log1p</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_74_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log_add_exp.html" target="_self">LogAddExp</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_75_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_and.html" target="_self">LogicalAnd</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_76_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_not.html" target="_self">LogicalNot</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_77_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_or.html" target="_self">LogicalOr</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_78_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_matmul.html" target="_self">Matmul</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_79_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_maximum.html" target="_self">Maximum</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_80_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_minimum.html" target="_self">Minimum</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_81_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_multiply.html" target="_self">Multiply</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_82_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_negative.html" target="_self">Negative</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_83_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_node_namer.html" target="_self">NodeNamer</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_84_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_not_equal.html" target="_self">NotEqual</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_85_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_number_of_elements.html" target="_self">NumberOfElements</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_86_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_pad.html" target="_self">Pad</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_87_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_partition.html" target="_self">Partition</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_88_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_power.html" target="_self">Power</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_89_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_primitive.html" target="_self">Primitive</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_90_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_print_formatter.html" target="_self">PrintFormatter</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_91_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_q_r_f.html" target="_self">QRF</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_92_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html" target="_self">QuantizedMatmul</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_93_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_random_bits.html" target="_self">RandomBits</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_94_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_real.html" target="_self">Real</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_95_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reduce.html" target="_self">Reduce</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_96_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_reduction_plan.html" target="_self">ReductionPlan</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_97_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_remainder.html" target="_self">Remainder</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_98_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reshape.html" target="_self">Reshape</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_99_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_round.html" target="_self">Round</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_100_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scan.html" target="_self">Scan</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_101_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scatter.html" target="_self">Scatter</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_102_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_select.html" target="_self">Select</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_103_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sigmoid.html" target="_self">Sigmoid</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_104_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sign.html" target="_self">Sign</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_105_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sin.html" target="_self">Sin</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_106_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sinh.html" target="_self">Sinh</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_107_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice.html" target="_self">Slice</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_108_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice_update.html" target="_self">SliceUpdate</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_109_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_softmax.html" target="_self">Softmax</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_110_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sort.html" target="_self">Sort</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_111_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_split.html" target="_self">Split</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_112_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sqrt.html" target="_self">Sqrt</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_113_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_square.html" target="_self">Square</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_114_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_stop_gradient.html" target="_self">StopGradient</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_115_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream.html" target="_self">Stream</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_116_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream_context.html" target="_self">StreamContext</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_117_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_subtract.html" target="_self">Subtract</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_118_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_s_v_d.html" target="_self">SVD</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_119_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tan.html" target="_self">Tan</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_120_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tanh.html" target="_self">Tanh</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_121_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_transpose.html" target="_self">Transpose</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_122_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_type_to_dtype.html" target="_self">TypeToDtype</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_123_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html" target="_self">UnaryPrimitive</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_124_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_uniform.html" target="_self">Uniform</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_125_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_view.html" target="_self">View</a></td><td class="desc"></td></tr>
 <tr id="row_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_1_1_" class="arrow" onclick="dynsection.toggleFolder('1_1_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1steel.html" target="_self">steel</a></td><td class="desc"></td></tr>
 <tr id="row_1_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html" target="_self">AccumHelper</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html" target="_self">BaseMMAFrag</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html" target="_self">BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_3_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span id="arr_1_1_3_" class="arrow" onclick="dynsection.toggleFolder('1_1_3_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader.html" target="_self">BlockLoader</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_3_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html" target="_self">ReadVector</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html" target="_self">BlockMMA</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html" target="_self">BlockSwizzle</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper.html" target="_self">ChannelHelper</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html" target="_self">ChannelHelper&lt; 1 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html" target="_self">ChannelHelper&lt; 2 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html" target="_self">ChannelHelper&lt; 3 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html" target="_self">ChannelHelper&lt; 4 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_base_info.html" target="_self">Conv2DGeneralBaseInfo</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html" target="_self">Conv2DGeneralJumpParams</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html" target="_self">Conv2DInputBlockLoaderGeneral</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html" target="_self">Conv2DInputBlockLoaderLargeFilter</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html" target="_self">Conv2DInputBlockLoaderSmallChannels</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html" target="_self">Conv2DInputBlockLoaderSmallFilter</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html" target="_self">Conv2DWeightBlockLoader</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html" target="_self">Conv2DWeightBlockLoaderGeneral</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html" target="_self">Conv2DWeightBlockLoaderSmallChannels</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html" target="_self">GEMMAddMMParams</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html" target="_self">GEMMKernel</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html" target="_self">GEMMParams</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html" target="_self">GEMMSpiltKParams</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html" target="_self">ImplicitGemmConv2DParams</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1integral__constant.html" target="_self">integral_constant</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral.html" target="_self">is_integral</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html" target="_self">is_integral&lt; integral_constant&lt; T, v &gt; &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html" target="_self">LoopAlignment</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">MMATile</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_add.html" target="_self">TransformAdd</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_31_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html" target="_self">TransformAxpby</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_none.html" target="_self">TransformNone</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_attn_params.html" target="_self">AttnParams</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html" target="_self">BaseMMAFrag</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html" target="_self">BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_4_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span id="arr_1_1_4_" class="arrow" onclick="dynsection.toggleFolder('1_1_4_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader.html" target="_self">BlockLoader</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_4_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html" target="_self">ReadVector</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html" target="_self">BlockLoaderT</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html" target="_self">BlockMMA</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html" target="_self">BlockSwizzle</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper.html" target="_self">ChannelHelper</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html" target="_self">ChannelHelper&lt; 1 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html" target="_self">ChannelHelper&lt; 2 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html" target="_self">ChannelHelper&lt; 3 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html" target="_self">ChannelHelper&lt; 4 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_base_info.html" target="_self">Conv2DGeneralBaseInfo</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html" target="_self">Conv2DGeneralJumpParams</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html" target="_self">Conv2DInputBlockLoaderGeneral</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html" target="_self">Conv2DInputBlockLoaderLargeFilter</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html" target="_self">Conv2DInputBlockLoaderSmallChannels</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html" target="_self">Conv2DInputBlockLoaderSmallFilter</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html" target="_self">Conv2DWeightBlockLoader</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html" target="_self">Conv2DWeightBlockLoaderGeneral</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html" target="_self">Conv2DWeightBlockLoaderSmallChannels</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_c_shape.html" target="_self">CShape</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html" target="_self">GEMMAddMMParams</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html" target="_self">GEMMKernel</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html" target="_self">GEMMParams</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html" target="_self">GEMMSpiltKParams</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html" target="_self">ImplicitGemmConv2DParams</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1integral__constant.html" target="_self">integral_constant</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral.html" target="_self">is_integral</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html" target="_self">is_integral&lt; integral_constant&lt; T, v &gt; &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_31_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html" target="_self">Layout2D</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html" target="_self">LoopAlignment</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_33_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">MMATile</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html" target="_self">Shape2D</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_add.html" target="_self">TransformAdd</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html" target="_self">TransformAxpby</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_none.html" target="_self">TransformNone</a></td><td class="desc"></td></tr>
 <tr id="row_2_" class="even"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_2_" class="arrow" onclick="dynsection.toggleFolder('2_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft.html" target="_self">pocketfft</a></td><td class="desc"></td></tr>
 <tr id="row_2_0_" class="odd" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_2_0_" class="arrow" onclick="dynsection.toggleFolder('2_0_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft_1_1detail.html" target="_self">detail</a></td><td class="desc"></td></tr>
 <tr id="row_2_0_0_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span id="arr_2_0_0_" class="arrow" onclick="dynsection.toggleFolder('2_0_0_')">&#9658;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft_1_1detail_1_1threading.html" target="_self">threading</a></td><td class="desc"></td></tr>
@@ -423,88 +429,93 @@ $(function(){ initResizable(false); });
 <tr id="row_29_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_sum.html" target="_self">CumSum</a></td><td class="desc"></td></tr>
 <tr id="row_30_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_divide.html" target="_self">Divide</a></td><td class="desc"></td></tr>
 <tr id="row_31_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_div_mod.html" target="_self">DivMod</a></td><td class="desc"></td></tr>
-<tr id="row_32_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_equal.html" target="_self">Equal</a></td><td class="desc"></td></tr>
-<tr id="row_33_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf.html" target="_self">Erf</a></td><td class="desc"></td></tr>
-<tr id="row_34_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf_inv.html" target="_self">ErfInv</a></td><td class="desc"></td></tr>
-<tr id="row_35_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_exp.html" target="_self">Exp</a></td><td class="desc"></td></tr>
-<tr id="row_36_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_expm1.html" target="_self">Expm1</a></td><td class="desc"></td></tr>
-<tr id="row_37_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor.html" target="_self">Floor</a></td><td class="desc"></td></tr>
-<tr id="row_38_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor_divide.html" target="_self">FloorDivide</a></td><td class="desc"></td></tr>
-<tr id="row_39_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_kernel.html" target="_self">GEMVKernel</a></td><td class="desc"></td></tr>
-<tr id="row_40_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_t_kernel.html" target="_self">GEMVTKernel</a></td><td class="desc">Vector matrix multiplication </td></tr>
-<tr id="row_41_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater.html" target="_self">Greater</a></td><td class="desc"></td></tr>
-<tr id="row_42_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater_equal.html" target="_self">GreaterEqual</a></td><td class="desc"></td></tr>
-<tr id="row_43_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_imag.html" target="_self">Imag</a></td><td class="desc"></td></tr>
-<tr id="row_44_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_indices.html" target="_self">Indices</a></td><td class="desc"></td></tr>
-<tr id="row_45_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_merge_sort.html" target="_self">KernelMergeSort</a></td><td class="desc"></td></tr>
-<tr id="row_46_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_multi_block_merge_sort.html" target="_self">KernelMultiBlockMergeSort</a></td><td class="desc"></td></tr>
-<tr id="row_47_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_left_shift.html" target="_self">LeftShift</a></td><td class="desc"></td></tr>
-<tr id="row_48_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less.html" target="_self">Less</a></td><td class="desc"></td></tr>
-<tr id="row_49_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_equal.html" target="_self">LessEqual</a></td><td class="desc"></td></tr>
-<tr id="row_50_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_than.html" target="_self">LessThan</a></td><td class="desc"></td></tr>
-<tr id="row_51_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits.html" target="_self">Limits</a></td><td class="desc"></td></tr>
-<tr id="row_52_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bfloat16__t_01_4.html" target="_self">Limits&lt; bfloat16_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_53_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bool_01_4.html" target="_self">Limits&lt; bool &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_54_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01complex64__t_01_4.html" target="_self">Limits&lt; complex64_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_55_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01float_01_4.html" target="_self">Limits&lt; float &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_56_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01half_01_4.html" target="_self">Limits&lt; half &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_57_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int16__t_01_4.html" target="_self">Limits&lt; int16_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_58_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int32__t_01_4.html" target="_self">Limits&lt; int32_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_59_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int64__t_01_4.html" target="_self">Limits&lt; int64_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_60_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int8__t_01_4.html" target="_self">Limits&lt; int8_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_61_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint16__t_01_4.html" target="_self">Limits&lt; uint16_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_62_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint32__t_01_4.html" target="_self">Limits&lt; uint32_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_63_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint64__t_01_4.html" target="_self">Limits&lt; uint64_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_64_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint8__t_01_4.html" target="_self">Limits&lt; uint8_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_65_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log.html" target="_self">Log</a></td><td class="desc"></td></tr>
-<tr id="row_66_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log10.html" target="_self">Log10</a></td><td class="desc"></td></tr>
-<tr id="row_67_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log1p.html" target="_self">Log1p</a></td><td class="desc"></td></tr>
-<tr id="row_68_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log2.html" target="_self">Log2</a></td><td class="desc"></td></tr>
-<tr id="row_69_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log_add_exp.html" target="_self">LogAddExp</a></td><td class="desc"></td></tr>
-<tr id="row_70_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_and.html" target="_self">LogicalAnd</a></td><td class="desc"></td></tr>
-<tr id="row_71_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_not.html" target="_self">LogicalNot</a></td><td class="desc"></td></tr>
-<tr id="row_72_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_or.html" target="_self">LogicalOr</a></td><td class="desc"></td></tr>
-<tr id="row_73_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structlooped__elem__to__loc.html" target="_self">looped_elem_to_loc</a></td><td class="desc"></td></tr>
-<tr id="row_74_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html" target="_self">looped_elem_to_loc&lt; 0, offset_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_75_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html" target="_self">looped_elem_to_loc&lt; 1, offset_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_76_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_max.html" target="_self">Max</a></td><td class="desc"></td></tr>
-<tr id="row_77_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_maximum.html" target="_self">Maximum</a></td><td class="desc"></td></tr>
-<tr id="row_78_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_min.html" target="_self">Min</a></td><td class="desc"></td></tr>
-<tr id="row_79_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_minimum.html" target="_self">Minimum</a></td><td class="desc"></td></tr>
-<tr id="row_80_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic.html" target="_self">mlx_atomic</a></td><td class="desc"></td></tr>
-<tr id="row_81_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html" target="_self">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_82_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_conv_params.html" target="_self">MLXConvParams</a></td><td class="desc"></td></tr>
-<tr id="row_83_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_fast_attention_params.html" target="_self">MLXFastAttentionParams</a></td><td class="desc"></td></tr>
-<tr id="row_84_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html" target="_self">MLXScaledDotProductAttentionParams</a></td><td class="desc"></td></tr>
-<tr id="row_85_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_multiply.html" target="_self">Multiply</a></td><td class="desc"></td></tr>
-<tr id="row_86_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_na_n_equal.html" target="_self">NaNEqual</a></td><td class="desc"></td></tr>
-<tr id="row_87_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_negative.html" target="_self">Negative</a></td><td class="desc"></td></tr>
-<tr id="row_88_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_none.html" target="_self">None</a></td><td class="desc"></td></tr>
-<tr id="row_89_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_not_equal.html" target="_self">NotEqual</a></td><td class="desc"></td></tr>
-<tr id="row_90_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_or.html" target="_self">Or</a></td><td class="desc"></td></tr>
-<tr id="row_91_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_power.html" target="_self">Power</a></td><td class="desc"></td></tr>
-<tr id="row_92_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_prod.html" target="_self">Prod</a></td><td class="desc"></td></tr>
-<tr id="row_93_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_quantized_block_loader.html" target="_self">QuantizedBlockLoader</a></td><td class="desc"></td></tr>
-<tr id="row_94_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_read_writer.html" target="_self">ReadWriter</a></td><td class="desc"></td></tr>
-<tr id="row_95_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_real.html" target="_self">Real</a></td><td class="desc"></td></tr>
-<tr id="row_96_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_remainder.html" target="_self">Remainder</a></td><td class="desc"></td></tr>
-<tr id="row_97_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_right_shift.html" target="_self">RightShift</a></td><td class="desc"></td></tr>
-<tr id="row_98_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_round.html" target="_self">Round</a></td><td class="desc"></td></tr>
-<tr id="row_99_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_rsqrt.html" target="_self">Rsqrt</a></td><td class="desc"></td></tr>
-<tr id="row_100_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_scale_op.html" target="_self">ScaleOp</a></td><td class="desc"></td></tr>
-<tr id="row_101_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_select.html" target="_self">Select</a></td><td class="desc"></td></tr>
-<tr id="row_102_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sigmoid.html" target="_self">Sigmoid</a></td><td class="desc"></td></tr>
-<tr id="row_103_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sign.html" target="_self">Sign</a></td><td class="desc"></td></tr>
-<tr id="row_104_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sin.html" target="_self">Sin</a></td><td class="desc"></td></tr>
-<tr id="row_105_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sinh.html" target="_self">Sinh</a></td><td class="desc"></td></tr>
-<tr id="row_106_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sqrt.html" target="_self">Sqrt</a></td><td class="desc"></td></tr>
-<tr id="row_107_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_square.html" target="_self">Square</a></td><td class="desc"></td></tr>
-<tr id="row_108_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_subtract.html" target="_self">Subtract</a></td><td class="desc"></td></tr>
-<tr id="row_109_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sum.html" target="_self">Sum</a></td><td class="desc"></td></tr>
-<tr id="row_110_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tan.html" target="_self">Tan</a></td><td class="desc"></td></tr>
-<tr id="row_111_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tanh.html" target="_self">Tanh</a></td><td class="desc"></td></tr>
-<tr id="row_112_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="class_thread_pool.html" target="_self">ThreadPool</a></td><td class="desc"></td></tr>
-<tr id="row_113_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_thread_sort.html" target="_self">ThreadSort</a></td><td class="desc"></td></tr>
+<tr id="row_32_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_div_op.html" target="_self">DivOp</a></td><td class="desc"></td></tr>
+<tr id="row_33_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_equal.html" target="_self">Equal</a></td><td class="desc"></td></tr>
+<tr id="row_34_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf.html" target="_self">Erf</a></td><td class="desc"></td></tr>
+<tr id="row_35_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf_inv.html" target="_self">ErfInv</a></td><td class="desc"></td></tr>
+<tr id="row_36_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_exp.html" target="_self">Exp</a></td><td class="desc"></td></tr>
+<tr id="row_37_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_expm1.html" target="_self">Expm1</a></td><td class="desc"></td></tr>
+<tr id="row_38_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_exp_sub_op.html" target="_self">ExpSubOp</a></td><td class="desc"></td></tr>
+<tr id="row_39_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor.html" target="_self">Floor</a></td><td class="desc"></td></tr>
+<tr id="row_40_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor_divide.html" target="_self">FloorDivide</a></td><td class="desc"></td></tr>
+<tr id="row_41_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_kernel.html" target="_self">GEMVKernel</a></td><td class="desc"></td></tr>
+<tr id="row_42_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_t_kernel.html" target="_self">GEMVTKernel</a></td><td class="desc">Vector matrix multiplication </td></tr>
+<tr id="row_43_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater.html" target="_self">Greater</a></td><td class="desc"></td></tr>
+<tr id="row_44_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater_equal.html" target="_self">GreaterEqual</a></td><td class="desc"></td></tr>
+<tr id="row_45_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_imag.html" target="_self">Imag</a></td><td class="desc"></td></tr>
+<tr id="row_46_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_indices.html" target="_self">Indices</a></td><td class="desc"></td></tr>
+<tr id="row_47_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_merge_sort.html" target="_self">KernelMergeSort</a></td><td class="desc"></td></tr>
+<tr id="row_48_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_multi_block_merge_sort.html" target="_self">KernelMultiBlockMergeSort</a></td><td class="desc"></td></tr>
+<tr id="row_49_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_left_shift.html" target="_self">LeftShift</a></td><td class="desc"></td></tr>
+<tr id="row_50_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less.html" target="_self">Less</a></td><td class="desc"></td></tr>
+<tr id="row_51_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_equal.html" target="_self">LessEqual</a></td><td class="desc"></td></tr>
+<tr id="row_52_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_than.html" target="_self">LessThan</a></td><td class="desc"></td></tr>
+<tr id="row_53_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits.html" target="_self">Limits</a></td><td class="desc"></td></tr>
+<tr id="row_54_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bfloat16__t_01_4.html" target="_self">Limits&lt; bfloat16_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_55_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bool_01_4.html" target="_self">Limits&lt; bool &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_56_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01complex64__t_01_4.html" target="_self">Limits&lt; complex64_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_57_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01float_01_4.html" target="_self">Limits&lt; float &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_58_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01half_01_4.html" target="_self">Limits&lt; half &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_59_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int16__t_01_4.html" target="_self">Limits&lt; int16_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_60_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int32__t_01_4.html" target="_self">Limits&lt; int32_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_61_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int64__t_01_4.html" target="_self">Limits&lt; int64_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_62_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int8__t_01_4.html" target="_self">Limits&lt; int8_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_63_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint16__t_01_4.html" target="_self">Limits&lt; uint16_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_64_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint32__t_01_4.html" target="_self">Limits&lt; uint32_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_65_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint64__t_01_4.html" target="_self">Limits&lt; uint64_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_66_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint8__t_01_4.html" target="_self">Limits&lt; uint8_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_67_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log.html" target="_self">Log</a></td><td class="desc"></td></tr>
+<tr id="row_68_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log10.html" target="_self">Log10</a></td><td class="desc"></td></tr>
+<tr id="row_69_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log1p.html" target="_self">Log1p</a></td><td class="desc"></td></tr>
+<tr id="row_70_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log2.html" target="_self">Log2</a></td><td class="desc"></td></tr>
+<tr id="row_71_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log_add_exp.html" target="_self">LogAddExp</a></td><td class="desc"></td></tr>
+<tr id="row_72_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_and.html" target="_self">LogicalAnd</a></td><td class="desc"></td></tr>
+<tr id="row_73_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_not.html" target="_self">LogicalNot</a></td><td class="desc"></td></tr>
+<tr id="row_74_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_or.html" target="_self">LogicalOr</a></td><td class="desc"></td></tr>
+<tr id="row_75_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_looped_elem_to_loc.html" target="_self">LoopedElemToLoc</a></td><td class="desc"></td></tr>
+<tr id="row_76_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html" target="_self">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_77_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html" target="_self">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_78_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_max.html" target="_self">Max</a></td><td class="desc"></td></tr>
+<tr id="row_79_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_maximum.html" target="_self">Maximum</a></td><td class="desc"></td></tr>
+<tr id="row_80_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_max_op.html" target="_self">MaxOp</a></td><td class="desc"></td></tr>
+<tr id="row_81_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_min.html" target="_self">Min</a></td><td class="desc"></td></tr>
+<tr id="row_82_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_minimum.html" target="_self">Minimum</a></td><td class="desc"></td></tr>
+<tr id="row_83_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic.html" target="_self">mlx_atomic</a></td><td class="desc"></td></tr>
+<tr id="row_84_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html" target="_self">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_85_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_conv_params.html" target="_self">MLXConvParams</a></td><td class="desc"></td></tr>
+<tr id="row_86_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_mul_op.html" target="_self">MulOp</a></td><td class="desc"></td></tr>
+<tr id="row_87_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_multiply.html" target="_self">Multiply</a></td><td class="desc"></td></tr>
+<tr id="row_88_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_na_n_equal.html" target="_self">NaNEqual</a></td><td class="desc"></td></tr>
+<tr id="row_89_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_negative.html" target="_self">Negative</a></td><td class="desc"></td></tr>
+<tr id="row_90_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_none.html" target="_self">None</a></td><td class="desc"></td></tr>
+<tr id="row_91_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_not_equal.html" target="_self">NotEqual</a></td><td class="desc"></td></tr>
+<tr id="row_92_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_or.html" target="_self">Or</a></td><td class="desc"></td></tr>
+<tr id="row_93_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_power.html" target="_self">Power</a></td><td class="desc"></td></tr>
+<tr id="row_94_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_prod.html" target="_self">Prod</a></td><td class="desc"></td></tr>
+<tr id="row_95_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_quantized_block_loader.html" target="_self">QuantizedBlockLoader</a></td><td class="desc"></td></tr>
+<tr id="row_96_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_read_writer.html" target="_self">ReadWriter</a></td><td class="desc"></td></tr>
+<tr id="row_97_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_real.html" target="_self">Real</a></td><td class="desc"></td></tr>
+<tr id="row_98_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_remainder.html" target="_self">Remainder</a></td><td class="desc"></td></tr>
+<tr id="row_99_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_right_shift.html" target="_self">RightShift</a></td><td class="desc"></td></tr>
+<tr id="row_100_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_round.html" target="_self">Round</a></td><td class="desc"></td></tr>
+<tr id="row_101_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_rsqrt.html" target="_self">Rsqrt</a></td><td class="desc"></td></tr>
+<tr id="row_102_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_scale_op.html" target="_self">ScaleOp</a></td><td class="desc"></td></tr>
+<tr id="row_103_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_select.html" target="_self">Select</a></td><td class="desc"></td></tr>
+<tr id="row_104_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sigmoid.html" target="_self">Sigmoid</a></td><td class="desc"></td></tr>
+<tr id="row_105_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sign.html" target="_self">Sign</a></td><td class="desc"></td></tr>
+<tr id="row_106_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sin.html" target="_self">Sin</a></td><td class="desc"></td></tr>
+<tr id="row_107_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sinh.html" target="_self">Sinh</a></td><td class="desc"></td></tr>
+<tr id="row_108_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sqrt.html" target="_self">Sqrt</a></td><td class="desc"></td></tr>
+<tr id="row_109_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_square.html" target="_self">Square</a></td><td class="desc"></td></tr>
+<tr id="row_110_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sub_op.html" target="_self">SubOp</a></td><td class="desc"></td></tr>
+<tr id="row_111_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_subtract.html" target="_self">Subtract</a></td><td class="desc"></td></tr>
+<tr id="row_112_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sum.html" target="_self">Sum</a></td><td class="desc"></td></tr>
+<tr id="row_113_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sum_op.html" target="_self">SumOp</a></td><td class="desc"></td></tr>
+<tr id="row_114_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tan.html" target="_self">Tan</a></td><td class="desc"></td></tr>
+<tr id="row_115_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tanh.html" target="_self">Tanh</a></td><td class="desc"></td></tr>
+<tr id="row_116_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="class_thread_pool.html" target="_self">ThreadPool</a></td><td class="desc"></td></tr>
+<tr id="row_117_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_thread_sort.html" target="_self">ThreadSort</a></td><td class="desc"></td></tr>
+<tr id="row_118_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_transform_scale.html" target="_self">TransformScale</a></td><td class="desc"></td></tr>
 </table>
 </div><!-- directory -->
 </div><!-- contents -->
diff --git a/docs/build/html/atomic_8h_source.html b/docs/build/html/atomic_8h_source.html
index 796c0a05e..a5c220e83 100644
--- a/docs/build/html/atomic_8h_source.html
+++ b/docs/build/html/atomic_8h_source.html
@@ -472,9 +472,9 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aatomic_8h_html_adfdbea60436f14f1af9ce36e2a0a77a3"><div class="ttname"><a href="atomic_8h.html#adfdbea60436f14f1af9ce36e2a0a77a3">mlx_atomic_fetch_mul_explicit</a></div><div class="ttdeci">METAL_FUNC void mlx_atomic_fetch_mul_explicit(device mlx_atomic&lt; T &gt; *object, T val, size_t offset)</div><div class="ttdef"><b>Definition</b> atomic.h:91</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
 <div class="ttc" id="agroup__ops_html_ga484eaa10d5e19a4ca46d3a9cd9fab600"><div class="ttname"><a href="group__ops.html#ga484eaa10d5e19a4ca46d3a9cd9fab600">mlx::core::identity</a></div><div class="ttdeci">array identity(int n, Dtype dtype, StreamOrDevice s={})</div><div class="ttdoc">Create a square matrix of shape (n,n) of zeros, and ones in the major diagonal.</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1distributed_html_a33633c058c7ec82cca4f237243c6810d"><div class="ttname"><a href="namespacemlx_1_1core_1_1distributed.html#a33633c058c7ec82cca4f237243c6810d">mlx::core::distributed::init</a></div><div class="ttdeci">Group init(bool strict=false)</div><div class="ttdoc">Initialize the distributed backend and return the group containing all discoverable processes.</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1random_html_abb895baa477f5a06b5f88e69245f1825"><div class="ttname"><a href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">mlx::core::random::bits</a></div><div class="ttdeci">array bits(const std::vector&lt; int &gt; &amp;shape, int width, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})</div><div class="ttdoc">Generate an array with type uint32 filled with random bits.</div></div>
 <div class="ttc" id="astructmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4_html_a8dbf729fcd8c4a16e41b546c7405543d"><div class="ttname"><a href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html#a8dbf729fcd8c4a16e41b546c7405543d">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;::val</a></div><div class="ttdeci">atomic&lt; T &gt; val</div><div class="ttdef"><b>Definition</b> atomic.h:31</div></div>
diff --git a/docs/build/html/attn_2loader_8h.html b/docs/build/html/attn_2loader_8h.html
new file mode 100644
index 000000000..882286aea
--- /dev/null
+++ b/docs/build/html/attn_2loader_8h.html
@@ -0,0 +1,126 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/loader.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#nested-classes">Classes</a> &#124;
+<a href="#namespaces">Namespaces</a>  </div>
+  <div class="headertitle"><div class="title">loader.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<div class="textblock"><code>#include &quot;<a class="el" href="steel_2defines_8h_source.html">mlx/backend/metal/kernels/steel/defines.h</a>&quot;</code><br />
+</div>
+<p><a href="attn_2loader_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
+Classes</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html">mlx::steel::CShape&lt; R, C &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="namespaces" name="namespaces"></a>
+Namespaces</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx.html">mlx</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/attn_2loader_8h_source.html b/docs/build/html/attn_2loader_8h_source.html
new file mode 100644
index 000000000..311bc1556
--- /dev/null
+++ b/docs/build/html/attn_2loader_8h_source.html
@@ -0,0 +1,426 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/loader.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">loader.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="attn_2loader_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
+<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &quot;<a class="code" href="steel_2defines_8h.html">mlx/backend/metal/kernels/steel/defines.h</a>&quot;</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="comment">// Loading helper</span></div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span> </div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx.html">mlx</a> {</div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><span class="keyword">namespace </span>steel {</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span> </div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>    <span class="keywordtype">short</span> BROWS,</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    <span class="keywordtype">short</span> BCOLS,</div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>    <span class="keywordtype">short</span> dst_ld,</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>    <span class="keywordtype">short</span> reduction_dim,</div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>    <span class="keywordtype">short</span> tgp_size,</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>    <span class="keywordtype">short</span> alignment = 1,</div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>    <span class="keywordtype">short</span> n_reads = (BCOLS * BROWS) / (tgp_size),</div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>    <span class="keywordtype">short</span> TCOLS = BCOLS / n_reads,</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>    <span class="keywordtype">short</span> TROWS = tgp_size / TCOLS&gt;</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span><span class="keyword">struct </span>BlockLoader {</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">   26</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">n_rows</a> = (BROWS + TROWS - 1) / TROWS;</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">   27</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a> = n_reads;</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  <span class="comment">// Leading dimension for src</span></div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">   30</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>;</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">   31</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a>;</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span> </div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  <span class="comment">// Thread location indices</span></div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">   34</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a>;</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">   35</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a>;</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">   36</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>;</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span> </div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <span class="comment">// threadgroup and device memory</span></div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">   39</a></span>  threadgroup T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">dst</a>;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">   40</a></span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>;</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span> </div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>  <span class="keyword">struct </span><span class="keyword">alignas</span>(alignment * sizeof(T)) <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">ReadVector</a> {</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">   43</a></span>    uint8_t <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">v</a>[<span class="keyword">sizeof</span>(T) * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>];</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  };</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  <span class="comment">/* Constructor */</span></div>
+<div class="foldopen" id="foldopen00047" data-start="{" data-end="}">
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">   47</a></span>  METAL_FUNC <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">BlockLoader</a>(</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>      <span class="keyword">const</span> device T* src_,</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> src_ld_,</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>      threadgroup T* dst_,</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>      ushort simd_group_id [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>      ushort simd_lane_id [[thread_index_in_simdgroup]])</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>      : <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>(src_ld_),</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a>(reduction_dim ? BCOLS : BROWS * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>),</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a>(simd_group_id * 32 + simd_lane_id),</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a> / TCOLS),</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a> * (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a> % TCOLS)),</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>        dst(dst_ + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a> * dst_ld + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>),</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>(src_ + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a> + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>) {}</div>
+</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span> </div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>  <span class="comment">/* Apply operation to threadgroup without bound checking */</span></div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> UnaryOp&gt;</div>
+<div class="foldopen" id="foldopen00063" data-start="{" data-end="}">
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">   63</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">apply_inplace_op</a>(thread <span class="keyword">const</span> UnaryOp&amp; <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>        dst[i * dst_ld + j] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.apply(dst[i * dst_ld + j]);</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>      }</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>    }</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  }</div>
+</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span> </div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  <span class="comment">/* Load from device memory into threadgroup memory - without bound checking */</span></div>
+<div class="foldopen" id="foldopen00074" data-start="{" data-end="}">
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">   74</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">load_unsafe</a>()<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>      *((threadgroup <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">ReadVector</a>*)(&amp;dst[i * dst_ld])) =</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>          *((<span class="keyword">const</span> device <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">ReadVector</a>*)(&amp;<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>]));</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    }</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>  }</div>
+</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span> </div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>  <span class="comment">/* Load from device memory into threadgroup memory - with bound checking */</span></div>
+<div class="foldopen" id="foldopen00083" data-start="{" data-end="}">
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">   83</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">load_safe</a>(short2 src_tile_dim)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    src_tile_dim = src_tile_dim - short2(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a>);</div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span> </div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>    <span class="comment">// Skip loading if thread has no valid reads</span></div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>    <span class="keywordflow">if</span> (src_tile_dim.x &lt;= 0 || src_tile_dim.y &lt;= 0) {</div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>          dst[i * dst_ld + j] = T(0);</div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>        }</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>      }</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>    }</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span> </div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>    <span class="comment">// Use fast thread memory for bound checks</span></div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>    <span class="keywordtype">bool</span> tmp_idx[<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>];</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    T tmp_val[<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>];</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span> </div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>      <span class="comment">// Make sure tmp_idx only contains valid indices</span></div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>        tmp_idx[j] = (i &lt; src_tile_dim.y) &amp;&amp; (j &lt; src_tile_dim.x);</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>      }</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span> </div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>      <span class="comment">// Read valid indices into tmp_val</span></div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>        tmp_val[j] = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>[(tmp_idx[j] ? i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a> + j : 0)];</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>      }</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span> </div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>      <span class="comment">// Zero out unneeded values</span></div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>        tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>      }</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span> </div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>      <span class="comment">// Copy values to threadgroup memory</span></div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>        dst[i * dst_ld + j] = tmp_val[j];</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>      }</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    }</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>  }</div>
+</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span> </div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  <span class="comment">/* Iteration helper */</span></div>
+<div class="foldopen" id="foldopen00131" data-start="{" data-end="}">
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">  131</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">next</a>() {</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a> += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a>;</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  }</div>
+</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>};</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span> </div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span><span class="keyword">template</span> &lt;<span class="keywordtype">int</span> R, <span class="keywordtype">int</span> C&gt;</div>
+<div class="foldopen" id="foldopen00137" data-start="{" data-end="};">
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_c_shape.html">  137</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_c_shape.html">CShape</a> {</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993">  138</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993">kRows</a> = R;</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901">  139</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901">kCols</a> = C;</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>};</div>
+</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span> </div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>    <span class="keywordtype">short</span> BROWS,</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>    <span class="keywordtype">short</span> BCOLS,</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>    <span class="keywordtype">short</span> kDstStrRow,</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>    <span class="keywordtype">short</span> kDstStrCol,</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>    <span class="keywordtype">short</span> reduction_dim,</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>    <span class="keywordtype">short</span> tgp_size,</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    <span class="keywordtype">short</span> n_reads = (BCOLS * BROWS) / (tgp_size),</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>    <span class="keywordtype">short</span> TCOLS = BCOLS / n_reads,</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    <span class="keywordtype">short</span> TROWS = tgp_size / TCOLS&gt;</div>
+<div class="foldopen" id="foldopen00153" data-start="{" data-end="};">
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html">  153</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a> {</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc">  154</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc">n_rows</a> = (BROWS + TROWS - 1) / TROWS;</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">  155</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a> = n_reads;</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span> </div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>  <span class="comment">// Leading dimension for src</span></div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">  158</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">src_ld</a>;</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">  159</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">tile_stride</a>;</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span> </div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>  <span class="comment">// Thread location indices</span></div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">  162</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">thread_idx</a>;</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">  163</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">bi</a>;</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">  164</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">bj</a>;</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span> </div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>  <span class="comment">// threadgroup and device memory</span></div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db">  167</a></span>  threadgroup T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db">dst</a>;</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">  168</a></span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">src</a>;</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span> </div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>  <span class="comment">/* Constructor */</span></div>
+<div class="foldopen" id="foldopen00171" data-start="{" data-end="}">
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2">  171</a></span>  METAL_FUNC <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2">BlockLoaderT</a>(</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>      <span class="keyword">const</span> device T* src_,</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> src_ld_,</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>      threadgroup T* dst_,</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>      ushort simd_group_id [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>      ushort simd_lane_id [[thread_index_in_simdgroup]])</div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>      : <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">src_ld</a>(src_ld_),</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">tile_stride</a>(reduction_dim ? BCOLS : BROWS * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">src_ld</a>),</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">thread_idx</a>(simd_group_id * 32 + simd_lane_id),</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">bi</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">thread_idx</a> / TCOLS),</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">bj</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a> * (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">thread_idx</a> % TCOLS)),</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>        dst(dst_ + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">bi</a> * kDstStrRow + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">bj</a> * kDstStrCol),</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">src</a>(src_ + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">bi</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">src_ld</a> + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">bj</a>) {}</div>
+</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span> </div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>  <span class="comment">/* Apply operation to threadgroup without bound checking */</span></div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> UnaryOp&gt;</div>
+<div class="foldopen" id="foldopen00187" data-start="{" data-end="}">
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97">  187</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97">apply_inplace_op</a>(thread <span class="keyword">const</span> UnaryOp&amp; <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>        dst[i * kDstStrRow + j * kDstStrCol] =</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>            <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.apply(dst[i * kDstStrRow + j * kDstStrCol]);</div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>      }</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    }</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>  }</div>
+</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span> </div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>  <span class="comment">/* Load from device memory into threadgroup memory - without bound checking */</span></div>
+<div class="foldopen" id="foldopen00199" data-start="{" data-end="}">
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38">  199</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38">load_unsafe</a>()<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>        dst[i * kDstStrRow + j * kDstStrCol] = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">src</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">src_ld</a> + j];</div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      }</div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>    }</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>  }</div>
+</div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span> </div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>  <span class="comment">/* Load from device memory into threadgroup memory - with bound checking */</span></div>
+<div class="foldopen" id="foldopen00210" data-start="{" data-end="}">
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d">  210</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d">load_safe</a>(short2 src_tile_dim)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>    src_tile_dim = src_tile_dim - short2(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">bj</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">bi</a>);</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span> </div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>    <span class="comment">// Skip loading if thread has no valid reads</span></div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>    <span class="keywordflow">if</span> (src_tile_dim.x &lt;= 0 || src_tile_dim.y &lt;= 0) {</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>          dst[i * kDstStrRow + j * kDstStrCol] = T(0);</div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>        }</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>      }</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>    }</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span> </div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>    <span class="comment">// Use fast thread memory for bound checks</span></div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    <span class="keywordtype">bool</span> tmp_idx[<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>];</div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>    T tmp_val[<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>];</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span> </div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>      <span class="comment">// Make sure tmp_idx only contains valid indices</span></div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>        tmp_idx[j] = (i &lt; src_tile_dim.y) &amp;&amp; (j &lt; src_tile_dim.x);</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>      }</div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span> </div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>      <span class="comment">// Read valid indices into tmp_val</span></div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>        tmp_val[j] = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">src</a>[(tmp_idx[j] ? i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">src_ld</a> + j : 0)];</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>      }</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span> </div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>      <span class="comment">// Zero out unneeded values</span></div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>        tmp_val[j] = tmp_idx[j] ? tmp_val[j] : T(0);</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>      }</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span> </div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>      <span class="comment">// Copy values to threadgroup memory</span></div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a>; j++) {</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>        dst[i * kDstStrRow + j * kDstStrCol] = tmp_val[j];</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>      }</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>    }</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>  }</div>
+</div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span> </div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>  <span class="comment">/* Iteration helper */</span></div>
+<div class="foldopen" id="foldopen00258" data-start="{" data-end="}">
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697">  258</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697">next</a>() {</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">src</a> += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">tile_stride</a>;</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>  }</div>
+</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>};</div>
+</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span> </div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>} <span class="comment">// namespace steel</span></div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>} <span class="comment">// namespace mlx</span></div>
+<div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
+<div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
+<div class="ttc" id="asteel_2defines_8h_html"><div class="ttname"><a href="steel_2defines_8h.html">defines.h</a></div></div>
+<div class="ttc" id="asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6"><div class="ttname"><a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define STEEL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> defines.h:4</div></div>
+<div class="ttc" id="asteel_2defines_8h_html_a90b91c866313ffa46eff6d9cc944ad2b"><div class="ttname"><a href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a></div><div class="ttdeci">#define STEEL_CONST</div><div class="ttdef"><b>Definition</b> defines.h:3</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_1_1_read_vector_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">mlx::steel::BlockLoader::ReadVector</a></div><div class="ttdef"><b>Definition</b> loader.h:42</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_1_1_read_vector_html_a20963f7191251defca48bf8a843d019d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">mlx::steel::BlockLoader::ReadVector::v</a></div><div class="ttdeci">uint8_t v[sizeof(T) *vec_size]</div><div class="ttdef"><b>Definition</b> loader.h:43</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a064e2cc77e0b1cf0f8027929e031775b"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">mlx::steel::BlockLoader::thread_idx</a></div><div class="ttdeci">const short thread_idx</div><div class="ttdef"><b>Definition</b> loader.h:34</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a37aca066e63dff238865b5923a2d4335"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">mlx::steel::BlockLoader::BlockLoader</a></div><div class="ttdeci">METAL_FUNC BlockLoader(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> loader.h:47</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a58bdf9b9c81962733e22ecdeae28c092"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">mlx::steel::BlockLoader::vec_size</a></div><div class="ttdeci">STEEL_CONST short vec_size</div><div class="ttdef"><b>Definition</b> loader.h:27</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a6af21428f0e7c17b48ddedf4dd20a1e8"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">mlx::steel::BlockLoader::next</a></div><div class="ttdeci">METAL_FUNC void next()</div><div class="ttdef"><b>Definition</b> loader.h:131</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a6c9e27f11f48b34580ed2c7e9cad9a27"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">mlx::steel::BlockLoader::load_unsafe</a></div><div class="ttdeci">METAL_FUNC void load_unsafe() const</div><div class="ttdef"><b>Definition</b> loader.h:74</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a78c326e75ee35a484685771143047cd4"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">mlx::steel::BlockLoader::bj</a></div><div class="ttdeci">const short bj</div><div class="ttdef"><b>Definition</b> loader.h:36</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a973804e5b1d418c98c90861cda1a6fb5"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">mlx::steel::BlockLoader::n_rows</a></div><div class="ttdeci">STEEL_CONST short n_rows</div><div class="ttdef"><b>Definition</b> loader.h:26</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a9ef13742bcdf07532d8f09394928a8af"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">mlx::steel::BlockLoader::bi</a></div><div class="ttdeci">const short bi</div><div class="ttdef"><b>Definition</b> loader.h:35</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_aadafc50f7f06af434149d7469df4714d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">mlx::steel::BlockLoader::src_ld</a></div><div class="ttdeci">const int src_ld</div><div class="ttdef"><b>Definition</b> loader.h:30</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_ab87876699d55473620c7ea99f9da911d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">mlx::steel::BlockLoader::tile_stride</a></div><div class="ttdeci">const int tile_stride</div><div class="ttdef"><b>Definition</b> loader.h:31</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_abb0f4f66ec8b123627beb8eb4fbb609d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">mlx::steel::BlockLoader::load_safe</a></div><div class="ttdeci">METAL_FUNC void load_safe(short2 src_tile_dim) const</div><div class="ttdef"><b>Definition</b> loader.h:83</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_ad1db14517568ae9eddfb6986ef31c7aa"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">mlx::steel::BlockLoader::src</a></div><div class="ttdeci">const device T * src</div><div class="ttdef"><b>Definition</b> loader.h:40</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_adb4ca2cc193630a779de552fa8847ddf"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">mlx::steel::BlockLoader::apply_inplace_op</a></div><div class="ttdeci">METAL_FUNC void apply_inplace_op(thread const UnaryOp &amp;op) const</div><div class="ttdef"><b>Definition</b> loader.h:63</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_af1c6c35a42e9da4408c1013ff1741bc2"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">mlx::steel::BlockLoader::dst</a></div><div class="ttdeci">threadgroup T * dst</div><div class="ttdef"><b>Definition</b> loader.h:39</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a></div><div class="ttdef"><b>Definition</b> loader.h:153</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a076616a7c67ad1b847e0e6b046077ee2"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2">mlx::steel::BlockLoaderT::BlockLoaderT</a></div><div class="ttdeci">METAL_FUNC BlockLoaderT(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> loader.h:171</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a0ccc7caa93e6e709981a1a08159d41dc"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc">mlx::steel::BlockLoaderT::n_rows</a></div><div class="ttdeci">STEEL_CONST short n_rows</div><div class="ttdef"><b>Definition</b> loader.h:154</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a2b136fad00dc54300e68aa6b905eff97"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97">mlx::steel::BlockLoaderT::apply_inplace_op</a></div><div class="ttdeci">METAL_FUNC void apply_inplace_op(thread const UnaryOp &amp;op) const</div><div class="ttdef"><b>Definition</b> loader.h:187</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a3abb86e68adb7e4d87cb808d6c25e35f"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">mlx::steel::BlockLoaderT::tile_stride</a></div><div class="ttdeci">const int tile_stride</div><div class="ttdef"><b>Definition</b> loader.h:159</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a6008ef45ff980dbe1119da0630f6c697"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697">mlx::steel::BlockLoaderT::next</a></div><div class="ttdeci">METAL_FUNC void next()</div><div class="ttdef"><b>Definition</b> loader.h:258</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a6964273994b06d6cf8ef7e59fb10bb35"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">mlx::steel::BlockLoaderT::bi</a></div><div class="ttdeci">const short bi</div><div class="ttdef"><b>Definition</b> loader.h:163</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a6eb4e566b687395e27f290da288362db"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db">mlx::steel::BlockLoaderT::dst</a></div><div class="ttdeci">threadgroup T * dst</div><div class="ttdef"><b>Definition</b> loader.h:167</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a7004a4efaa483cc79b8b79810a17c777"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">mlx::steel::BlockLoaderT::src</a></div><div class="ttdeci">const device T * src</div><div class="ttdef"><b>Definition</b> loader.h:168</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_a9ac651d9e5097507c57b10dfeb40bfe5"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">mlx::steel::BlockLoaderT::vec_size</a></div><div class="ttdeci">STEEL_CONST short vec_size</div><div class="ttdef"><b>Definition</b> loader.h:155</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_ac2d95e35ba39e0984e6f1e58ca935f7d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d">mlx::steel::BlockLoaderT::load_safe</a></div><div class="ttdeci">METAL_FUNC void load_safe(short2 src_tile_dim) const</div><div class="ttdef"><b>Definition</b> loader.h:210</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_aca83e49c31095badc8a46eb3c8e00957"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">mlx::steel::BlockLoaderT::bj</a></div><div class="ttdeci">const short bj</div><div class="ttdef"><b>Definition</b> loader.h:164</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_acb743f32146fdc7986264b7beb35fb38"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38">mlx::steel::BlockLoaderT::load_unsafe</a></div><div class="ttdeci">METAL_FUNC void load_unsafe() const</div><div class="ttdef"><b>Definition</b> loader.h:199</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_aeba87e81185da6b20a092c5d240d3321"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">mlx::steel::BlockLoaderT::src_ld</a></div><div class="ttdeci">const int src_ld</div><div class="ttdef"><b>Definition</b> loader.h:158</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html_af2838998a02866f22b525f9b6ae004da"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">mlx::steel::BlockLoaderT::thread_idx</a></div><div class="ttdeci">const short thread_idx</div><div class="ttdef"><b>Definition</b> loader.h:162</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_c_shape_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_c_shape.html">mlx::steel::CShape</a></div><div class="ttdef"><b>Definition</b> loader.h:137</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_c_shape_html_a01b09227356b6a682a0694523a8e6901"><div class="ttname"><a href="structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901">mlx::steel::CShape::kCols</a></div><div class="ttdeci">STEEL_CONST int kCols</div><div class="ttdef"><b>Definition</b> loader.h:139</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_c_shape_html_a5caf36cb9acf9f90ba59a9b0b4197993"><div class="ttname"><a href="structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993">mlx::steel::CShape::kRows</a></div><div class="ttdeci">STEEL_CONST int kRows</div><div class="ttdef"><b>Definition</b> loader.h:138</div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/attn_2mma_8h.html b/docs/build/html/attn_2mma_8h.html
new file mode 100644
index 000000000..9c7def403
--- /dev/null
+++ b/docs/build/html/attn_2mma_8h.html
@@ -0,0 +1,142 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/mma.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#nested-classes">Classes</a> &#124;
+<a href="#namespaces">Namespaces</a> &#124;
+<a href="#func-members">Functions</a>  </div>
+  <div class="headertitle"><div class="title">mma.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<div class="textblock"><code>#include &lt;metal_simdgroup&gt;</code><br />
+<code>#include &lt;metal_simdgroup_matrix&gt;</code><br />
+<code>#include &lt;metal_stdlib&gt;</code><br />
+<code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">mlx/backend/metal/kernels/steel/attn/transforms.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="steel_2defines_8h_source.html">mlx/backend/metal/kernels/steel/defines.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="integral__constant_8h_source.html">mlx/backend/metal/kernels/steel/utils/integral_constant.h</a>&quot;</code><br />
+</div>
+<p><a href="attn_2mma_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
+Classes</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag&lt; T, kFragRows_, kFragCols_ &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="namespaces" name="namespaces"></a>
+Namespaces</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx.html">mlx</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
+Functions</h2></td></tr>
+<tr class="memitem:ad583e6038efc119542410f43b603d4ad" id="r_ad583e6038efc119542410f43b603d4ad"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , int M, int N, int K&gt; </td></tr>
+<tr class="memitem:ad583e6038efc119542410f43b603d4ad"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">mlx::steel::tile_matmad</a> (thread <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; T, M, N &gt; &amp;D, thread <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; U, M, K &gt; &amp;A, thread <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; U, K, N &gt; &amp;B, thread <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; T, M, N &gt; &amp;C)</td></tr>
+<tr class="separator:ad583e6038efc119542410f43b603d4ad"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/attn_2mma_8h_source.html b/docs/build/html/attn_2mma_8h_source.html
new file mode 100644
index 000000000..156ab79e8
--- /dev/null
+++ b/docs/build/html/attn_2mma_8h_source.html
@@ -0,0 +1,986 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/mma.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">mma.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="attn_2mma_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
+<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &lt;metal_simdgroup&gt;</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="preprocessor">#include &lt;metal_simdgroup_matrix&gt;</span></div>
+<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="preprocessor">#include &lt;metal_stdlib&gt;</span></div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span> </div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html">mlx/backend/metal/kernels/steel/attn/transforms.h</a>&quot;</span></div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="preprocessor">#include &quot;<a class="code" href="steel_2defines_8h.html">mlx/backend/metal/kernels/steel/defines.h</a>&quot;</span></div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span><span class="preprocessor">#include &quot;<a class="code" href="integral__constant_8h.html">mlx/backend/metal/kernels/steel/utils/integral_constant.h</a>&quot;</span></div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span> </div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="keyword">using namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a>;</div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span> </div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="comment">// MMA helper</span></div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span> </div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx.html">mlx</a> {</div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span><span class="keyword">namespace </span>steel {</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span> </div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> RInt, <span class="keyword">typename</span> CInt&gt;</div>
+<div class="foldopen" id="foldopen00023" data-start="{" data-end="};">
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_shape2_d.html">   23</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_shape2_d.html">Shape2D</a> {</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe">   24</a></span>  RInt <a class="code hl_variable" href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe">r</a>;</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e">   25</a></span>  CInt <a class="code hl_variable" href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e">c</a>;</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span> </div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c">   27</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c">Shape2D</a>(RInt r_, CInt c_) : <a class="code hl_variable" href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe">r</a>(r_), <a class="code hl_variable" href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e">c</a>(c_) {}</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>};</div>
+</div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span> </div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Shape, <span class="keyword">typename</span> Layout&gt;</div>
+<div class="foldopen" id="foldopen00031" data-start="{" data-end="};">
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_layout2_d.html">   31</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_layout2_d.html">Layout2D</a> {</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd">   32</a></span>  Shape <a class="code hl_variable" href="structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd">shape</a>;</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1">   33</a></span>  Layout <a class="code hl_variable" href="structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1">layout</a>;</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>};</div>
+</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span> </div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> kFragRows_, <span class="keywordtype">int</span> kFragCols_&gt;</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a> {</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>      kFragRows_ == 8,</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>      <span class="stringliteral">&quot;Only 8 x 8 fragment matrices are currently supported&quot;</span>);</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>      kFragCols_ == 8,</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>      <span class="stringliteral">&quot;Only 8 x 8 fragment matrices are currently supported&quot;</span>);</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>};</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span><span class="keyword">struct </span>BaseMMAFrag&lt;T, 8, 8&gt; {</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4">   48</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kFragRows = 8;</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f">   49</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kFragCols = 8;</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span> </div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a3c34dfdc944db110f4735f1b25307cf0">   51</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemsPerFrag = (kFragRows * kFragCols) / 32;</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span> </div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f">   53</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemRows = 1;</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd">   54</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemCols = 2;</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span> </div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>      kElemRows * kElemCols == kElemsPerFrag,</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>      <span class="stringliteral">&quot;MMAFrag shape is not consistent with MMAFrag size&quot;</span>);</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span> </div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">   60</a></span>  <span class="keyword">typedef</span> metal::simdgroup_matrix&lt;T, kFragRows, kFragCols&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>;</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">   61</a></span>  <span class="keyword">typedef</span> metal::vec&lt;T, kElemsPerFrag&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>;</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b">   62</a></span>  <span class="keyword">typedef</span> metal::vec&lt;T, kElemRows&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b">row_frag_type</a>;</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f">   63</a></span>  <span class="keyword">typedef</span> metal::vec&lt;T, kElemCols&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f">col_frag_type</a>;</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span> </div>
+<div class="foldopen" id="foldopen00065" data-start="{" data-end="}">
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">   65</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> short2 <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">get_coord</a>(ushort simd_lane_id</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>                                               [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    <span class="keyword">const</span> <span class="keywordtype">short</span> qid = simd_lane_id / 4;</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    <span class="keyword">const</span> <span class="keywordtype">short</span> fm = (qid &amp; 4) + ((simd_lane_id / 2) % 4);</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>    <span class="keyword">const</span> <span class="keywordtype">short</span> fn = (qid &amp; 2) * 2 + (simd_lane_id % 2) * 2;</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>    <span class="keywordflow">return</span> short2{fn, fm};</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  }</div>
+</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span> </div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> SrcPtrType, <span class="keyword">typename</span> StrX, <span class="keyword">typename</span> StrY&gt;</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span></div>
+<div class="foldopen" id="foldopen00075" data-start="{" data-end="}">
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">   75</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">load</a>(thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; dst, SrcPtrType src, StrX str_x, StrY str_y) {</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kElemRows; i++) {</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; kElemCols; j++) {</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>        dst[i * kElemCols + j] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(src[i * str_x + j * str_y]);</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>      }</div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    }</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>  }</div>
+</div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span> </div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>  <span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>      <span class="keyword">typename</span> SrcPtrType,</div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>      <span class="keyword">typename</span> StrX,</div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>      <span class="keyword">typename</span> StrY,</div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>      <span class="keyword">typename</span> LimX,</div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>      <span class="keyword">typename</span> LimY,</div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>      <span class="keyword">typename</span> OffX,</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>      <span class="keyword">typename</span> OffY&gt;</div>
+<div class="foldopen" id="foldopen00093" data-start="{" data-end="}">
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">   93</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">load_safe</a>(</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; dst,</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>      SrcPtrType src,</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>      StrX str_x,</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>      StrY str_y,</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>      LimX lim_x,</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>      LimY lim_y,</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>      OffX off_x = <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;0&gt;</a>{},</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>      OffY off_y = <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;0&gt;</a>{}) {</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kElemRows; i++) {</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; kElemCols; j++) {</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>        <span class="keywordflow">if</span> ((off_x + i) &lt; lim_x &amp;&amp; (off_y + j) &lt; lim_y) {</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>          dst[i * kElemCols + j] =</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>              <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(src[(off_x + i) * str_x + (off_x + j) * str_y]);</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>        } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>          dst[i * kElemCols + j] = T(0);</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>        }</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>      }</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>    }</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>  }</div>
+</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span> </div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> DstPtrType, <span class="keyword">typename</span> StrX, <span class="keyword">typename</span> StrY&gt;</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span></div>
+<div class="foldopen" id="foldopen00118" data-start="{" data-end="}">
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">  118</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a>(<span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; src, DstPtrType dst, StrX str_x, StrY str_y) {</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    <span class="keyword">using </span>U = <a class="code hl_typedef" href="namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9">pointer_element_t&lt;DstPtrType&gt;</a>;</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span> </div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kElemRows; i++) {</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; kElemCols; j++) {</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>        dst[i * str_x + j * str_y] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[i * kElemCols + j]);</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>      }</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    }</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>  }</div>
+</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span> </div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  <span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>      <span class="keyword">typename</span> DstPtrType,</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>      <span class="keyword">typename</span> StrX,</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>      <span class="keyword">typename</span> StrY,</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>      <span class="keyword">typename</span> LimX,</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>      <span class="keyword">typename</span> LimY,</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>      <span class="keyword">typename</span> OffX,</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>      <span class="keyword">typename</span> OffY&gt;</div>
+<div class="foldopen" id="foldopen00138" data-start="{" data-end="}">
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">  138</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">store_safe</a>(</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>      <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; src,</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>      DstPtrType dst,</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>      StrX str_x,</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>      StrY str_y,</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>      LimX lim_x,</div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>      LimY lim_y,</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>      OffX off_x = <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;0&gt;</a>{},</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>      OffY off_y = <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;0&gt;</a>{}) {</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>    <span class="keyword">using </span>U = <a class="code hl_typedef" href="namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9">pointer_element_t&lt;DstPtrType&gt;</a>;</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span> </div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kElemRows; i++) {</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; kElemCols; j++) {</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>        <span class="keywordflow">if</span> ((off_x + i) &lt; lim_x &amp;&amp; (off_y + j) &lt; lim_y) {</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>          dst[(off_x + i) * str_x + (off_y + j) * str_y] =</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>              <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[i * kElemCols + j]);</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>        }</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>      }</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    }</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>  }</div>
+</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span> </div>
+<div class="foldopen" id="foldopen00161" data-start="{" data-end="}">
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">  161</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">mma</a>(</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; D,</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; A,</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; B,</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; C) {</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> D_mat;</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> A_mat;</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> B_mat;</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> C_mat;</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span> </div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(A_mat.thread_elements()) = A;</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(B_mat.thread_elements()) = B;</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(C_mat.thread_elements()) = C;</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span> </div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>    mma(D_mat, A_mat, B_mat, C_mat);</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span> </div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    D = <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(D_mat.thread_elements());</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>  }</div>
+</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span> </div>
+<div class="foldopen" id="foldopen00180" data-start="{" data-end="}">
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">  180</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">mma</a>(</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; D,</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; A,</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; B,</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; C) {</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>    simdgroup_multiply_accumulate(D, A, B, C);</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>  }</div>
+</div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span> </div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> Op&gt;</div>
+<div class="foldopen" id="foldopen00189" data-start="{" data-end="}">
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970">  189</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970">row_reduce</a>(</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>      thread <span class="keyword">const</span> <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; inp_vals,</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>      thread T* reduced_vals) {</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>    T thr_reduce = Op::apply(inp_vals.x, inp_vals.y);</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span> </div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>    T qgr_reduce = <a class="code hl_function" href="namespacemetal.html#a5017efc9605e069cfb507137cd1a1852">simd_shuffle_xor</a>(thr_reduce, ushort(1));</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    qgr_reduce = Op::apply(thr_reduce, qgr_reduce);</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span> </div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>    T sgr_reduce = <a class="code hl_function" href="namespacemetal.html#a5017efc9605e069cfb507137cd1a1852">simd_shuffle_xor</a>(qgr_reduce, ushort(8));</div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>    sgr_reduce = Op::apply(qgr_reduce, sgr_reduce);</div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span> </div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>    reduced_vals[0] = Op::apply(reduced_vals[0], sgr_reduce);</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>  }</div>
+</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span> </div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> Op&gt;</div>
+<div class="foldopen" id="foldopen00204" data-start="{" data-end="}">
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010">  204</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010">row_bin_op</a>(</div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; inp_vals,</div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>      thread T* row_vals) {</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kElemRows; i++) {</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; kElemCols; j++) {</div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>        inp_vals[i * kElemCols + j] =</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>            Op::apply(inp_vals[i * kElemCols + j], row_vals[i]);</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>      }</div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>    }</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>  }</div>
+</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>};</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span> </div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>    <span class="keywordtype">int</span> kTileRows_,</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>    <span class="keywordtype">int</span> kTileCols_,</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>    <span class="keyword">class </span>MMAFrag_ = BaseMMAFrag&lt;T, 8, 8&gt;&gt;</div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span><span class="keyword">struct </span>MMATile {</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">  224</a></span>  <span class="keyword">using </span><a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">MMAFrag_t</a> = MMAFrag_;</div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">  225</a></span>  <span class="keyword">using </span><a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a> = T;</div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">  226</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a> = MMAFrag_t::kFragRows;</div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">  227</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a> = MMAFrag_t::kFragCols;</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">  228</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a> = MMAFrag_t::kElemsPerFrag;</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span> </div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">  230</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> = kTileRows_;</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">  231</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> = kTileCols_;</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span> </div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">  233</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">kRows</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>;</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">  234</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">kCols</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>;</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span> </div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">  236</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>;</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">  237</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">kElemsPerTile</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a>;</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span> </div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">  239</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">kRowsPerThread</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * MMAFrag_t::kElemRows;</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c">  240</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c">kColsPerThread</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> * MMAFrag_t::kElemCols;</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span> </div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">  242</a></span>  <span class="keyword">typedef</span> <span class="keyword">typename</span> MMAFrag_t::mat_type <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a>;</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">  243</a></span>  <span class="keyword">typedef</span> <span class="keyword">typename</span> MMAFrag_t::frag_type <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>;</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span> </div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">  245</a></span>  <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>] = {<a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>(0)};</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span> </div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">  247</a></span>  METAL_FUNC <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">MMATile</a>() thread {}</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span> </div>
+<div class="foldopen" id="foldopen00249" data-start="{" data-end="}">
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">  249</a></span>  METAL_FUNC <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">clear</a>() {</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>; ++i) {</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[i] = <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>(0);</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>    }</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>  }</div>
+</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span> </div>
+<div class="foldopen" id="foldopen00256" data-start="{" data-end="}">
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">  256</a></span>  METAL_FUNC <span class="keyword">constexpr</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>&amp; <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(<span class="keyword">const</span> <span class="keywordtype">short</span> i, <span class="keyword">const</span> <span class="keywordtype">short</span> j) {</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> + j];</div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>  }</div>
+</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span> </div>
+<div class="foldopen" id="foldopen00260" data-start="{" data-end="}">
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">  260</a></span>  METAL_FUNC <span class="keyword">constexpr</span> <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>&amp; <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">frag_at</a>(</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> i,</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> j)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> + j];</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>  }</div>
+</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span> </div>
+<div class="foldopen" id="foldopen00266" data-start="{" data-end="}">
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">  266</a></span>  METAL_FUNC <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mat_at</a>(<span class="keyword">const</span> <span class="keywordtype">short</span> i, <span class="keyword">const</span> <span class="keywordtype">short</span> j) {</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a> val_mat;</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> ii = 0; ii &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a>; ++ii) {</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>      val_mat.thread_elements()[ii] = <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j)[ii];</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>    }</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>    <span class="keywordflow">return</span> val_mat;</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>  }</div>
+</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span> </div>
+<div class="foldopen" id="foldopen00275" data-start="{" data-end="}">
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">  275</a></span>  METAL_FUNC thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>* <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a>() {</div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>    <span class="keywordflow">return</span> <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>*<span class="keyword">&gt;</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>);</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>  }</div>
+</div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span> </div>
+<div class="foldopen" id="foldopen00279" data-start="{" data-end="}">
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">  279</a></span>  METAL_FUNC <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>* <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">elems</a>()<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>    <span class="keywordflow">return</span> <span class="keyword">reinterpret_cast&lt;</span><span class="keyword">const </span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>*<span class="keyword">&gt;</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>);</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>  }</div>
+</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span> </div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> Op&gt;</div>
+<div class="foldopen" id="foldopen00284" data-start="{" data-end="}">
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88">  284</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88">row_reduce</a>(thread T vals[<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">kRowsPerThread</a>])<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>        MMAFrag_t::template <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88">row_reduce&lt;Op&gt;</a>(</div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j), &amp;vals[i * MMAFrag_t::kElemRows]);</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>      }</div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>    }</div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>  }</div>
+</div>
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span> </div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> Op&gt;</div>
+<div class="foldopen" id="foldopen00296" data-start="{" data-end="}">
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2">  296</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2">row_bin_op</a>(thread T vals[<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">kRowsPerThread</a>]) {</div>
+<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>        MMAFrag_t::template <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2">row_bin_op&lt;Op&gt;</a>(</div>
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j), &amp;vals[i * MMAFrag_t::kElemRows]);</div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>      }</div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>    }</div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>  }</div>
+</div>
+<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span> </div>
+<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> w_x, <span class="keywordtype">int</span> w_y, <span class="keywordtype">int</span> str_x, <span class="keywordtype">int</span> str_y&gt;</div>
+<div class="foldopen" id="foldopen00308" data-start="{" data-end="}">
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">  308</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">load</a>(<span class="keyword">const</span> threadgroup U* src) {</div>
+<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>        MMAFrag_t::load(</div>
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j),</div>
+<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>            &amp;(</div>
+<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>                src[(i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>) * w_x * str_x +</div>
+<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>                    (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>) * w_y * str_y]),</div>
+<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;str_x&gt;</a>{},</div>
+<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;str_y&gt;</a>{});</div>
+<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>      }</div>
+<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>    }</div>
+<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>  }</div>
+</div>
+<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span> </div>
+<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> w_x, <span class="keywordtype">int</span> w_y, <span class="keywordtype">int</span> str_x, <span class="keywordtype">int</span> str_y&gt;</div>
+<div class="foldopen" id="foldopen00325" data-start="{" data-end="}">
+<div class="line"><a id="l00325" name="l00325"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98">  325</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98">store</a>(threadgroup U* dst)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>        MMAFrag_t::store(</div>
+<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j),</div>
+<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span>            &amp;(</div>
+<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>                dst[(i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>) * w_x * str_x +</div>
+<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>                    (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>) * w_y * str_y]),</div>
+<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;str_x&gt;</a>{},</div>
+<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;str_y&gt;</a>{});</div>
+<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>      }</div>
+<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span>    }</div>
+<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>  }</div>
+</div>
+<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span> </div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> w_x, <span class="keywordtype">int</span> w_y&gt;</div>
+<div class="foldopen" id="foldopen00342" data-start="{" data-end="}">
+<div class="line"><a id="l00342" name="l00342"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9">  342</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9">load</a>(<span class="keyword">const</span> device U* src, <span class="keyword">const</span> <span class="keywordtype">int</span> ld) {</div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span>        MMAFrag_t::load(</div>
+<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j),</div>
+<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>            &amp;(src[(i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>) * w_x * ld + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>) * w_y]),</div>
+<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>            ld,</div>
+<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;1&gt;</a>{});</div>
+<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>      }</div>
+<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>    }</div>
+<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>  }</div>
+</div>
+<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span> </div>
+<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> w_x, <span class="keywordtype">int</span> w_y&gt;</div>
+<div class="foldopen" id="foldopen00357" data-start="{" data-end="}">
+<div class="line"><a id="l00357" name="l00357"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f">  357</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f">store</a>(device U* dst, <span class="keyword">const</span> <span class="keywordtype">int</span> ld)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>        MMAFrag_t::store(</div>
+<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j),</div>
+<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>            &amp;(dst[(i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>) * w_x * ld + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>) * w_y]),</div>
+<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span>            ld,</div>
+<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;1&gt;</a>{});</div>
+<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span>      }</div>
+<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span>    }</div>
+<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span>  }</div>
+</div>
+<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span> </div>
+<div class="line"><a id="l00371" name="l00371"></a><span class="lineno">  371</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> w_x, <span class="keywordtype">int</span> w_y&gt;</div>
+<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span>  METAL_FUNC <span class="keywordtype">void</span></div>
+<div class="foldopen" id="foldopen00373" data-start="{" data-end="}">
+<div class="line"><a id="l00373" name="l00373"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">  373</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">load_safe</a>(<span class="keyword">const</span> device U* src, <span class="keyword">const</span> <span class="keywordtype">int</span> ld, <span class="keyword">const</span> short2 src_tile_dims) {</div>
+<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00377" name="l00377"></a><span class="lineno">  377</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span>        MMAFrag_t::load_safe(</div>
+<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j),</div>
+<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>            src,</div>
+<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span>            ld,</div>
+<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;1&gt;</a>{},</div>
+<div class="line"><a id="l00383" name="l00383"></a><span class="lineno">  383</span>            src_tile_dims.y,</div>
+<div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span>            src_tile_dims.x,</div>
+<div class="line"><a id="l00385" name="l00385"></a><span class="lineno">  385</span>            (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>) * w_x,</div>
+<div class="line"><a id="l00386" name="l00386"></a><span class="lineno">  386</span>            (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>) * w_y);</div>
+<div class="line"><a id="l00387" name="l00387"></a><span class="lineno">  387</span>      }</div>
+<div class="line"><a id="l00388" name="l00388"></a><span class="lineno">  388</span>    }</div>
+<div class="line"><a id="l00389" name="l00389"></a><span class="lineno">  389</span>  }</div>
+</div>
+<div class="line"><a id="l00390" name="l00390"></a><span class="lineno">  390</span> </div>
+<div class="line"><a id="l00391" name="l00391"></a><span class="lineno">  391</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> w_x, <span class="keywordtype">int</span> w_y&gt;</div>
+<div class="line"><a id="l00392" name="l00392"></a><span class="lineno">  392</span>  METAL_FUNC <span class="keywordtype">void</span></div>
+<div class="foldopen" id="foldopen00393" data-start="{" data-end="}">
+<div class="line"><a id="l00393" name="l00393"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba">  393</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba">store_safe</a>(device U* dst, <span class="keyword">const</span> <span class="keywordtype">int</span> ld, <span class="keyword">const</span> short2 dst_tile_dims)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00394" name="l00394"></a><span class="lineno">  394</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00395" name="l00395"></a><span class="lineno">  395</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a>; ++i) {</div>
+<div class="line"><a id="l00396" name="l00396"></a><span class="lineno">  396</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00397" name="l00397"></a><span class="lineno">  397</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>; ++j) {</div>
+<div class="line"><a id="l00398" name="l00398"></a><span class="lineno">  398</span>        MMAFrag_t::store_safe(</div>
+<div class="line"><a id="l00399" name="l00399"></a><span class="lineno">  399</span>            <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j),</div>
+<div class="line"><a id="l00400" name="l00400"></a><span class="lineno">  400</span>            dst,</div>
+<div class="line"><a id="l00401" name="l00401"></a><span class="lineno">  401</span>            ld,</div>
+<div class="line"><a id="l00402" name="l00402"></a><span class="lineno">  402</span>            <a class="code hl_struct" href="structmlx_1_1steel_1_1integral__constant.html">Int&lt;1&gt;</a>{},</div>
+<div class="line"><a id="l00403" name="l00403"></a><span class="lineno">  403</span>            dst_tile_dims.y,</div>
+<div class="line"><a id="l00404" name="l00404"></a><span class="lineno">  404</span>            dst_tile_dims.x,</div>
+<div class="line"><a id="l00405" name="l00405"></a><span class="lineno">  405</span>            (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>) * w_x,</div>
+<div class="line"><a id="l00406" name="l00406"></a><span class="lineno">  406</span>            (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>) * w_y);</div>
+<div class="line"><a id="l00407" name="l00407"></a><span class="lineno">  407</span>      }</div>
+<div class="line"><a id="l00408" name="l00408"></a><span class="lineno">  408</span>    }</div>
+<div class="line"><a id="l00409" name="l00409"></a><span class="lineno">  409</span>  }</div>
+</div>
+<div class="line"><a id="l00410" name="l00410"></a><span class="lineno">  410</span>};</div>
+<div class="line"><a id="l00411" name="l00411"></a><span class="lineno">  411</span> </div>
+<div class="line"><a id="l00412" name="l00412"></a><span class="lineno">  412</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> M, <span class="keywordtype">int</span> N, <span class="keywordtype">int</span> K&gt;</div>
+<div class="foldopen" id="foldopen00413" data-start="{" data-end="}">
+<div class="line"><a id="l00413" name="l00413"></a><span class="lineno"><a class="line" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">  413</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(</div>
+<div class="line"><a id="l00414" name="l00414"></a><span class="lineno">  414</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;T, M, N&gt;</a>&amp; D,</div>
+<div class="line"><a id="l00415" name="l00415"></a><span class="lineno">  415</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;U, M, K&gt;</a>&amp; A,</div>
+<div class="line"><a id="l00416" name="l00416"></a><span class="lineno">  416</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;U, K, N&gt;</a>&amp; B,</div>
+<div class="line"><a id="l00417" name="l00417"></a><span class="lineno">  417</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;T, M, N&gt;</a>&amp; C) {</div>
+<div class="line"><a id="l00418" name="l00418"></a><span class="lineno">  418</span>  <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00419" name="l00419"></a><span class="lineno">  419</span>  <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; K; ++k) {</div>
+<div class="line"><a id="l00420" name="l00420"></a><span class="lineno">  420</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00421" name="l00421"></a><span class="lineno">  421</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> m = 0; m &lt; M; ++m) {</div>
+<div class="line"><a id="l00422" name="l00422"></a><span class="lineno">  422</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00423" name="l00423"></a><span class="lineno">  423</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> n = 0; n &lt; N; ++n) {</div>
+<div class="line"><a id="l00424" name="l00424"></a><span class="lineno">  424</span>        <span class="keywordtype">short</span> n_serp = (m % 2) ? (N - 1 - n) : n;</div>
+<div class="line"><a id="l00425" name="l00425"></a><span class="lineno">  425</span>        <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;T, M, N&gt;::MMAFrag_t::mma</a>(</div>
+<div class="line"><a id="l00426" name="l00426"></a><span class="lineno">  426</span>            D.frag_at(m, n_serp),</div>
+<div class="line"><a id="l00427" name="l00427"></a><span class="lineno">  427</span>            A.frag_at(m, k),</div>
+<div class="line"><a id="l00428" name="l00428"></a><span class="lineno">  428</span>            B.frag_at(k, n_serp),</div>
+<div class="line"><a id="l00429" name="l00429"></a><span class="lineno">  429</span>            C.frag_at(m, n_serp));</div>
+<div class="line"><a id="l00430" name="l00430"></a><span class="lineno">  430</span>      }</div>
+<div class="line"><a id="l00431" name="l00431"></a><span class="lineno">  431</span>    }</div>
+<div class="line"><a id="l00432" name="l00432"></a><span class="lineno">  432</span>  }</div>
+<div class="line"><a id="l00433" name="l00433"></a><span class="lineno">  433</span>}</div>
+</div>
+<div class="line"><a id="l00434" name="l00434"></a><span class="lineno">  434</span> </div>
+<div class="line"><a id="l00435" name="l00435"></a><span class="lineno">  435</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00436" name="l00436"></a><span class="lineno">  436</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00437" name="l00437"></a><span class="lineno">  437</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00438" name="l00438"></a><span class="lineno">  438</span>    <span class="keywordtype">int</span> BM,</div>
+<div class="line"><a id="l00439" name="l00439"></a><span class="lineno">  439</span>    <span class="keywordtype">int</span> BN,</div>
+<div class="line"><a id="l00440" name="l00440"></a><span class="lineno">  440</span>    <span class="keywordtype">int</span> BK,</div>
+<div class="line"><a id="l00441" name="l00441"></a><span class="lineno">  441</span>    <span class="keywordtype">int</span> WM,</div>
+<div class="line"><a id="l00442" name="l00442"></a><span class="lineno">  442</span>    <span class="keywordtype">int</span> WN,</div>
+<div class="line"><a id="l00443" name="l00443"></a><span class="lineno">  443</span>    <span class="keywordtype">bool</span> transpose_a,</div>
+<div class="line"><a id="l00444" name="l00444"></a><span class="lineno">  444</span>    <span class="keywordtype">bool</span> transpose_b,</div>
+<div class="line"><a id="l00445" name="l00445"></a><span class="lineno">  445</span>    <span class="keywordtype">short</span> lda_tgp,</div>
+<div class="line"><a id="l00446" name="l00446"></a><span class="lineno">  446</span>    <span class="keywordtype">short</span> ldb_tgp,</div>
+<div class="line"><a id="l00447" name="l00447"></a><span class="lineno">  447</span>    <span class="keyword">typename</span> AccumType = float,</div>
+<div class="line"><a id="l00448" name="l00448"></a><span class="lineno">  448</span>    <span class="keyword">typename</span> Epilogue = TransformNone&lt;U, AccumType&gt;&gt;</div>
+<div class="line"><a id="l00449" name="l00449"></a><span class="lineno">  449</span><span class="keyword">struct </span>BlockMMA {</div>
+<div class="line"><a id="l00450" name="l00450"></a><span class="lineno">  450</span>  <span class="comment">// MMAFrag size</span></div>
+<div class="line"><a id="l00451" name="l00451"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">  451</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> = 8;</div>
+<div class="line"><a id="l00452" name="l00452"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8231b0e3475077c1381eb8f5daf62e35">  452</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">MMAFrag_acc_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag&lt;AccumType, kFragSize, kFragSize&gt;</a>;</div>
+<div class="line"><a id="l00453" name="l00453"></a><span class="lineno">  453</span> </div>
+<div class="line"><a id="l00454" name="l00454"></a><span class="lineno">  454</span>  <span class="comment">// Warp tile simdgroup matrix strides along M</span></div>
+<div class="line"><a id="l00455" name="l00455"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">  455</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WM;</div>
+<div class="line"><a id="l00456" name="l00456"></a><span class="lineno">  456</span>  <span class="comment">// Warp tile simdgroup matrix strides along N</span></div>
+<div class="line"><a id="l00457" name="l00457"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">  457</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WN;</div>
+<div class="line"><a id="l00458" name="l00458"></a><span class="lineno">  458</span> </div>
+<div class="line"><a id="l00459" name="l00459"></a><span class="lineno">  459</span>  <span class="comment">// Warp tile size along M</span></div>
+<div class="line"><a id="l00460" name="l00460"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">  460</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a> = BM / <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>;</div>
+<div class="line"><a id="l00461" name="l00461"></a><span class="lineno">  461</span>  <span class="comment">// Warp tile size along N</span></div>
+<div class="line"><a id="l00462" name="l00462"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">  462</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a> = BN / <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>;</div>
+<div class="line"><a id="l00463" name="l00463"></a><span class="lineno">  463</span> </div>
+<div class="line"><a id="l00464" name="l00464"></a><span class="lineno">  464</span>  <span class="comment">// Threadgroup A strides</span></div>
+<div class="line"><a id="l00465" name="l00465"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">  465</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">A_str_m</a> = transpose_a ? 1 : lda_tgp; <span class="comment">// M</span></div>
+<div class="line"><a id="l00466" name="l00466"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">  466</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">A_str_k</a> = transpose_a ? lda_tgp : 1; <span class="comment">// K</span></div>
+<div class="line"><a id="l00467" name="l00467"></a><span class="lineno">  467</span> </div>
+<div class="line"><a id="l00468" name="l00468"></a><span class="lineno">  468</span>  <span class="comment">// Threadgroup B strides</span></div>
+<div class="line"><a id="l00469" name="l00469"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">  469</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a> = transpose_b ? 1 : ldb_tgp; <span class="comment">// K</span></div>
+<div class="line"><a id="l00470" name="l00470"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">  470</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">B_str_n</a> = transpose_b ? ldb_tgp : 1; <span class="comment">// N</span></div>
+<div class="line"><a id="l00471" name="l00471"></a><span class="lineno">  471</span> </div>
+<div class="line"><a id="l00472" name="l00472"></a><span class="lineno">  472</span>  <span class="comment">// Threadgroup strides along K</span></div>
+<div class="line"><a id="l00473" name="l00473"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">  473</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">tile_stride_a</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">A_str_k</a>;</div>
+<div class="line"><a id="l00474" name="l00474"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">  474</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">tile_stride_b</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a>;</div>
+<div class="line"><a id="l00475" name="l00475"></a><span class="lineno">  475</span> </div>
+<div class="line"><a id="l00476" name="l00476"></a><span class="lineno">  476</span>  <span class="comment">// Simdgroup matrices</span></div>
+<div class="line"><a id="l00477" name="l00477"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">  477</a></span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TM, 1, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">Atile</a>;</div>
+<div class="line"><a id="l00478" name="l00478"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">  478</a></span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, 1, TN, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">Btile</a>;</div>
+<div class="line"><a id="l00479" name="l00479"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">  479</a></span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TM, TN, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>;</div>
+<div class="line"><a id="l00480" name="l00480"></a><span class="lineno">  480</span> </div>
+<div class="line"><a id="l00481" name="l00481"></a><span class="lineno">  481</span>  <span class="comment">// Offsets within threadgroup</span></div>
+<div class="line"><a id="l00482" name="l00482"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">  482</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>;</div>
+<div class="line"><a id="l00483" name="l00483"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">  483</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
+<div class="line"><a id="l00484" name="l00484"></a><span class="lineno">  484</span> </div>
+<div class="line"><a id="l00485" name="l00485"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">  485</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">As_offset</a>;</div>
+<div class="line"><a id="l00486" name="l00486"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">  486</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">Bs_offset</a>;</div>
+<div class="line"><a id="l00487" name="l00487"></a><span class="lineno">  487</span> </div>
+<div class="line"><a id="l00488" name="l00488"></a><span class="lineno">  488</span>  <span class="comment">/* Constructor */</span></div>
+<div class="foldopen" id="foldopen00489" data-start="{" data-end="}">
+<div class="line"><a id="l00489" name="l00489"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">  489</a></span>  METAL_FUNC <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">BlockMMA</a>(</div>
+<div class="line"><a id="l00490" name="l00490"></a><span class="lineno">  490</span>      ushort simd_group_id [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00491" name="l00491"></a><span class="lineno">  491</span>      ushort simd_lane_id [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00492" name="l00492"></a><span class="lineno">  492</span>    <span class="comment">// Determine thread position in simdgroup matrix</span></div>
+<div class="line"><a id="l00493" name="l00493"></a><span class="lineno">  493</span>    <span class="keywordtype">short</span> tm = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * (simd_group_id / WN);</div>
+<div class="line"><a id="l00494" name="l00494"></a><span class="lineno">  494</span>    <span class="keywordtype">short</span> tn = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * (simd_group_id % WN);</div>
+<div class="line"><a id="l00495" name="l00495"></a><span class="lineno">  495</span> </div>
+<div class="line"><a id="l00496" name="l00496"></a><span class="lineno">  496</span>    short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);</div>
+<div class="line"><a id="l00497" name="l00497"></a><span class="lineno">  497</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a> = simd_coord.y;</div>
+<div class="line"><a id="l00498" name="l00498"></a><span class="lineno">  498</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a> = simd_coord.x;</div>
+<div class="line"><a id="l00499" name="l00499"></a><span class="lineno">  499</span> </div>
+<div class="line"><a id="l00500" name="l00500"></a><span class="lineno">  500</span>    <span class="comment">// Determine thread and simdgroup offset</span></div>
+<div class="line"><a id="l00501" name="l00501"></a><span class="lineno">  501</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">As_offset</a> = (tm + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>) * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">A_str_m</a> + (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>)*<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">A_str_k</a>; <span class="comment">// M, K</span></div>
+<div class="line"><a id="l00502" name="l00502"></a><span class="lineno">  502</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">Bs_offset</a> = (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a> + (tn + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>) * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">B_str_n</a>; <span class="comment">// K, N</span></div>
+<div class="line"><a id="l00503" name="l00503"></a><span class="lineno">  503</span> </div>
+<div class="line"><a id="l00504" name="l00504"></a><span class="lineno">  504</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a> += tm;</div>
+<div class="line"><a id="l00505" name="l00505"></a><span class="lineno">  505</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a> += tn;</div>
+<div class="line"><a id="l00506" name="l00506"></a><span class="lineno">  506</span>  }</div>
+</div>
+<div class="line"><a id="l00507" name="l00507"></a><span class="lineno">  507</span> </div>
+<div class="line"><a id="l00508" name="l00508"></a><span class="lineno">  508</span>  <span class="comment">/* (BM, BK) X (BK, BN) multiply accumulate function */</span></div>
+<div class="foldopen" id="foldopen00509" data-start="{" data-end="}">
+<div class="line"><a id="l00509" name="l00509"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0">  509</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0">mma</a>(<span class="keyword">const</span> threadgroup T* As, <span class="keyword">const</span> threadgroup T* Bs) {</div>
+<div class="line"><a id="l00510" name="l00510"></a><span class="lineno">  510</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
+<div class="line"><a id="l00511" name="l00511"></a><span class="lineno">  511</span>    As += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">As_offset</a>;</div>
+<div class="line"><a id="l00512" name="l00512"></a><span class="lineno">  512</span>    Bs += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">Bs_offset</a>;</div>
+<div class="line"><a id="l00513" name="l00513"></a><span class="lineno">  513</span> </div>
+<div class="line"><a id="l00514" name="l00514"></a><span class="lineno">  514</span>    <span class="comment">// Iterate over BK in blocks of kFragSize</span></div>
+<div class="line"><a id="l00515" name="l00515"></a><span class="lineno">  515</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00516" name="l00516"></a><span class="lineno">  516</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> kk = 0; kk &lt; BK; kk += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>) {</div>
+<div class="line"><a id="l00517" name="l00517"></a><span class="lineno">  517</span>      simdgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00518" name="l00518"></a><span class="lineno">  518</span> </div>
+<div class="line"><a id="l00519" name="l00519"></a><span class="lineno">  519</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">Atile</a>.template load&lt;T, WM, 1, A_str_m, A_str_k&gt;(As);</div>
+<div class="line"><a id="l00520" name="l00520"></a><span class="lineno">  520</span> </div>
+<div class="line"><a id="l00521" name="l00521"></a><span class="lineno">  521</span>      simdgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00522" name="l00522"></a><span class="lineno">  522</span> </div>
+<div class="line"><a id="l00523" name="l00523"></a><span class="lineno">  523</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">Btile</a>.template load&lt;T, 1, WN, B_str_k, B_str_n&gt;(Bs);</div>
+<div class="line"><a id="l00524" name="l00524"></a><span class="lineno">  524</span> </div>
+<div class="line"><a id="l00525" name="l00525"></a><span class="lineno">  525</span>      simdgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00526" name="l00526"></a><span class="lineno">  526</span> </div>
+<div class="line"><a id="l00527" name="l00527"></a><span class="lineno">  527</span>      <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">Atile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">Btile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>);</div>
+<div class="line"><a id="l00528" name="l00528"></a><span class="lineno">  528</span> </div>
+<div class="line"><a id="l00529" name="l00529"></a><span class="lineno">  529</span>      <span class="comment">// Progress to next simdgroup tile</span></div>
+<div class="line"><a id="l00530" name="l00530"></a><span class="lineno">  530</span>      As += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">tile_stride_a</a>;</div>
+<div class="line"><a id="l00531" name="l00531"></a><span class="lineno">  531</span>      Bs += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">tile_stride_b</a>;</div>
+<div class="line"><a id="l00532" name="l00532"></a><span class="lineno">  532</span>    }</div>
+<div class="line"><a id="l00533" name="l00533"></a><span class="lineno">  533</span>  }</div>
+</div>
+<div class="line"><a id="l00534" name="l00534"></a><span class="lineno">  534</span> </div>
+<div class="line"><a id="l00535" name="l00535"></a><span class="lineno">  535</span>  <span class="comment">/* Store results from simdgroup_matrix results into device memory */</span></div>
+<div class="foldopen" id="foldopen00536" data-start="{" data-end="}">
+<div class="line"><a id="l00536" name="l00536"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">  536</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">store_result</a>(device U* D, <span class="keyword">const</span> <span class="keywordtype">int</span> ldd) {</div>
+<div class="line"><a id="l00537" name="l00537"></a><span class="lineno">  537</span>    <span class="comment">// Apply epilogue</span></div>
+<div class="line"><a id="l00538" name="l00538"></a><span class="lineno">  538</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00539" name="l00539"></a><span class="lineno">  539</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerTile; i++) {</div>
+<div class="line"><a id="l00540" name="l00540"></a><span class="lineno">  540</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i] = Epilogue::apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i]);</div>
+<div class="line"><a id="l00541" name="l00541"></a><span class="lineno">  541</span>    }</div>
+<div class="line"><a id="l00542" name="l00542"></a><span class="lineno">  542</span> </div>
+<div class="line"><a id="l00543" name="l00543"></a><span class="lineno">  543</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
+<div class="line"><a id="l00544" name="l00544"></a><span class="lineno">  544</span>    D += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a> * ldd + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
+<div class="line"><a id="l00545" name="l00545"></a><span class="lineno">  545</span> </div>
+<div class="line"><a id="l00546" name="l00546"></a><span class="lineno">  546</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.template store&lt;U, WM, WN&gt;(D, ldd);</div>
+<div class="line"><a id="l00547" name="l00547"></a><span class="lineno">  547</span>  }</div>
+</div>
+<div class="line"><a id="l00548" name="l00548"></a><span class="lineno">  548</span> </div>
+<div class="line"><a id="l00549" name="l00549"></a><span class="lineno">  549</span>  METAL_FUNC <span class="keywordtype">void</span></div>
+<div class="foldopen" id="foldopen00550" data-start="{" data-end="}">
+<div class="line"><a id="l00550" name="l00550"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">  550</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">store_result_safe</a>(device U* D, <span class="keyword">const</span> <span class="keywordtype">int</span> ldd, short2 dst_tile_dims) {</div>
+<div class="line"><a id="l00551" name="l00551"></a><span class="lineno">  551</span>    <span class="comment">// Apply epilogue</span></div>
+<div class="line"><a id="l00552" name="l00552"></a><span class="lineno">  552</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00553" name="l00553"></a><span class="lineno">  553</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerTile; i++) {</div>
+<div class="line"><a id="l00554" name="l00554"></a><span class="lineno">  554</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i] = Epilogue::apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i]);</div>
+<div class="line"><a id="l00555" name="l00555"></a><span class="lineno">  555</span>    }</div>
+<div class="line"><a id="l00556" name="l00556"></a><span class="lineno">  556</span> </div>
+<div class="line"><a id="l00557" name="l00557"></a><span class="lineno">  557</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
+<div class="line"><a id="l00558" name="l00558"></a><span class="lineno">  558</span>    D += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a> * ldd + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
+<div class="line"><a id="l00559" name="l00559"></a><span class="lineno">  559</span>    dst_tile_dims -= short2(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>);</div>
+<div class="line"><a id="l00560" name="l00560"></a><span class="lineno">  560</span> </div>
+<div class="line"><a id="l00561" name="l00561"></a><span class="lineno">  561</span>    <span class="keywordflow">if</span> (dst_tile_dims.x &lt;= 0 || dst_tile_dims.y &lt;= 0)</div>
+<div class="line"><a id="l00562" name="l00562"></a><span class="lineno">  562</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00563" name="l00563"></a><span class="lineno">  563</span> </div>
+<div class="line"><a id="l00564" name="l00564"></a><span class="lineno">  564</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.template store_safe&lt;U, WM, WN&gt;(D, ldd, dst_tile_dims);</div>
+<div class="line"><a id="l00565" name="l00565"></a><span class="lineno">  565</span>  }</div>
+</div>
+<div class="line"><a id="l00566" name="l00566"></a><span class="lineno">  566</span> </div>
+<div class="line"><a id="l00567" name="l00567"></a><span class="lineno">  567</span>  <span class="comment">/* Apply epilogue */</span></div>
+<div class="line"><a id="l00568" name="l00568"></a><span class="lineno">  568</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> UnaryEpilogue&gt;</div>
+<div class="foldopen" id="foldopen00569" data-start="{" data-end="}">
+<div class="line"><a id="l00569" name="l00569"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">  569</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">apply_epilogue</a>(thread <span class="keyword">const</span> UnaryEpilogue&amp; epilogue_op) {</div>
+<div class="line"><a id="l00570" name="l00570"></a><span class="lineno">  570</span>    <span class="comment">// Loop over all simdgroup tiles</span></div>
+<div class="line"><a id="l00571" name="l00571"></a><span class="lineno">  571</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00572" name="l00572"></a><span class="lineno">  572</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerTile; i++) {</div>
+<div class="line"><a id="l00573" name="l00573"></a><span class="lineno">  573</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i] = epilogue_op.apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i]);</div>
+<div class="line"><a id="l00574" name="l00574"></a><span class="lineno">  574</span>    }</div>
+<div class="line"><a id="l00575" name="l00575"></a><span class="lineno">  575</span>  }</div>
+</div>
+<div class="line"><a id="l00576" name="l00576"></a><span class="lineno">  576</span> </div>
+<div class="line"><a id="l00577" name="l00577"></a><span class="lineno">  577</span>  <span class="comment">/* Apply epilogue */</span></div>
+<div class="line"><a id="l00578" name="l00578"></a><span class="lineno">  578</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> BinaryEpilogue&gt;</div>
+<div class="foldopen" id="foldopen00579" data-start="{" data-end="}">
+<div class="line"><a id="l00579" name="l00579"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae">  579</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae">apply_epilogue</a>(</div>
+<div class="line"><a id="l00580" name="l00580"></a><span class="lineno">  580</span>      <span class="keyword">const</span> device U* C,</div>
+<div class="line"><a id="l00581" name="l00581"></a><span class="lineno">  581</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> ldc,</div>
+<div class="line"><a id="l00582" name="l00582"></a><span class="lineno">  582</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> fdc,</div>
+<div class="line"><a id="l00583" name="l00583"></a><span class="lineno">  583</span>      thread <span class="keyword">const</span> BinaryEpilogue&amp; epilogue_op) {</div>
+<div class="line"><a id="l00584" name="l00584"></a><span class="lineno">  584</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
+<div class="line"><a id="l00585" name="l00585"></a><span class="lineno">  585</span>    C += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldc + (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>)*fdc;</div>
+<div class="line"><a id="l00586" name="l00586"></a><span class="lineno">  586</span> </div>
+<div class="line"><a id="l00587" name="l00587"></a><span class="lineno">  587</span>    <span class="comment">// Loop over all simdgroup tiles</span></div>
+<div class="line"><a id="l00588" name="l00588"></a><span class="lineno">  588</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00589" name="l00589"></a><span class="lineno">  589</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>; i++) {</div>
+<div class="line"><a id="l00590" name="l00590"></a><span class="lineno">  590</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00591" name="l00591"></a><span class="lineno">  591</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
+<div class="line"><a id="l00592" name="l00592"></a><span class="lineno">  592</span>        <span class="comment">// Get accumulated result and associated offset in C</span></div>
+<div class="line"><a id="l00593" name="l00593"></a><span class="lineno">  593</span>        thread <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00594" name="l00594"></a><span class="lineno">  594</span>        <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
+<div class="line"><a id="l00595" name="l00595"></a><span class="lineno">  595</span> </div>
+<div class="line"><a id="l00596" name="l00596"></a><span class="lineno">  596</span>        <span class="comment">// Apply epilogue</span></div>
+<div class="line"><a id="l00597" name="l00597"></a><span class="lineno">  597</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00598" name="l00598"></a><span class="lineno">  598</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag; k++) {</div>
+<div class="line"><a id="l00599" name="l00599"></a><span class="lineno">  599</span>          accum[k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);</div>
+<div class="line"><a id="l00600" name="l00600"></a><span class="lineno">  600</span>        }</div>
+<div class="line"><a id="l00601" name="l00601"></a><span class="lineno">  601</span>      }</div>
+<div class="line"><a id="l00602" name="l00602"></a><span class="lineno">  602</span>    }</div>
+<div class="line"><a id="l00603" name="l00603"></a><span class="lineno">  603</span>  }</div>
+</div>
+<div class="line"><a id="l00604" name="l00604"></a><span class="lineno">  604</span> </div>
+<div class="line"><a id="l00605" name="l00605"></a><span class="lineno">  605</span>  <span class="comment">/* Apply epilogue */</span></div>
+<div class="line"><a id="l00606" name="l00606"></a><span class="lineno">  606</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> BinaryEpilogue&gt;</div>
+<div class="foldopen" id="foldopen00607" data-start="{" data-end="}">
+<div class="line"><a id="l00607" name="l00607"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">  607</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">apply_epilogue_safe</a>(</div>
+<div class="line"><a id="l00608" name="l00608"></a><span class="lineno">  608</span>      <span class="keyword">const</span> device U* C,</div>
+<div class="line"><a id="l00609" name="l00609"></a><span class="lineno">  609</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> ldc,</div>
+<div class="line"><a id="l00610" name="l00610"></a><span class="lineno">  610</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> fdc,</div>
+<div class="line"><a id="l00611" name="l00611"></a><span class="lineno">  611</span>      short2 dst_tile_dims,</div>
+<div class="line"><a id="l00612" name="l00612"></a><span class="lineno">  612</span>      thread <span class="keyword">const</span> BinaryEpilogue&amp; epilogue_op) {</div>
+<div class="line"><a id="l00613" name="l00613"></a><span class="lineno">  613</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
+<div class="line"><a id="l00614" name="l00614"></a><span class="lineno">  614</span>    C += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldc + (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>)*fdc;</div>
+<div class="line"><a id="l00615" name="l00615"></a><span class="lineno">  615</span>    dst_tile_dims -= short2(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>);</div>
+<div class="line"><a id="l00616" name="l00616"></a><span class="lineno">  616</span> </div>
+<div class="line"><a id="l00617" name="l00617"></a><span class="lineno">  617</span>    <span class="keywordflow">if</span> (dst_tile_dims.x &lt;= 0 || dst_tile_dims.y &lt;= 0)</div>
+<div class="line"><a id="l00618" name="l00618"></a><span class="lineno">  618</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00619" name="l00619"></a><span class="lineno">  619</span> </div>
+<div class="line"><a id="l00620" name="l00620"></a><span class="lineno">  620</span>    <span class="comment">// Loop over all simdgroup tiles</span></div>
+<div class="line"><a id="l00621" name="l00621"></a><span class="lineno">  621</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00622" name="l00622"></a><span class="lineno">  622</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>; i++) {</div>
+<div class="line"><a id="l00623" name="l00623"></a><span class="lineno">  623</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00624" name="l00624"></a><span class="lineno">  624</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
+<div class="line"><a id="l00625" name="l00625"></a><span class="lineno">  625</span>        <span class="comment">// Get accumulated result and associated offset in C</span></div>
+<div class="line"><a id="l00626" name="l00626"></a><span class="lineno">  626</span>        thread <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00627" name="l00627"></a><span class="lineno">  627</span>        <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
+<div class="line"><a id="l00628" name="l00628"></a><span class="lineno">  628</span> </div>
+<div class="line"><a id="l00629" name="l00629"></a><span class="lineno">  629</span>        <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag;</div>
+<div class="line"><a id="l00630" name="l00630"></a><span class="lineno">  630</span> </div>
+<div class="line"><a id="l00631" name="l00631"></a><span class="lineno">  631</span>        <span class="comment">// Read C</span></div>
+<div class="line"><a id="l00632" name="l00632"></a><span class="lineno">  632</span>        U c_elems[kelems] = {0};</div>
+<div class="line"><a id="l00633" name="l00633"></a><span class="lineno">  633</span> </div>
+<div class="line"><a id="l00634" name="l00634"></a><span class="lineno">  634</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00635" name="l00635"></a><span class="lineno">  635</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; kelems; k++) {</div>
+<div class="line"><a id="l00636" name="l00636"></a><span class="lineno">  636</span>          <span class="keywordflow">if</span> ((j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a> + k) &lt; dst_tile_dims.x) {</div>
+<div class="line"><a id="l00637" name="l00637"></a><span class="lineno">  637</span>            c_elems[k] = C[offset_c + k * fdc];</div>
+<div class="line"><a id="l00638" name="l00638"></a><span class="lineno">  638</span>          }</div>
+<div class="line"><a id="l00639" name="l00639"></a><span class="lineno">  639</span>        }</div>
+<div class="line"><a id="l00640" name="l00640"></a><span class="lineno">  640</span> </div>
+<div class="line"><a id="l00641" name="l00641"></a><span class="lineno">  641</span>        <span class="comment">// Apply epilogue</span></div>
+<div class="line"><a id="l00642" name="l00642"></a><span class="lineno">  642</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00643" name="l00643"></a><span class="lineno">  643</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; kelems; k++) {</div>
+<div class="line"><a id="l00644" name="l00644"></a><span class="lineno">  644</span>          accum[k] = epilogue_op.apply(accum[k], c_elems[k]);</div>
+<div class="line"><a id="l00645" name="l00645"></a><span class="lineno">  645</span>        }</div>
+<div class="line"><a id="l00646" name="l00646"></a><span class="lineno">  646</span>      }</div>
+<div class="line"><a id="l00647" name="l00647"></a><span class="lineno">  647</span>    }</div>
+<div class="line"><a id="l00648" name="l00648"></a><span class="lineno">  648</span>  }</div>
+</div>
+<div class="line"><a id="l00649" name="l00649"></a><span class="lineno">  649</span> </div>
+<div class="line"><a id="l00650" name="l00650"></a><span class="lineno">  650</span>  <span class="comment">/* Store results from simdgroup_matrix results into device memory */</span></div>
+<div class="foldopen" id="foldopen00651" data-start="{" data-end="}">
+<div class="line"><a id="l00651" name="l00651"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3">  651</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3">store_result</a>(</div>
+<div class="line"><a id="l00652" name="l00652"></a><span class="lineno">  652</span>      device U* D,</div>
+<div class="line"><a id="l00653" name="l00653"></a><span class="lineno">  653</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> ldd,</div>
+<div class="line"><a id="l00654" name="l00654"></a><span class="lineno">  654</span>      <span class="keyword">const</span> device U* C,</div>
+<div class="line"><a id="l00655" name="l00655"></a><span class="lineno">  655</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> ldc,</div>
+<div class="line"><a id="l00656" name="l00656"></a><span class="lineno">  656</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> fdc,</div>
+<div class="line"><a id="l00657" name="l00657"></a><span class="lineno">  657</span>      thread <span class="keyword">const</span> Epilogue&amp; epilogue_op)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00658" name="l00658"></a><span class="lineno">  658</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
+<div class="line"><a id="l00659" name="l00659"></a><span class="lineno">  659</span>    C += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldc + (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>)*fdc;</div>
+<div class="line"><a id="l00660" name="l00660"></a><span class="lineno">  660</span>    D += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldd + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
+<div class="line"><a id="l00661" name="l00661"></a><span class="lineno">  661</span> </div>
+<div class="line"><a id="l00662" name="l00662"></a><span class="lineno">  662</span>    <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag;</div>
+<div class="line"><a id="l00663" name="l00663"></a><span class="lineno">  663</span> </div>
+<div class="line"><a id="l00664" name="l00664"></a><span class="lineno">  664</span>    <span class="comment">// Loop over all simdgroup tiles</span></div>
+<div class="line"><a id="l00665" name="l00665"></a><span class="lineno">  665</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00666" name="l00666"></a><span class="lineno">  666</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>; i++) {</div>
+<div class="line"><a id="l00667" name="l00667"></a><span class="lineno">  667</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00668" name="l00668"></a><span class="lineno">  668</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
+<div class="line"><a id="l00669" name="l00669"></a><span class="lineno">  669</span>        <span class="comment">// Get accumulated result and associated offset in C</span></div>
+<div class="line"><a id="l00670" name="l00670"></a><span class="lineno">  670</span>        thread <span class="keyword">const</span> <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00671" name="l00671"></a><span class="lineno">  671</span>        <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
+<div class="line"><a id="l00672" name="l00672"></a><span class="lineno">  672</span>        <span class="keywordtype">int</span> offset_d = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldd + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>);</div>
+<div class="line"><a id="l00673" name="l00673"></a><span class="lineno">  673</span> </div>
+<div class="line"><a id="l00674" name="l00674"></a><span class="lineno">  674</span>        <span class="comment">// Apply epilogue</span></div>
+<div class="line"><a id="l00675" name="l00675"></a><span class="lineno">  675</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00676" name="l00676"></a><span class="lineno">  676</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; kelems; k++) {</div>
+<div class="line"><a id="l00677" name="l00677"></a><span class="lineno">  677</span>          D[offset_d + k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);</div>
+<div class="line"><a id="l00678" name="l00678"></a><span class="lineno">  678</span>        }</div>
+<div class="line"><a id="l00679" name="l00679"></a><span class="lineno">  679</span>      }</div>
+<div class="line"><a id="l00680" name="l00680"></a><span class="lineno">  680</span>    }</div>
+<div class="line"><a id="l00681" name="l00681"></a><span class="lineno">  681</span>  }</div>
+</div>
+<div class="line"><a id="l00682" name="l00682"></a><span class="lineno">  682</span> </div>
+<div class="foldopen" id="foldopen00683" data-start="{" data-end="}">
+<div class="line"><a id="l00683" name="l00683"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391">  683</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391">store_result_safe</a>(</div>
+<div class="line"><a id="l00684" name="l00684"></a><span class="lineno">  684</span>      device U* D,</div>
+<div class="line"><a id="l00685" name="l00685"></a><span class="lineno">  685</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> ldd,</div>
+<div class="line"><a id="l00686" name="l00686"></a><span class="lineno">  686</span>      <span class="keyword">const</span> device U* C,</div>
+<div class="line"><a id="l00687" name="l00687"></a><span class="lineno">  687</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> ldc,</div>
+<div class="line"><a id="l00688" name="l00688"></a><span class="lineno">  688</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> fdc,</div>
+<div class="line"><a id="l00689" name="l00689"></a><span class="lineno">  689</span>      short2 dst_tile_dims,</div>
+<div class="line"><a id="l00690" name="l00690"></a><span class="lineno">  690</span>      thread <span class="keyword">const</span> Epilogue&amp; epilogue_op)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00691" name="l00691"></a><span class="lineno">  691</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
+<div class="line"><a id="l00692" name="l00692"></a><span class="lineno">  692</span>    C += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldc + (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>)*fdc;</div>
+<div class="line"><a id="l00693" name="l00693"></a><span class="lineno">  693</span>    D += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldd + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
+<div class="line"><a id="l00694" name="l00694"></a><span class="lineno">  694</span>    dst_tile_dims -= short2(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>);</div>
+<div class="line"><a id="l00695" name="l00695"></a><span class="lineno">  695</span> </div>
+<div class="line"><a id="l00696" name="l00696"></a><span class="lineno">  696</span>    <span class="keywordflow">if</span> (dst_tile_dims.x &lt;= 0 || dst_tile_dims.y &lt;= 0)</div>
+<div class="line"><a id="l00697" name="l00697"></a><span class="lineno">  697</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00698" name="l00698"></a><span class="lineno">  698</span> </div>
+<div class="line"><a id="l00699" name="l00699"></a><span class="lineno">  699</span>    <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag;</div>
+<div class="line"><a id="l00700" name="l00700"></a><span class="lineno">  700</span> </div>
+<div class="line"><a id="l00701" name="l00701"></a><span class="lineno">  701</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00702" name="l00702"></a><span class="lineno">  702</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>; i++) {</div>
+<div class="line"><a id="l00703" name="l00703"></a><span class="lineno">  703</span>      <span class="keywordflow">if</span> (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a> &lt; dst_tile_dims.y) {</div>
+<div class="line"><a id="l00704" name="l00704"></a><span class="lineno">  704</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00705" name="l00705"></a><span class="lineno">  705</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
+<div class="line"><a id="l00706" name="l00706"></a><span class="lineno">  706</span>          <span class="comment">// Get accumulated result and associated offset in C</span></div>
+<div class="line"><a id="l00707" name="l00707"></a><span class="lineno">  707</span>          thread <span class="keyword">const</span> <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00708" name="l00708"></a><span class="lineno">  708</span>          <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
+<div class="line"><a id="l00709" name="l00709"></a><span class="lineno">  709</span>          <span class="keywordtype">int</span> offset_d = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldd + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>);</div>
+<div class="line"><a id="l00710" name="l00710"></a><span class="lineno">  710</span> </div>
+<div class="line"><a id="l00711" name="l00711"></a><span class="lineno">  711</span>          <span class="comment">// Apply epilogue</span></div>
+<div class="line"><a id="l00712" name="l00712"></a><span class="lineno">  712</span>          <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00713" name="l00713"></a><span class="lineno">  713</span>          <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; kelems; k++) {</div>
+<div class="line"><a id="l00714" name="l00714"></a><span class="lineno">  714</span>            <span class="keywordflow">if</span> ((j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a> + k) &lt; dst_tile_dims.x) {</div>
+<div class="line"><a id="l00715" name="l00715"></a><span class="lineno">  715</span>              D[offset_d + k] =</div>
+<div class="line"><a id="l00716" name="l00716"></a><span class="lineno">  716</span>                  epilogue_op.apply(accum[k], C[offset_c + k * fdc]);</div>
+<div class="line"><a id="l00717" name="l00717"></a><span class="lineno">  717</span>            }</div>
+<div class="line"><a id="l00718" name="l00718"></a><span class="lineno">  718</span>          }</div>
+<div class="line"><a id="l00719" name="l00719"></a><span class="lineno">  719</span>        }</div>
+<div class="line"><a id="l00720" name="l00720"></a><span class="lineno">  720</span>      }</div>
+<div class="line"><a id="l00721" name="l00721"></a><span class="lineno">  721</span>    }</div>
+<div class="line"><a id="l00722" name="l00722"></a><span class="lineno">  722</span>  }</div>
+</div>
+<div class="line"><a id="l00723" name="l00723"></a><span class="lineno">  723</span>};</div>
+<div class="line"><a id="l00724" name="l00724"></a><span class="lineno">  724</span> </div>
+<div class="line"><a id="l00725" name="l00725"></a><span class="lineno">  725</span>} <span class="comment">// namespace steel</span></div>
+<div class="line"><a id="l00726" name="l00726"></a><span class="lineno">  726</span>} <span class="comment">// namespace mlx</span></div>
+<div class="ttc" id="abackend_2metal_2kernels_2steel_2attn_2transforms_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html">transforms.h</a></div></div>
+<div class="ttc" id="aintegral__constant_8h_html"><div class="ttname"><a href="integral__constant_8h.html">integral_constant.h</a></div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a5017efc9605e069cfb507137cd1a1852"><div class="ttname"><a href="namespacemetal.html#a5017efc9605e069cfb507137cd1a1852">metal::simd_shuffle_xor</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_xor(bfloat16_t data, ushort mask)</div><div class="ttdef"><b>Definition</b> bf16_math.h:377</div></div>
+<div class="ttc" id="anamespacemetal_html_ac82ee6c3fbe9ec5c78c07329424aaec9"><div class="ttname"><a href="namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9">metal::pointer_element_t</a></div><div class="ttdeci">typename pointer_element&lt; remove_cv_t&lt; T &gt; &gt;::type pointer_element_t</div><div class="ttdef"><b>Definition</b> type_traits.h:51</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html_ad583e6038efc119542410f43b603d4ad"><div class="ttname"><a href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">mlx::steel::tile_matmad</a></div><div class="ttdeci">METAL_FUNC void tile_matmad(thread MMATile&lt; T, M, N &gt; &amp;D, thread MMATile&lt; U, M, K &gt; &amp;A, thread MMATile&lt; U, K, N &gt; &amp;B, thread MMATile&lt; T, M, N &gt; &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:413</div></div>
+<div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
+<div class="ttc" id="asteel_2defines_8h_html"><div class="ttname"><a href="steel_2defines_8h.html">defines.h</a></div></div>
+<div class="ttc" id="asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6"><div class="ttname"><a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define STEEL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> defines.h:4</div></div>
+<div class="ttc" id="asteel_2defines_8h_html_a90b91c866313ffa46eff6d9cc944ad2b"><div class="ttname"><a href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a></div><div class="ttdeci">#define STEEL_CONST</div><div class="ttdef"><b>Definition</b> defines.h:3</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a1868f57d57c8adedab2c58492ec76946"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma</a></div><div class="ttdeci">static METAL_FUNC constexpr void mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:180</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a1f0b00daad8eba2f855bb306e70d2328"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe</a></div><div class="ttdeci">static METAL_FUNC constexpr void store_safe(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</div><div class="ttdef"><b>Definition</b> mma.h:138</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a318c4279bdc7b39b7919f108b1cd8010"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::row_bin_op</a></div><div class="ttdeci">static METAL_FUNC constexpr void row_bin_op(thread frag_type &amp;inp_vals, thread T *row_vals)</div><div class="ttdef"><b>Definition</b> mma.h:204</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a51d662e4cff88b5ad17d7c44bb6b6970"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::row_reduce</a></div><div class="ttdeci">static METAL_FUNC constexpr void row_reduce(thread const frag_type &amp;inp_vals, thread T *reduced_vals)</div><div class="ttdef"><b>Definition</b> mma.h:189</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a5ec2e40a8f5ad98c71b825544cdd878b"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::row_frag_type</a></div><div class="ttdeci">metal::vec&lt; T, kElemRows &gt; row_frag_type</div><div class="ttdef"><b>Definition</b> mma.h:62</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a7331fff1d12f2f8b72b0006a3ad0dd83"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::get_coord</a></div><div class="ttdeci">static METAL_FUNC constexpr short2 get_coord(ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> mma.h:65</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a8028512f5a3d2b6acaf966be529627a3"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma</a></div><div class="ttdeci">static METAL_FUNC constexpr void mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:161</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a958b6952cbd9462d7ae9f6e029631887"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mat_type</a></div><div class="ttdeci">metal::simdgroup_matrix&lt; T, kFragRows, kFragCols &gt; mat_type</div><div class="ttdef"><b>Definition</b> mma.h:60</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a9f53a5e9b046b4f217e782b733941b0c"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::frag_type</a></div><div class="ttdeci">metal::vec&lt; T, kElemsPerFrag &gt; frag_type</div><div class="ttdef"><b>Definition</b> mma.h:61</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_aa8f50ea8961ec5b35c1b81366d64f2cb"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store</a></div><div class="ttdeci">static METAL_FUNC constexpr void store(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y)</div><div class="ttdef"><b>Definition</b> mma.h:118</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_aab8dd1c6917247da41dd3a31139a665f"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::col_frag_type</a></div><div class="ttdeci">metal::vec&lt; T, kElemCols &gt; col_frag_type</div><div class="ttdef"><b>Definition</b> mma.h:63</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_ac73006b36fc710feda3a7c796e21415c"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load</a></div><div class="ttdeci">static METAL_FUNC constexpr void load(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y)</div><div class="ttdef"><b>Definition</b> mma.h:75</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_ad22aaee4a2938cbdd315b39eda84e07d"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe</a></div><div class="ttdeci">static METAL_FUNC constexpr void load_safe(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</div><div class="ttdef"><b>Definition</b> mma.h:93</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a></div><div class="ttdef"><b>Definition</b> mma.h:23</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a0461451ffb5041b6a916ea17ed34288b"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">mlx::steel::BlockMMA::store_result</a></div><div class="ttdeci">METAL_FUNC void store_result(device U *D, const int ldd)</div><div class="ttdef"><b>Definition</b> mma.h:536</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a081ba538d30d1d02498a7f341e6bd611"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">mlx::steel::BlockMMA::store_result_safe</a></div><div class="ttdeci">METAL_FUNC void store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)</div><div class="ttdef"><b>Definition</b> mma.h:550</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a138ed1bbad2ca88d3a3c7d162cd36562"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">mlx::steel::BlockMMA::As_offset</a></div><div class="ttdeci">short As_offset</div><div class="ttdef"><b>Definition</b> mma.h:485</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a21b0c40d16eced109bd3196186170bc6"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">mlx::steel::BlockMMA::Ctile</a></div><div class="ttdeci">MMATile&lt; AccumType, TM, TN, MMAFrag_acc_t &gt; Ctile</div><div class="ttdef"><b>Definition</b> mma.h:479</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a257287702dc849d0d8a078fced453142"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">mlx::steel::BlockMMA::A_str_k</a></div><div class="ttdeci">STEEL_CONST short A_str_k</div><div class="ttdef"><b>Definition</b> mma.h:466</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a44fca27c821764317263047a780977b0"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">mlx::steel::BlockMMA::Btile</a></div><div class="ttdeci">MMATile&lt; AccumType, 1, TN, MMAFrag_acc_t &gt; Btile</div><div class="ttdef"><b>Definition</b> mma.h:478</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a47e614120c650f7479db79f23a0df586"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">mlx::steel::BlockMMA::Atile</a></div><div class="ttdeci">MMATile&lt; AccumType, TM, 1, MMAFrag_acc_t &gt; Atile</div><div class="ttdef"><b>Definition</b> mma.h:477</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a49538190209e522ddbef45fe95563d17"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">mlx::steel::BlockMMA::B_str_n</a></div><div class="ttdeci">STEEL_CONST short B_str_n</div><div class="ttdef"><b>Definition</b> mma.h:470</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a5b0029866f493363942133b55bff7307"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">mlx::steel::BlockMMA::TM_stride</a></div><div class="ttdeci">STEEL_CONST short TM_stride</div><div class="ttdef"><b>Definition</b> mma.h:455</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a6a2c2a6d5e767d52c41b42a9d36086b0"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0">mlx::steel::BlockMMA::mma</a></div><div class="ttdeci">METAL_FUNC void mma(const threadgroup T *As, const threadgroup T *Bs)</div><div class="ttdef"><b>Definition</b> mma.h:509</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a706ae779c1f8d2eb18f19c248567d424"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">mlx::steel::BlockMMA::TN</a></div><div class="ttdeci">STEEL_CONST short TN</div><div class="ttdef"><b>Definition</b> mma.h:462</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a7b324c992750ed3aaa4c485f15b2f391"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391">mlx::steel::BlockMMA::store_result_safe</a></div><div class="ttdeci">METAL_FUNC void store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const</div><div class="ttdef"><b>Definition</b> mma.h:683</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a7cf757e9785e23997b1417e024559ed3"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3">mlx::steel::BlockMMA::store_result</a></div><div class="ttdeci">METAL_FUNC void store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const</div><div class="ttdef"><b>Definition</b> mma.h:651</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a823c56cbd2086f10272df7284a5247ae"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae">mlx::steel::BlockMMA::apply_epilogue</a></div><div class="ttdeci">METAL_FUNC void apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)</div><div class="ttdef"><b>Definition</b> mma.h:579</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a8b3690b383afd26563efb38f9c375e50"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">mlx::steel::BlockMMA::TN_stride</a></div><div class="ttdeci">STEEL_CONST short TN_stride</div><div class="ttdef"><b>Definition</b> mma.h:457</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a8fddaa78913cdc8eea5e1cf7d2776330"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">mlx::steel::BlockMMA::tile_stride_a</a></div><div class="ttdeci">STEEL_CONST short tile_stride_a</div><div class="ttdef"><b>Definition</b> mma.h:473</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a92f6aeee432f53638447eac842f43eca"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">mlx::steel::BlockMMA::Bs_offset</a></div><div class="ttdeci">short Bs_offset</div><div class="ttdef"><b>Definition</b> mma.h:486</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a9e48f2d51099ec00171506724faab54a"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">mlx::steel::BlockMMA::apply_epilogue_safe</a></div><div class="ttdeci">METAL_FUNC void apply_epilogue_safe(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)</div><div class="ttdef"><b>Definition</b> mma.h:607</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa14406b7298456ac45d23dd3c4642dd8"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">mlx::steel::BlockMMA::BlockMMA</a></div><div class="ttdeci">METAL_FUNC BlockMMA(ushort simd_group_id, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> mma.h:489</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa71400922babd388177f228c2c82b211"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">mlx::steel::BlockMMA::B_str_k</a></div><div class="ttdeci">STEEL_CONST short B_str_k</div><div class="ttdef"><b>Definition</b> mma.h:469</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa85451edf6900fd6af164d4d50889ae3"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">mlx::steel::BlockMMA::sm</a></div><div class="ttdeci">short sm</div><div class="ttdef"><b>Definition</b> mma.h:482</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ab9c7f5386594497f5f4df7e59670b877"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">mlx::steel::BlockMMA::A_str_m</a></div><div class="ttdeci">STEEL_CONST short A_str_m</div><div class="ttdef"><b>Definition</b> mma.h:465</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aba5f749fdf32d8bd9d9e29f2a9ae4591"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">mlx::steel::BlockMMA::TM</a></div><div class="ttdeci">STEEL_CONST short TM</div><div class="ttdef"><b>Definition</b> mma.h:460</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ade420e8b811d597345783c324c23a34a"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">mlx::steel::BlockMMA::sn</a></div><div class="ttdeci">short sn</div><div class="ttdef"><b>Definition</b> mma.h:483</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ae3f35453b3afbaac9df64ad5966b34a4"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">mlx::steel::BlockMMA::tile_stride_b</a></div><div class="ttdeci">STEEL_CONST short tile_stride_b</div><div class="ttdef"><b>Definition</b> mma.h:474</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aee8caec45c1f9e4428586effbfe6137d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">mlx::steel::BlockMMA::kFragSize</a></div><div class="ttdeci">STEEL_CONST short kFragSize</div><div class="ttdef"><b>Definition</b> mma.h:451</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_af653c0808ba4fa9a25286f1febb7baff"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">mlx::steel::BlockMMA::apply_epilogue</a></div><div class="ttdeci">METAL_FUNC void apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)</div><div class="ttdef"><b>Definition</b> mma.h:569</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_layout2_d_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_layout2_d.html">mlx::steel::Layout2D</a></div><div class="ttdef"><b>Definition</b> mma.h:31</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_layout2_d_html_a23183747ab1ddbdd3f1fcac6d0faa2cd"><div class="ttname"><a href="structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd">mlx::steel::Layout2D::shape</a></div><div class="ttdeci">Shape shape</div><div class="ttdef"><b>Definition</b> mma.h:32</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_layout2_d_html_a6beedf1677ee1b192fb48c83a29ac8a1"><div class="ttname"><a href="structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1">mlx::steel::Layout2D::layout</a></div><div class="ttdeci">Layout layout</div><div class="ttdef"><b>Definition</b> mma.h:33</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a></div><div class="ttdef"><b>Definition</b> mma.h:178</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1a6b1446e8c8da46885bbaa8e8fdc7e4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">mlx::steel::MMATile::frag_at</a></div><div class="ttdeci">METAL_FUNC constexpr thread frag_type &amp; frag_at(const short i, const short j)</div><div class="ttdef"><b>Definition</b> mma.h:256</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1d126b14910385ab644e224ac1d0307a"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">mlx::steel::MMATile::kTileRows</a></div><div class="ttdeci">STEEL_CONST int kTileRows</div><div class="ttdef"><b>Definition</b> mma.h:230</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1ea49efd92696b15302ee4b52ecd548c"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c">mlx::steel::MMATile::kColsPerThread</a></div><div class="ttdeci">STEEL_CONST int kColsPerThread</div><div class="ttdef"><b>Definition</b> mma.h:240</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1eeb197c9bdf4db42892a39cdb9bd73a"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mlx::steel::MMATile::mat_type</a></div><div class="ttdeci">MMAFrag_t::mat_type mat_type</div><div class="ttdef"><b>Definition</b> mma.h:242</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a2aadaa3239cb3af0c2ee8af9b88c8a98"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98">mlx::steel::MMATile::store</a></div><div class="ttdeci">METAL_FUNC void store(threadgroup U *dst) const</div><div class="ttdef"><b>Definition</b> mma.h:325</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a323a4f38cd0693bf333832bb4258b28e"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mlx::steel::MMATile::mat_at</a></div><div class="ttdeci">METAL_FUNC mat_type mat_at(const short i, const short j)</div><div class="ttdef"><b>Definition</b> mma.h:266</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a3d0d5b9c7962658cc6d5afbbbb2f19e2"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2">mlx::steel::MMATile::row_bin_op</a></div><div class="ttdeci">METAL_FUNC void row_bin_op(thread T vals[kRowsPerThread])</div><div class="ttdef"><b>Definition</b> mma.h:296</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a46324d40f8ad61cade08a1ebad6d9ad4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">mlx::steel::MMATile::kTileCols</a></div><div class="ttdeci">STEEL_CONST int kTileCols</div><div class="ttdef"><b>Definition</b> mma.h:231</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a57703f522c7409dbe2c0a68bb7acc2ba"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba">mlx::steel::MMATile::store_safe</a></div><div class="ttdeci">METAL_FUNC void store_safe(device U *dst, const int ld, const short2 dst_tile_dims) const</div><div class="ttdef"><b>Definition</b> mma.h:393</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a594142f957ffb99296a243f7af7b59e7"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">mlx::steel::MMATile::kFragRows</a></div><div class="ttdeci">STEEL_CONST int kFragRows</div><div class="ttdef"><b>Definition</b> mma.h:226</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a5b1d1c85a5046108a4e38bdc5a0ea74e"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">mlx::steel::MMATile::kRowsPerThread</a></div><div class="ttdeci">STEEL_CONST int kRowsPerThread</div><div class="ttdef"><b>Definition</b> mma.h:239</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a60ea6b8ff2923b7fe6f598e74ac54323"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">mlx::steel::MMATile::kRows</a></div><div class="ttdeci">STEEL_CONST int kRows</div><div class="ttdef"><b>Definition</b> mma.h:233</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a684e6c6d9f00f583994285b60aaa3b62"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">mlx::steel::MMATile::val_frags</a></div><div class="ttdeci">frag_type val_frags[kNumFrags]</div><div class="ttdef"><b>Definition</b> mma.h:245</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a6dadcd666afb3759a11094e754560dd4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">mlx::steel::MMATile::MMAFrag_t</a></div><div class="ttdeci">MMAFrag_ MMAFrag_t</div><div class="ttdef"><b>Definition</b> mma.h:224</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a752f708e4fe5ef37fdd902dae153179f"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f">mlx::steel::MMATile::store</a></div><div class="ttdeci">METAL_FUNC void store(device U *dst, const int ld) const</div><div class="ttdef"><b>Definition</b> mma.h:357</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a824409bc107330805853f932e80a7628"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">mlx::steel::MMATile::elem_type</a></div><div class="ttdeci">T elem_type</div><div class="ttdef"><b>Definition</b> mma.h:225</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a865ece5ad0b9a56937b6d77a18b5a1dc"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">mlx::steel::MMATile::elems</a></div><div class="ttdeci">METAL_FUNC thread elem_type * elems()</div><div class="ttdef"><b>Definition</b> mma.h:275</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a948784652e93830887ee8ad506ec3257"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">mlx::steel::MMATile::kCols</a></div><div class="ttdeci">STEEL_CONST int kCols</div><div class="ttdef"><b>Definition</b> mma.h:234</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a98357339ec98f804a1b12597937b318f"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">mlx::steel::MMATile::kElemsPerTile</a></div><div class="ttdeci">STEEL_CONST int kElemsPerTile</div><div class="ttdef"><b>Definition</b> mma.h:237</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa0ad5cb750ace934bf230385d8bd9f88"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88">mlx::steel::MMATile::row_reduce</a></div><div class="ttdeci">METAL_FUNC void row_reduce(thread T vals[kRowsPerThread]) const</div><div class="ttdef"><b>Definition</b> mma.h:284</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa3a4af67813908109da08ce7352f82da"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">mlx::steel::MMATile::load_safe</a></div><div class="ttdeci">METAL_FUNC void load_safe(const device U *src, const int ld, const short2 src_tile_dims)</div><div class="ttdef"><b>Definition</b> mma.h:373</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa3fb310dd08ec23c334511f7b316d1b6"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">mlx::steel::MMATile::MMATile</a></div><div class="ttdeci">METAL_FUNC MMATile() thread</div><div class="ttdef"><b>Definition</b> mma.h:247</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa5426c6beabfb3ee41b58f01b3392a96"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">mlx::steel::MMATile::load</a></div><div class="ttdeci">METAL_FUNC void load(const threadgroup U *src)</div><div class="ttdef"><b>Definition</b> mma.h:308</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa97a98e423827a889c13a92217626ec7"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">mlx::steel::MMATile::clear</a></div><div class="ttdeci">METAL_FUNC constexpr void clear()</div><div class="ttdef"><b>Definition</b> mma.h:249</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa9e484d8cae936503898d5b772c573f9"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9">mlx::steel::MMATile::load</a></div><div class="ttdeci">METAL_FUNC void load(const device U *src, const int ld)</div><div class="ttdef"><b>Definition</b> mma.h:342</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aac25cd0a9bdf24aa2af809c95f0bd171"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">mlx::steel::MMATile::frag_type</a></div><div class="ttdeci">MMAFrag_t::frag_type frag_type</div><div class="ttdef"><b>Definition</b> mma.h:243</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ad095371db98e7c335ec41ca77c10f906"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">mlx::steel::MMATile::kFragCols</a></div><div class="ttdeci">STEEL_CONST int kFragCols</div><div class="ttdef"><b>Definition</b> mma.h:227</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ad476e1d9a12178fb35c207312339e485"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">mlx::steel::MMATile::frag_at</a></div><div class="ttdeci">METAL_FUNC constexpr const thread frag_type &amp; frag_at(const short i, const short j) const</div><div class="ttdef"><b>Definition</b> mma.h:260</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ae21bb7cce701290de84c6015e064d8a1"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">mlx::steel::MMATile::elems</a></div><div class="ttdeci">METAL_FUNC const thread elem_type * elems() const</div><div class="ttdef"><b>Definition</b> mma.h:279</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ae326e7693eb77c22d5a6e3e9219019d3"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">mlx::steel::MMATile::kNumFrags</a></div><div class="ttdeci">STEEL_CONST int kNumFrags</div><div class="ttdef"><b>Definition</b> mma.h:236</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aef0ea2387e1ff5767bff8563b2d36bd6"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">mlx::steel::MMATile::kElemsPerFrag</a></div><div class="ttdeci">STEEL_CONST int kElemsPerFrag</div><div class="ttdef"><b>Definition</b> mma.h:228</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_shape2_d_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D</a></div><div class="ttdef"><b>Definition</b> mma.h:23</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_shape2_d_html_a070ce70eb6d84361c7f313159c438a5c"><div class="ttname"><a href="structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c">mlx::steel::Shape2D::Shape2D</a></div><div class="ttdeci">Shape2D(RInt r_, CInt c_)</div><div class="ttdef"><b>Definition</b> mma.h:27</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_shape2_d_html_a6e9e8d56782fc8772bc432c7f58393fe"><div class="ttname"><a href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe">mlx::steel::Shape2D::r</a></div><div class="ttdeci">RInt r</div><div class="ttdef"><b>Definition</b> mma.h:24</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_shape2_d_html_ae51347b2131647f2ed735ed43840d26e"><div class="ttname"><a href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e">mlx::steel::Shape2D::c</a></div><div class="ttdeci">CInt c</div><div class="ttdef"><b>Definition</b> mma.h:25</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1integral__constant_html"><div class="ttname"><a href="structmlx_1_1steel_1_1integral__constant.html">mlx::steel::integral_constant</a></div><div class="ttdef"><b>Definition</b> integral_constant.h:18</div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/attn_2params_8h.html b/docs/build/html/attn_2params_8h.html
new file mode 100644
index 000000000..e7fe51aab
--- /dev/null
+++ b/docs/build/html/attn_2params_8h.html
@@ -0,0 +1,119 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/params.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#nested-classes">Classes</a> &#124;
+<a href="#namespaces">Namespaces</a>  </div>
+  <div class="headertitle"><div class="title">params.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><a href="attn_2params_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
+Classes</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="namespaces" name="namespaces"></a>
+Namespaces</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx.html">mlx</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/attn_2params_8h_source.html b/docs/build/html/attn_2params_8h_source.html
new file mode 100644
index 000000000..b7657a3ee
--- /dev/null
+++ b/docs/build/html/attn_2params_8h_source.html
@@ -0,0 +1,154 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/params.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">params.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="attn_2params_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="comment">// Attn param classes</span></div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span> </div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx.html">mlx</a> {</div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="keyword">namespace </span>steel {</div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span> </div>
+<div class="foldopen" id="foldopen00012" data-start="{" data-end="};">
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html">   12</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a> {</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f">   13</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f">B</a>; </div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7">   14</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7">H</a>; </div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3">   15</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3">D</a>; </div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span> </div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f">   17</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f">qL</a>; </div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63">   18</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63">kL</a>; </div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span> </div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841">   20</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841">gqa_factor</a>; </div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826">   21</a></span>  <span class="keywordtype">float</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826">scale</a>; </div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span> </div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1">   23</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1">NQ</a>; </div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e">   24</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e">NK</a>; </div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span> </div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">   26</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">NQ_aligned</a>; </div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58">   27</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58">NK_aligned</a>; </div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563">   29</a></span>  <span class="keywordtype">size_t</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563">Q_strides</a>[3]; </div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974">   30</a></span>  <span class="keywordtype">size_t</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974">K_strides</a>[3]; </div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c">   31</a></span>  <span class="keywordtype">size_t</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c">V_strides</a>[3]; </div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4">   32</a></span>  <span class="keywordtype">size_t</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4">O_strides</a>[3]; </div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>};</div>
+</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span> </div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>} <span class="comment">// namespace steel</span></div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>} <span class="comment">// namespace mlx</span></div>
+<div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></div><div class="ttdef"><b>Definition</b> params.h:12</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a03e5480d1cca6af541be54a8720e9974"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974">mlx::steel::AttnParams::K_strides</a></div><div class="ttdeci">size_t K_strides[3]</div><div class="ttdoc">Key strides (B, H, L, D = 1)</div><div class="ttdef"><b>Definition</b> params.h:30</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a07ae31628e43e09bce533c7682c8dae3"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3">mlx::steel::AttnParams::D</a></div><div class="ttdeci">int D</div><div class="ttdoc">Head Dim.</div><div class="ttdef"><b>Definition</b> params.h:15</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a1cba7fedbd02e157922619195997cf4f"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f">mlx::steel::AttnParams::B</a></div><div class="ttdeci">int B</div><div class="ttdoc">Batch Size.</div><div class="ttdef"><b>Definition</b> params.h:13</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a33dc7fc22d2604a73af9f94eeea45bb4"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4">mlx::steel::AttnParams::O_strides</a></div><div class="ttdeci">size_t O_strides[3]</div><div class="ttdoc">Output strides (B, H, L, D = 1)</div><div class="ttdef"><b>Definition</b> params.h:32</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a3b3e18cb993ab24819c852bc64288841"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841">mlx::steel::AttnParams::gqa_factor</a></div><div class="ttdeci">int gqa_factor</div><div class="ttdoc">Group Query factor.</div><div class="ttdef"><b>Definition</b> params.h:20</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a3d286a0c27bace6016ed7a87f43291b7"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7">mlx::steel::AttnParams::H</a></div><div class="ttdeci">int H</div><div class="ttdoc">Heads.</div><div class="ttdef"><b>Definition</b> params.h:14</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a48575afc94ab9ff74deaba61464e57a1"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1">mlx::steel::AttnParams::NQ</a></div><div class="ttdeci">int NQ</div><div class="ttdoc">Number of query blocks.</div><div class="ttdef"><b>Definition</b> params.h:23</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a497b7404bcd25b535c3589c61f269f63"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63">mlx::steel::AttnParams::kL</a></div><div class="ttdeci">int kL</div><div class="ttdoc">Key Sequence Length.</div><div class="ttdef"><b>Definition</b> params.h:18</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a4cfd2ccb0fd7eb81c2a781a0614fdcbe"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">mlx::steel::AttnParams::NQ_aligned</a></div><div class="ttdeci">int NQ_aligned</div><div class="ttdoc">Number of full query blocks.</div><div class="ttdef"><b>Definition</b> params.h:26</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a59255882cbd78bb6f15e704e3a356a7f"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f">mlx::steel::AttnParams::qL</a></div><div class="ttdeci">int qL</div><div class="ttdoc">Query Sequence Length.</div><div class="ttdef"><b>Definition</b> params.h:17</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a68a66e3fafa922dcfd1ab1f6bdc2375e"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e">mlx::steel::AttnParams::NK</a></div><div class="ttdeci">int NK</div><div class="ttdoc">Number of key/value blocks.</div><div class="ttdef"><b>Definition</b> params.h:24</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_a90bba215328201a37eb1c430ce9f8563"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563">mlx::steel::AttnParams::Q_strides</a></div><div class="ttdeci">size_t Q_strides[3]</div><div class="ttdoc">Query strides (B, H, L, D = 1)</div><div class="ttdef"><b>Definition</b> params.h:29</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_aaf953954274794cfcb4e35e82d681b58"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58">mlx::steel::AttnParams::NK_aligned</a></div><div class="ttdeci">int NK_aligned</div><div class="ttdoc">Number of full key/value blocks.</div><div class="ttdef"><b>Definition</b> params.h:27</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_acc4860c3ce09c7230b470182ed002d3c"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c">mlx::steel::AttnParams::V_strides</a></div><div class="ttdeci">size_t V_strides[3]</div><div class="ttdoc">Value strides (B, H, L, D = 1)</div><div class="ttdef"><b>Definition</b> params.h:31</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html_ad81bcd32e6ff8fec0000eca505fb6826"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826">mlx::steel::AttnParams::scale</a></div><div class="ttdeci">float scale</div><div class="ttdoc">Attention scale.</div><div class="ttdef"><b>Definition</b> params.h:21</div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/attn_8h.html b/docs/build/html/attn_8h.html
new file mode 100644
index 000000000..e7a1ad19a
--- /dev/null
+++ b/docs/build/html/attn_8h.html
@@ -0,0 +1,127 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/attn.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#nested-classes">Classes</a> &#124;
+<a href="#namespaces">Namespaces</a>  </div>
+  <div class="headertitle"><div class="title">attn.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<div class="textblock"><code>#include &quot;<a class="el" href="attn_2loader_8h_source.html">mlx/backend/metal/kernels/steel/attn/loader.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="attn_2mma_8h_source.html">mlx/backend/metal/kernels/steel/attn/mma.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="attn_2params_8h_source.html">mlx/backend/metal/kernels/steel/attn/params.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">mlx/backend/metal/kernels/steel/attn/transforms.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="gemm_2params_8h_source.html">mlx/backend/metal/kernels/steel/gemm/params.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2utils_8h_source.html">mlx/backend/metal/kernels/steel/utils.h</a>&quot;</code><br />
+</div>
+<p><a href="attn_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
+Classes</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">mlx::steel::LoopAlignment&lt; M_aligned, N_aligned, K_aligned &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="namespaces" name="namespaces"></a>
+Namespaces</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx.html">mlx</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/attn_8h_source.html b/docs/build/html/attn_8h_source.html
new file mode 100644
index 000000000..c8e033f33
--- /dev/null
+++ b/docs/build/html/attn_8h_source.html
@@ -0,0 +1,424 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/attn.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">attn.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="attn_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
+<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &quot;<a class="code" href="attn_2loader_8h.html">mlx/backend/metal/kernels/steel/attn/loader.h</a>&quot;</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="preprocessor">#include &quot;<a class="code" href="attn_2mma_8h.html">mlx/backend/metal/kernels/steel/attn/mma.h</a>&quot;</span></div>
+<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="preprocessor">#include &quot;<a class="code" href="attn_2params_8h.html">mlx/backend/metal/kernels/steel/attn/params.h</a>&quot;</span></div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html">mlx/backend/metal/kernels/steel/attn/transforms.h</a>&quot;</span></div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="preprocessor">#include &quot;<a class="code" href="gemm_2params_8h.html">mlx/backend/metal/kernels/steel/gemm/params.h</a>&quot;</span></div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2steel_2utils_8h.html">mlx/backend/metal/kernels/steel/utils.h</a>&quot;</span></div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span> </div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><span class="keyword">using namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a>;</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span> </div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span><span class="comment">// GEMM kernel class</span></div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span> </div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx.html">mlx</a> {</div>
+<div class="foldopen" id="foldopen00019" data-start="{" data-end="}">
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno"><a class="line" href="namespacemlx_1_1steel.html">   19</a></span><span class="keyword">namespace </span>steel {</div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span> </div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span><span class="keyword">template</span> &lt;<span class="keywordtype">bool</span> M_aligned, <span class="keywordtype">bool</span> N_aligned, <span class="keywordtype">bool</span> K_aligned&gt;</div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a> {};</div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span> </div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>    <span class="keywordtype">int</span> BM,</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>    <span class="keywordtype">int</span> BN,</div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>    <span class="keywordtype">int</span> BK,</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>    <span class="keywordtype">int</span> WM,</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>    <span class="keywordtype">int</span> WN,</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    <span class="keywordtype">bool</span> transpose_a,</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>    <span class="keywordtype">bool</span> transpose_b,</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    <span class="keywordtype">bool</span> MN_aligned,</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>    <span class="keywordtype">bool</span> K_aligned,</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>    <span class="keyword">typename</span> AccumType = <span class="keyword">typename</span> <a class="code hl_typedef" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">AccumHelper&lt;T&gt;::accum_type</a>,</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>    <span class="keyword">typename</span> Epilogue = <a class="code hl_struct" href="structmlx_1_1steel_1_1_transform_none.html">TransformNone&lt;U, AccumType&gt;</a>&gt;</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">GEMMKernel</a> {</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">   39</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> = 16 / <span class="keyword">sizeof</span>(T);</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">   40</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> = 16 / <span class="keyword">sizeof</span>(T);</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">   41</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">tgp_mem_size_a</a> =</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>      transpose_a ? BK * (BM + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>) : BM * (BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>);</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">   43</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">tgp_mem_size_b</a> =</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>      transpose_b ? BN * (BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>) : BK * (BN + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>);</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">   45</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">tgp_mem_size</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">tgp_mem_size_a</a> + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">tgp_mem_size_b</a>;</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span> </div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">   47</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a> = WM * WN * 32;</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span> </div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a98b6ec692580510081e2aa887a61944b">   49</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_a_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt;</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>      T,</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>      transpose_a ? BK : BM,</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>      transpose_a ? BM : BK,</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>      transpose_a ? BM + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> : BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>,</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>      !transpose_a,</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a>&gt;;</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1a115d5af0fb6e260165adba2e377635">   56</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_b_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt;</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>      T,</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>      transpose_b ? BN : BK,</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>      transpose_b ? BK : BN,</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>      transpose_b ? BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> : BN + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>,</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>      transpose_b,</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a>&gt;;</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ae52eb09c9478cd4f199662346ac0c83e">   63</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">mma_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a>&lt;</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>      T,</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>      U,</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>      BM,</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>      BN,</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>      BK,</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>      WM,</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>      WN,</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>      transpose_a,</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>      transpose_b,</div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>      transpose_a ? BM + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> : BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>,</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>      transpose_b ? BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> : BN + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>,</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>      AccumType,</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>      Epilogue&gt;;</div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span> </div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>  <span class="comment">/* Main kernel function */</span></div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>  <span class="keyword">template</span> &lt;<span class="keywordtype">bool</span> M_aligned, <span class="keywordtype">bool</span> N_aligned, <span class="keywordtype">bool</span> K_aligned_&gt;</div>
+<div class="foldopen" id="foldopen00080" data-start="{" data-end="}">
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">   80</a></span>  <span class="keyword">static</span> METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop</a>(</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>      threadgroup T* As [[threadgroup(0)]],</div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>      threadgroup T* Bs [[threadgroup(1)]],</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> gemm_k_iterations,</div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>      thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_a_t</a>&amp; loader_a,</div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>      thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_b_t</a>&amp; loader_b,</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>      thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">mma_t</a>&amp; mma_op,</div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>      thread <span class="keyword">const</span> <span class="keywordtype">short</span>&amp; tgp_bm,</div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>      thread <span class="keyword">const</span> <span class="keywordtype">short</span>&amp; tgp_bn,</div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>      thread <span class="keyword">const</span> <span class="keywordtype">short</span>&amp; lbk,</div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>      <a class="code hl_struct" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment&lt;M_aligned, N_aligned, K_aligned_&gt;</a> l = {}) {</div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>    <span class="comment">// Appease the compiler</span></div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>    (void)l;</div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span> </div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>    short2 tile_dims_A = transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span> </div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>    short2 tile_dims_B = transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span> </div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; gemm_k_iterations; k++) {</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>      <span class="comment">// Load elements into threadgroup</span></div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>      <span class="keywordflow">if</span> (M_aligned) {</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>        loader_a.load_unsafe();</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>      } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>        loader_a.load_safe(tile_dims_A);</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>      }</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span> </div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>      <span class="keywordflow">if</span> (N_aligned) {</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>        loader_b.load_unsafe();</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>      } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>        loader_b.load_safe(tile_dims_B);</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>      }</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span> </div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span> </div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>      <span class="comment">// Multiply and accumulate threadgroup elements</span></div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>      mma_op.mma(As, Bs);</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span> </div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>      <span class="comment">// Prepare for next iteration</span></div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>      loader_a.next();</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>      loader_b.next();</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    }</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span> </div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    <span class="keywordflow">if</span> (!K_aligned_) {</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span> </div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>      short2 tile_dims_A_last =</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>          transpose_a ? short2(tgp_bm, lbk) : short2(lbk, tgp_bm);</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>      short2 tile_dims_B_last =</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>          transpose_b ? short2(lbk, tgp_bn) : short2(tgp_bn, lbk);</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span> </div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>      loader_a.load_safe(tile_dims_A_last);</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>      loader_b.load_safe(tile_dims_B_last);</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span> </div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span> </div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>      mma_op.mma(As, Bs);</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>    }</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  }</div>
+</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span> </div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>  <span class="comment">/* Main kernel function */</span></div>
+<div class="foldopen" id="foldopen00141" data-start="{" data-end="}">
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">  141</a></span>  <span class="keyword">static</span> METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">run</a>(</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>      <span class="keyword">const</span> device T* A [[buffer(0)]],</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>      <span class="keyword">const</span> device T* B [[buffer(1)]],</div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>      device U* D [[buffer(2)]],</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>      <span class="keyword">const</span> constant <a class="code hl_struct" href="structmlx_1_1steel_1_1_g_e_m_m_params.html">GEMMParams</a>* params [[buffer(3)]],</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>      threadgroup T* As [[threadgroup(0)]],</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>      threadgroup T* Bs [[threadgroup(1)]],</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>      uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>      uint simd_group_id [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>      uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>      uint3 lid [[thread_position_in_threadgroup]]) {</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    <span class="comment">// Pacifying compiler</span></div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    (void)lid;</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span> </div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> tid_y = ((tid.y) &lt;&lt; params-&gt;swizzle_log) +</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>        ((tid.x) &amp; ((1 &lt;&lt; params-&gt;swizzle_log) - 1));</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> tid_x = (tid.x) &gt;&gt; params-&gt;swizzle_log;</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span> </div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    <span class="keywordflow">if</span> (params-&gt;tiles_n &lt;= tid_x || params-&gt;tiles_m &lt;= tid_y) {</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    }</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span> </div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    threadgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span> </div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>    <span class="comment">// Find block in A, B, C</span></div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> c_row = tid_y * BM;</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> c_col = tid_x * BN;</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    <span class="keyword">const</span> <span class="keywordtype">size_t</span> c_row_long = size_t(c_row);</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    <span class="keyword">const</span> <span class="keywordtype">size_t</span> c_col_long = size_t(c_col);</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span> </div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>    A += transpose_a ? c_row_long : c_row_long * params-&gt;lda;</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>    B += transpose_b ? c_col_long * params-&gt;ldb : c_col_long;</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    D += c_row_long * params-&gt;ldd + c_col_long;</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span> </div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>    <span class="comment">// Prepare threadgroup loading operations</span></div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_a_t</a> loader_a(A, params-&gt;lda, As, simd_group_id, simd_lane_id);</div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_b_t</a> loader_b(B, params-&gt;ldb, Bs, simd_group_id, simd_lane_id);</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span> </div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>    <span class="comment">// Prepare threadgroup mma operation</span></div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">mma_t</a> mma_op(simd_group_id, simd_lane_id);</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span> </div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>    <span class="keywordtype">int</span> gemm_k_iterations = params-&gt;gemm_k_iterations_aligned;</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span> </div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>    <span class="comment">// MNK aligned loop</span></div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>    <span class="keywordflow">if</span> (MN_aligned) {</div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; gemm_k_iterations; k++) {</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>        <span class="comment">// Load elements into threadgroup</span></div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>        loader_a.load_unsafe();</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>        loader_b.load_unsafe();</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span> </div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span> </div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>        <span class="comment">// Multiply and accumulate threadgroup elements</span></div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>        mma_op.mma(As, Bs);</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span> </div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>        <span class="comment">// Prepare for next iteration</span></div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>        loader_a.next();</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>        loader_b.next();</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>      }</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span> </div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>      threadgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span> </div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      <span class="comment">// Loop tail</span></div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>      <span class="keywordflow">if</span> (!K_aligned) {</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>        <span class="keywordtype">int</span> lbk = params-&gt;K - params-&gt;gemm_k_iterations_aligned * BK;</div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>        short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>        short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span> </div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>        loader_a.load_safe(tile_dims_A);</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>        loader_b.load_safe(tile_dims_B);</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span> </div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span> </div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>        mma_op.mma(As, Bs);</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>      }</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span> </div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>      <span class="comment">// Store results to device memory</span></div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>      mma_op.store_result(D, params-&gt;ldd);</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span> </div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>    }</div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>    <span class="comment">// MN unaligned loop</span></div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    <span class="keywordflow">else</span> { <span class="comment">// Loop over K - unaligned case</span></div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>      <span class="keywordtype">short</span> tgp_bm = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BM, params-&gt;M - c_row);</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>      <span class="keywordtype">short</span> tgp_bn = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BN, params-&gt;N - c_col);</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>      <span class="keywordtype">short</span> leftover_bk = params-&gt;K - params-&gt;gemm_k_iterations_aligned * BK;</div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span> </div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>      <span class="keywordflow">if</span> (tgp_bm == BM &amp;&amp; tgp_bn == BN) {</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>        <a class="code hl_function" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop&lt;true, true, K_aligned&gt;</a>(</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>            As,</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>            Bs,</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>            gemm_k_iterations,</div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>            loader_a,</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>            loader_b,</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>            mma_op,</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>            tgp_bm,</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>            tgp_bn,</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>            leftover_bk);</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span> </div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>        mma_op.store_result(D, params-&gt;ldd);</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>        <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span> </div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>      } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (tgp_bn == BN) {</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>        <a class="code hl_function" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop&lt;false, true, K_aligned&gt;</a>(</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>            As,</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>            Bs,</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>            gemm_k_iterations,</div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>            loader_a,</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>            loader_b,</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>            mma_op,</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>            tgp_bm,</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>            tgp_bn,</div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>            leftover_bk);</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span> </div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>        mma_op.store_result_safe(D, params-&gt;ldd, short2(tgp_bn, tgp_bm));</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>        <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span> </div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>      } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (tgp_bm == BM) {</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>        <a class="code hl_function" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop&lt;true, false, K_aligned&gt;</a>(</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>            As,</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>            Bs,</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>            gemm_k_iterations,</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>            loader_a,</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>            loader_b,</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>            mma_op,</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>            tgp_bm,</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>            tgp_bn,</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>            leftover_bk);</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span> </div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>        mma_op.store_result_safe(D, params-&gt;ldd, short2(tgp_bn, tgp_bm));</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>        <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span> </div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>      } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>        <a class="code hl_function" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop&lt;false, false, K_aligned&gt;</a>(</div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>            As,</div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>            Bs,</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>            gemm_k_iterations,</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>            loader_a,</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>            loader_b,</div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>            mma_op,</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>            tgp_bm,</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>            tgp_bn,</div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>            leftover_bk);</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span> </div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>        mma_op.store_result_safe(D, params-&gt;ldd, short2(tgp_bn, tgp_bm));</div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>        <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>      }</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>    }</div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>  }</div>
+</div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>};</div>
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span> </div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>} <span class="comment">// namespace steel</span></div>
+</div>
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>} <span class="comment">// namespace mlx</span></div>
+<div class="ttc" id="aattn_2loader_8h_html"><div class="ttname"><a href="attn_2loader_8h.html">loader.h</a></div></div>
+<div class="ttc" id="aattn_2mma_8h_html"><div class="ttname"><a href="attn_2mma_8h.html">mma.h</a></div></div>
+<div class="ttc" id="aattn_2params_8h_html"><div class="ttname"><a href="attn_2params_8h.html">params.h</a></div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2steel_2attn_2transforms_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html">transforms.h</a></div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html">utils.h</a></div></div>
+<div class="ttc" id="agemm_2params_8h_html"><div class="ttname"><a href="gemm_2params_8h.html">params.h</a></div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
+<div class="ttc" id="asteel_2defines_8h_html_a90b91c866313ffa46eff6d9cc944ad2b"><div class="ttname"><a href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a></div><div class="ttdeci">#define STEEL_CONST</div><div class="ttdef"><b>Definition</b> defines.h:3</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_accum_helper_html_ae52abf69e7ba6af1a73d65d57182ed26"><div class="ttname"><a href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">mlx::steel::AccumHelper::accum_type</a></div><div class="ttdeci">float accum_type</div><div class="ttdef"><b>Definition</b> transforms.h:57</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a></div><div class="ttdef"><b>Definition</b> loader.h:25</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a></div><div class="ttdef"><b>Definition</b> mma.h:377</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a></div><div class="ttdef"><b>Definition</b> gemm.h:37</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a00e55d4a161758350ed7310817d2d2a5"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">mlx::steel::GEMMKernel::run</a></div><div class="ttdeci">static METAL_FUNC void run(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</div><div class="ttdef"><b>Definition</b> attn.h:141</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a105af1069668028c6f1bc6d6dd162298"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">mlx::steel::GEMMKernel::tgp_mem_size_b</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size_b</div><div class="ttdef"><b>Definition</b> attn.h:43</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a1ec583584e69dcbbb72106390a4fc5da"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">mlx::steel::GEMMKernel::tgp_mem_size</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size</div><div class="ttdef"><b>Definition</b> attn.h:45</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a756d7bbcc96e2919cd65eec4bc135780"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">mlx::steel::GEMMKernel::gemm_loop</a></div><div class="ttdeci">static METAL_FUNC void gemm_loop(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})</div><div class="ttdef"><b>Definition</b> attn.h:80</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a9058ddb73e30e83fb9c548ba22817d64"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">mlx::steel::GEMMKernel::tgp_size</a></div><div class="ttdeci">STEEL_CONST short tgp_size</div><div class="ttdef"><b>Definition</b> attn.h:47</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ac00b149d76a903c2f91b0f477dc5037f"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">mlx::steel::GEMMKernel::tgp_mem_size_a</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size_a</div><div class="ttdef"><b>Definition</b> attn.h:41</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad1b03941e869017558423c08b08bc094"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">mlx::steel::GEMMKernel::tgp_padding_b</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_b</div><div class="ttdef"><b>Definition</b> attn.h:40</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad547704ccbff6c2076abeffa6628c5a0"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">mlx::steel::GEMMKernel::tgp_padding_a</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_a</div><div class="ttdef"><b>Definition</b> attn.h:39</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_params_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_params.html">mlx::steel::GEMMParams</a></div><div class="ttdef"><b>Definition</b> params.h:12</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_loop_alignment_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_loop_alignment.html">mlx::steel::LoopAlignment</a></div><div class="ttdef"><b>Definition</b> gemm.h:21</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_none_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone</a></div><div class="ttdef"><b>Definition</b> transforms.h:15</div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/backend_2common_2utils_8h.html b/docs/build/html/backend_2common_2utils_8h.html
index e5c00a23f..f09f54813 100644
--- a/docs/build/html/backend_2common_2utils_8h.html
+++ b/docs/build/html/backend_2common_2utils_8h.html
@@ -142,6 +142,10 @@ Functions</h2></td></tr>
 <tr class="separator:a3ba20a804c306067b7023259429e0e48"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:af650e831ce21759da1ac103037d08d84" id="r_af650e831ce21759da1ac103037d08d84"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#af650e831ce21759da1ac103037d08d84">mlx::core::is_donatable</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out)</td></tr>
 <tr class="separator:af650e831ce21759da1ac103037d08d84"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a830a47d8a317dffb0c88e5a7afe6aee2" id="r_a830a47d8a317dffb0c88e5a7afe6aee2"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2">mlx::core::move_or_copy</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out)</td></tr>
+<tr class="separator:a830a47d8a317dffb0c88e5a7afe6aee2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aae1e770954edf1f9a35d19e0de4d857a" id="r_aae1e770954edf1f9a35d19e0de4d857a"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#aae1e770954edf1f9a35d19e0de4d857a">mlx::core::move_or_copy</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::vector&lt; size_t &gt; &amp;strides, <a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html">array::Flags</a> flags, size_t data_size, size_t offset=0)</td></tr>
+<tr class="separator:aae1e770954edf1f9a35d19e0de4d857a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/backend_2common_2utils_8h_source.html b/docs/build/html/backend_2common_2utils_8h_source.html
index 0e6718d7f..be28e1484 100644
--- a/docs/build/html/backend_2common_2utils_8h_source.html
+++ b/docs/build/html/backend_2common_2utils_8h_source.html
@@ -297,7 +297,16 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>}</div>
 </div>
 <div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span> </div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>} <span class="comment">// namespace mlx::core</span></div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2">  181</a></span><span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2">move_or_copy</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aae1e770954edf1f9a35d19e0de4d857a">  182</a></span><span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2">move_or_copy</a>(</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>    <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>    <span class="keyword">const</span> std::vector&lt;size_t&gt;&amp; strides,</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>    <a class="code hl_struct" href="structmlx_1_1core_1_1array_1_1_flags.html">array::Flags</a> flags,</div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>    <span class="keywordtype">size_t</span> data_size,</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    <span class="keywordtype">size_t</span> offset = 0);</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span> </div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html_a0a20a6065ae71b64c1e3aa22a45fd8a1"><div class="ttname"><a href="classmlx_1_1core_1_1array.html#a0a20a6065ae71b64c1e3aa22a45fd8a1">mlx::core::array::flags</a></div><div class="ttdeci">const Flags &amp; flags() const</div><div class="ttdoc">Get the Flags bit-field.</div><div class="ttdef"><b>Definition</b> array.h:302</div></div>
@@ -312,6 +321,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="anamespacemlx_1_1core_html_a38fe6ec5220d13d96c7dad7556d2b613"><div class="ttname"><a href="namespacemlx_1_1core.html#a38fe6ec5220d13d96c7dad7556d2b613">mlx::core::collapse_contiguous_dims</a></div><div class="ttdeci">std::tuple&lt; std::vector&lt; int &gt;, std::vector&lt; std::vector&lt; int64_t &gt; &gt; &gt; collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; std::vector&lt; int64_t &gt; &gt; &amp;strides, int64_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a3ba20a804c306067b7023259429e0e48"><div class="ttname"><a href="namespacemlx_1_1core.html#a3ba20a804c306067b7023259429e0e48">mlx::core::check_contiguity</a></div><div class="ttdeci">auto check_contiguity(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides)</div><div class="ttdef"><b>Definition</b> utils.h:151</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a77657cb50fd9392f7f4c64e43843c2b3"><div class="ttname"><a href="namespacemlx_1_1core.html#a77657cb50fd9392f7f4c64e43843c2b3">mlx::core::elem_to_loc</a></div><div class="ttdeci">StrideT elem_to_loc(int elem, const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides)</div><div class="ttdef"><b>Definition</b> utils.h:12</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_a830a47d8a317dffb0c88e5a7afe6aee2"><div class="ttname"><a href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2">mlx::core::move_or_copy</a></div><div class="ttdeci">void move_or_copy(const array &amp;in, array &amp;out)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_af650e831ce21759da1ac103037d08d84"><div class="ttname"><a href="namespacemlx_1_1core.html#af650e831ce21759da1ac103037d08d84">mlx::core::is_donatable</a></div><div class="ttdeci">bool is_donatable(const array &amp;in, const array &amp;out)</div><div class="ttdef"><b>Definition</b> utils.h:174</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_af89751d79339f3e4d9318ea97d64d114"><div class="ttname"><a href="namespacemlx_1_1core.html#af89751d79339f3e4d9318ea97d64d114">mlx::core::enable_for_arrays_t</a></div><div class="ttdeci">typename std::enable_if_t&lt; is_arrays_v&lt; T... &gt; &gt; enable_for_arrays_t</div><div class="ttdef"><b>Definition</b> array.h:611</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1_contiguous_iterator_html"><div class="ttname"><a href="structmlx_1_1core_1_1_contiguous_iterator.html">mlx::core::ContiguousIterator</a></div><div class="ttdef"><b>Definition</b> utils.h:89</div></div>
@@ -322,6 +332,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="astructmlx_1_1core_1_1_contiguous_iterator_html_a68794af4a442d3d8ac4647817af8e1f6"><div class="ttname"><a href="structmlx_1_1core_1_1_contiguous_iterator.html#a68794af4a442d3d8ac4647817af8e1f6">mlx::core::ContiguousIterator::ContiguousIterator</a></div><div class="ttdeci">ContiguousIterator()</div><div class="ttdef"><b>Definition</b> utils.h:120</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1_contiguous_iterator_html_a6cb378408b6f546eeb6ade1a4faafe3c"><div class="ttname"><a href="structmlx_1_1core_1_1_contiguous_iterator.html#a6cb378408b6f546eeb6ade1a4faafe3c">mlx::core::ContiguousIterator::ContiguousIterator</a></div><div class="ttdeci">ContiguousIterator(const array &amp;a)</div><div class="ttdef"><b>Definition</b> utils.h:122</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1_contiguous_iterator_html_ae230bd52b70a0bbdf560090f8a6589ef"><div class="ttname"><a href="structmlx_1_1core_1_1_contiguous_iterator.html#ae230bd52b70a0bbdf560090f8a6589ef">mlx::core::ContiguousIterator::step</a></div><div class="ttdeci">void step()</div><div class="ttdef"><b>Definition</b> utils.h:90</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1array_1_1_flags_html"><div class="ttname"><a href="structmlx_1_1core_1_1array_1_1_flags.html">mlx::core::array::Flags</a></div><div class="ttdef"><b>Definition</b> array.h:221</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1array_1_1_flags_html_a3170fa381dc7a90f6eabcc029bdf9bfd"><div class="ttname"><a href="structmlx_1_1core_1_1array_1_1_flags.html#a3170fa381dc7a90f6eabcc029bdf9bfd">mlx::core::array::Flags::row_contiguous</a></div><div class="ttdeci">bool row_contiguous</div><div class="ttdef"><b>Definition</b> array.h:233</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/backend_2metal_2device_8h_source.html b/docs/build/html/backend_2metal_2device_8h_source.html
index bb0b3537b..7cf8c3750 100644
--- a/docs/build/html/backend_2metal_2device_8h_source.html
+++ b/docs/build/html/backend_2metal_2device_8h_source.html
@@ -160,225 +160,264 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>  };</div>
 </div>
 <div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span> </div>
-<div class="foldopen" id="foldopen00061" data-start="{" data-end="}">
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966">   61</a></span>  MTL::ComputeCommandEncoder* <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966">operator-&gt;</a>() {</div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>    <span class="keywordflow">return</span> enc_;</div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>  }</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">   61</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">set_input_array</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a, <span class="keywordtype">int</span> idx, int64_t offset = 0);</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522">   62</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522">set_output_array</a>(<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a, <span class="keywordtype">int</span> idx, int64_t offset = 0);</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d">   63</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d">dispatch_threadgroups</a>(MTL::Size grid_dims, MTL::Size group_dims);</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05">   64</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05">dispatch_threads</a>(MTL::Size grid_dims, MTL::Size group_dims);</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991">   65</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991">maybeInsertBarrier</a>();</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span> </div>
+<div class="foldopen" id="foldopen00067" data-start="{" data-end="}">
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef">   67</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef">set_compute_pipeline_state</a>(MTL::ComputePipelineState* kernel) {</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    enc_-&gt;setComputePipelineState(kernel);</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>  }</div>
 </div>
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span> </div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">   65</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">set_input_array</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a, <span class="keywordtype">int</span> idx, int64_t offset = 0);</div>
-<div class="line"><a id="l00066" name="l00066"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522">   66</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522">set_output_array</a>(<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a, <span class="keywordtype">int</span> idx, int64_t offset = 0);</div>
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e">   67</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e">dispatchThreadgroups</a>(MTL::Size grid_dims, MTL::Size group_dims);</div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810">   68</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810">dispatchThreads</a>(MTL::Size grid_dims, MTL::Size group_dims);</div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991">   69</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991">maybeInsertBarrier</a>();</div>
 <div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span> </div>
 <div class="foldopen" id="foldopen00071" data-start="{" data-end="}">
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">   71</a></span>  <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html">ConcurrentContext</a> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">start_concurrent</a>() {</div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html">ConcurrentContext</a>(*<span class="keyword">this</span>);</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088">   71</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088">wait_for_fence</a>(MTL::Fence* fence) {</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>    enc_-&gt;waitForFence(fence);</div>
 <div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  }</div>
 </div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9b6dd221ccd2d939d544004cb6279198">   74</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9b6dd221ccd2d939d544004cb6279198">~CommandEncoder</a>();</div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span> </div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>  <span class="comment">// Inputs to all kernels in the encoder including temporaries</span></div>
-<div class="foldopen" id="foldopen00077" data-start="{" data-end="}">
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509">   77</a></span>  std::unordered_set&lt;const void*&gt;&amp; <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509">inputs</a>() {</div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>    <span class="keywordflow">return</span> all_inputs_;</div>
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>  };</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span> </div>
+<div class="foldopen" id="foldopen00075" data-start="{" data-end="}">
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2">   75</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2">update_fence</a>(MTL::Fence* fence) {</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    enc_-&gt;updateFence(fence);</div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>  }</div>
 </div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span> </div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>  <span class="comment">// Outputs of all kernels in the encoder including temporaries</span></div>
-<div class="foldopen" id="foldopen00082" data-start="{" data-end="}">
-<div class="line"><a id="l00082" name="l00082"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">   82</a></span>  std::unordered_set&lt;const void*&gt; <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">outputs</a>() {</div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    <span class="keywordflow">return</span> all_outputs_;</div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>  };</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span> </div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00080" data-start="{" data-end="}">
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b">   80</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b">set_vector_bytes</a>(<span class="keyword">const</span> std::vector&lt;T&gt;&amp; vec, <span class="keywordtype">size_t</span> nelems, <span class="keywordtype">int</span> idx) {</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    enc_-&gt;setBytes(vec.data(), nelems * <span class="keyword">sizeof</span>(T), idx);</div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>  }</div>
 </div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span> </div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>  MTL::ComputeCommandEncoder* enc_;</div>
-<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>  <span class="keywordtype">bool</span> needs_barrier_{<span class="keyword">false</span>};</div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>  <span class="keywordtype">bool</span> concurrent_{<span class="keyword">false</span>};</div>
-<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>  std::unordered_set&lt;MTL::Resource*&gt; prev_outputs_;</div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>  std::unordered_set&lt;MTL::Resource*&gt; next_outputs_;</div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>  std::unordered_set&lt;MTL::Resource*&gt; concurrent_outputs_;</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>  std::unordered_set&lt;const void*&gt; all_inputs_;</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  std::unordered_set&lt;const void*&gt; all_outputs_;</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>};</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00084" data-start="{" data-end="}">
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a7375adf9ee5355bcf4b7f5f210efd115">   84</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a7375adf9ee5355bcf4b7f5f210efd115">set_vector_bytes</a>(<span class="keyword">const</span> std::vector&lt;T&gt;&amp; vec, <span class="keywordtype">int</span> idx) {</div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>    <span class="keywordflow">return</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b">set_vector_bytes</a>(vec, vec.size(), idx);</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>  }</div>
 </div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span> </div>
-<div class="foldopen" id="foldopen00097" data-start="{" data-end="};">
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html">   97</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_fence.html">Fence</a> {</div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html#a30bee4957ae595e04922952a8010fc79">   98</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_fence.html#a30bee4957ae595e04922952a8010fc79">Fence</a>(MTL::Fence* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>) : <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>(<a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>) {}</div>
-<div class="foldopen" id="foldopen00099" data-start="{" data-end="}">
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html#a4940c1aece13814af7727de9abb511f2">   99</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_fence.html#a4940c1aece13814af7727de9abb511f2">~Fence</a>() {</div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>-&gt;release();</div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>  }</div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span> </div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00089" data-start="{" data-end="}">
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5">   89</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5">set_bytes</a>(<span class="keyword">const</span> T* v, <span class="keywordtype">int</span> n, <span class="keywordtype">int</span> idx) {</div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>    <span class="keywordflow">return</span> enc_-&gt;setBytes(v, n * <span class="keyword">sizeof</span>(T), idx);</div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>  }</div>
 </div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">  102</a></span>  MTL::Fence* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>;</div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>};</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span> </div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00094" data-start="{" data-end="}">
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#abc52d18ea87d213c47fd26062c829849">   94</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#abc52d18ea87d213c47fd26062c829849">set_bytes</a>(<span class="keyword">const</span> T&amp; v, <span class="keywordtype">int</span> idx) {</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>    <span class="keywordflow">return</span> enc_-&gt;setBytes(&amp;v, <span class="keyword">sizeof</span>(T), idx);</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>  }</div>
 </div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span> </div>
-<div class="foldopen" id="foldopen00105" data-start="{" data-end="};">
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">  105</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">DeviceStream</a> {</div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7">  106</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7">DeviceStream</a>(MTL::CommandQueue* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>) : <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>(<a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>) {};</div>
-<div class="foldopen" id="foldopen00107" data-start="{" data-end="}">
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a1c4397732f64f5811381dd01e30e020e">  107</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a1c4397732f64f5811381dd01e30e020e">~DeviceStream</a>() {</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>    <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>-&gt;release();</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">buffer</a> != <span class="keyword">nullptr</span>) {</div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>      <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">buffer</a>-&gt;release();</div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>    }</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>  };</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span> </div>
+<div class="foldopen" id="foldopen00098" data-start="{" data-end="}">
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">   98</a></span>  <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html">ConcurrentContext</a> <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">start_concurrent</a>() {</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html">ConcurrentContext</a>(*<span class="keyword">this</span>);</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>  }</div>
+</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9b6dd221ccd2d939d544004cb6279198">  101</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9b6dd221ccd2d939d544004cb6279198">~CommandEncoder</a>();</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span> </div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>  <span class="comment">// Inputs to all kernels in the encoder including temporaries</span></div>
+<div class="foldopen" id="foldopen00104" data-start="{" data-end="}">
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509">  104</a></span>  std::unordered_set&lt;const void*&gt;&amp; <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509">inputs</a>() {</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    <span class="keywordflow">return</span> all_inputs_;</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>  };</div>
+</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span> </div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  <span class="comment">// Outputs of all kernels in the encoder including temporaries</span></div>
+<div class="foldopen" id="foldopen00109" data-start="{" data-end="}">
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">  109</a></span>  std::unordered_set&lt;const void*&gt; <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">outputs</a>() {</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>    <span class="keywordflow">return</span> all_outputs_;</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  };</div>
+</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span> </div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>  MTL::ComputeCommandEncoder* enc_;</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>  <span class="keywordtype">bool</span> needs_barrier_{<span class="keyword">false</span>};</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>  <span class="keywordtype">bool</span> concurrent_{<span class="keyword">false</span>};</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>  std::unordered_set&lt;MTL::Resource*&gt; prev_outputs_;</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>  std::unordered_set&lt;MTL::Resource*&gt; next_outputs_;</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  std::unordered_set&lt;MTL::Resource*&gt; concurrent_outputs_;</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  std::unordered_set&lt;const void*&gt; all_inputs_;</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  std::unordered_set&lt;const void*&gt; all_outputs_;</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>};</div>
 </div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">  113</a></span>  MTL::CommandQueue* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>;</div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>  <span class="comment">// A map of prior command encoder outputs to their corresponding fence</span></div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9">  115</a></span>  std::unordered_map&lt;const void*, std::shared_ptr&lt;Fence&gt;&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9">outputs</a>;</div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>  <span class="comment">// Used to allow thread-safe access to the outputs map</span></div>
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a6fa08cca881fc3798ae45994a11a4fcd">  117</a></span>  std::mutex <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a6fa08cca881fc3798ae45994a11a4fcd">fence_mtx</a>;</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span> </div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  <span class="comment">// The buffer and buffer op count are updated</span></div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  <span class="comment">// between command buffers</span></div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">  121</a></span>  MTL::CommandBuffer* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">buffer</a>{<span class="keyword">nullptr</span>};</div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">  122</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">buffer_ops</a>{0};</div>
 <div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span> </div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>  <span class="comment">// The command encoder, fence, and temporaries are updated between command</span></div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>  <span class="comment">// encoders</span></div>
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a58e435217b9922f882507ebf48bfbbdd">  126</a></span>  std::unique_ptr&lt;CommandEncoder&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a58e435217b9922f882507ebf48bfbbdd">encoder</a>{<span class="keyword">nullptr</span>};</div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a876199de8da1efa9a362451029638499">  127</a></span>  std::shared_ptr&lt;Fence&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a876199de8da1efa9a362451029638499">fence</a>;</div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#aee88009117dfff1ad121eabe28d5f3de">  128</a></span>  std::vector&lt;array&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#aee88009117dfff1ad121eabe28d5f3de">temporaries</a>;</div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>};</div>
+<div class="foldopen" id="foldopen00124" data-start="{" data-end="};">
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html">  124</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_fence.html">Fence</a> {</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html#a30bee4957ae595e04922952a8010fc79">  125</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_fence.html#a30bee4957ae595e04922952a8010fc79">Fence</a>(MTL::Fence* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>) : <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>(<a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>) {}</div>
+<div class="foldopen" id="foldopen00126" data-start="{" data-end="}">
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html#a4940c1aece13814af7727de9abb511f2">  126</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_fence.html#a4940c1aece13814af7727de9abb511f2">~Fence</a>() {</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>-&gt;release();</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>  }</div>
 </div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span> </div>
-<div class="foldopen" id="foldopen00131" data-start="{" data-end="};">
-<div class="line"><a id="l00131" name="l00131"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html">  131</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a> {</div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00133" name="l00133"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#ae0db74570eb4b19d8cf19774db91bfd6">  133</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#ae0db74570eb4b19d8cf19774db91bfd6">Device</a>();</div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#abf59a4addb5473f9e814e3651ba85f06">  134</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#abf59a4addb5473f9e814e3651ba85f06">Device</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp;) = <span class="keyword">delete</span>;</div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73">  135</a></span>  <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp; <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73">operator=</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp;) = <span class="keyword">delete</span>;</div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a4f39c28c6cdd1d2da1918f5871bcba6e">  136</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a4f39c28c6cdd1d2da1918f5871bcba6e">~Device</a>();</div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span> </div>
-<div class="foldopen" id="foldopen00138" data-start="{" data-end="}">
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653">  138</a></span>  MTL::Device* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653">mtl_device</a>() {</div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    <span class="keywordflow">return</span> device_;</div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>  };</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">  129</a></span>  MTL::Fence* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">fence</a>;</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>};</div>
 </div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span> </div>
-<div class="foldopen" id="foldopen00142" data-start="{" data-end="}">
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b">  142</a></span>  <span class="keyword">const</span> std::string&amp; <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b">get_architecture</a>() {</div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>    <span class="keywordflow">return</span> arch_;</div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>  }</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span> </div>
+<div class="foldopen" id="foldopen00132" data-start="{" data-end="};">
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">  132</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">DeviceStream</a> {</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7">  133</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7">DeviceStream</a>(MTL::CommandQueue* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>) : <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>(<a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>) {};</div>
+<div class="foldopen" id="foldopen00134" data-start="{" data-end="}">
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a1c4397732f64f5811381dd01e30e020e">  134</a></span>  <a class="code hl_function" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a1c4397732f64f5811381dd01e30e020e">~DeviceStream</a>() {</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>    <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>-&gt;release();</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">buffer</a> != <span class="keyword">nullptr</span>) {</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>      <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">buffer</a>-&gt;release();</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>    }</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>  };</div>
 </div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">  140</a></span>  MTL::CommandQueue* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">queue</a>;</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  <span class="comment">// A map of prior command encoder outputs to their corresponding fence</span></div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9">  142</a></span>  std::unordered_map&lt;const void*, std::shared_ptr&lt;Fence&gt;&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9">outputs</a>;</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>  <span class="comment">// Used to allow thread-safe access to the outputs map</span></div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a6fa08cca881fc3798ae45994a11a4fcd">  144</a></span>  std::mutex <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a6fa08cca881fc3798ae45994a11a4fcd">fence_mtx</a>;</div>
 <div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span> </div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67">  146</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67">new_queue</a>(<span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210">  147</a></span>  MTL::CommandBuffer* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210">get_command_buffer</a>(<span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8">  148</a></span>  <span class="keywordtype">int</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8">get_command_buffer_ops</a>(<span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a7a33d4d601423a3d3c23d5ad7072abb6">  149</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a7a33d4d601423a3d3c23d5ad7072abb6">increment_command_buffer_ops</a>(<span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c">  150</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c">commit_command_buffer</a>(<span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#affa682ef612def4890f5152f81ffb7e6">  151</a></span>  <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a>&amp; <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#affa682ef612def4890f5152f81ffb7e6">get_command_encoder</a>(<span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a60689f97347811b27e8c5ca23e0372bf">  152</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a60689f97347811b27e8c5ca23e0372bf">end_encoding</a>(<span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span> </div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a45945f2efcd242d915ffa2171e92bf9d">  154</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a45945f2efcd242d915ffa2171e92bf9d">register_library</a>(</div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>      <span class="keyword">const</span> std::string&amp; lib_name,</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>      <span class="keyword">const</span> std::string&amp; lib_path);</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>  <span class="comment">// The buffer and buffer op count are updated</span></div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>  <span class="comment">// between command buffers</span></div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">  148</a></span>  MTL::CommandBuffer* <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">buffer</a>{<span class="keyword">nullptr</span>};</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">  149</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">buffer_ops</a>{0};</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span> </div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>  <span class="comment">// The command encoder, fence, and temporaries are updated between command</span></div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>  <span class="comment">// encoders</span></div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a58e435217b9922f882507ebf48bfbbdd">  153</a></span>  std::unique_ptr&lt;CommandEncoder&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a58e435217b9922f882507ebf48bfbbdd">encoder</a>{<span class="keyword">nullptr</span>};</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a876199de8da1efa9a362451029638499">  154</a></span>  std::shared_ptr&lt;Fence&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a876199de8da1efa9a362451029638499">fence</a>;</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#aee88009117dfff1ad121eabe28d5f3de">  155</a></span>  std::vector&lt;array&gt; <a class="code hl_variable" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#aee88009117dfff1ad121eabe28d5f3de">temporaries</a>;</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>};</div>
+</div>
 <div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span> </div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>  <span class="comment">// Note, this should remain in the header so that it is not dynamically</span></div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>  <span class="comment">// linked</span></div>
-<div class="foldopen" id="foldopen00160" data-start="{" data-end="}">
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a99ff72689b7beb65ad4541391b0eeabf">  160</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a99ff72689b7beb65ad4541391b0eeabf">register_library</a>(<span class="keyword">const</span> std::string&amp; lib_name) {</div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    <span class="keywordflow">if</span> (<span class="keyword">auto</span> it = library_map_.find(lib_name); it == library_map_.end()) {</div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>      <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a45945f2efcd242d915ffa2171e92bf9d">register_library</a>(lib_name, <a class="code hl_function" href="namespacemlx_1_1core_1_1metal.html#a5fd6ba2040e53a254b9d71ae7ebd315f">get_colocated_mtllib_path</a>(lib_name));</div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    }</div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>  }</div>
+<div class="foldopen" id="foldopen00158" data-start="{" data-end="};">
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html">  158</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a> {</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#ae0db74570eb4b19d8cf19774db91bfd6">  160</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#ae0db74570eb4b19d8cf19774db91bfd6">Device</a>();</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#abf59a4addb5473f9e814e3651ba85f06">  161</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#abf59a4addb5473f9e814e3651ba85f06">Device</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp;) = <span class="keyword">delete</span>;</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73">  162</a></span>  <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp; <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73">operator=</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp;) = <span class="keyword">delete</span>;</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a4f39c28c6cdd1d2da1918f5871bcba6e">  163</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a4f39c28c6cdd1d2da1918f5871bcba6e">~Device</a>();</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span> </div>
+<div class="foldopen" id="foldopen00165" data-start="{" data-end="}">
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653">  165</a></span>  MTL::Device* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653">mtl_device</a>() {</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keywordflow">return</span> device_;</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>  };</div>
+</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span> </div>
+<div class="foldopen" id="foldopen00169" data-start="{" data-end="}">
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b">  169</a></span>  <span class="keyword">const</span> std::string&amp; <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b">get_architecture</a>() {</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>    <span class="keywordflow">return</span> arch_;</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>  }</div>
+</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span> </div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67">  173</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67">new_queue</a>(<span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210">  174</a></span>  MTL::CommandBuffer* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210">get_command_buffer</a>(<span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8">  175</a></span>  <span class="keywordtype">int</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8">get_command_buffer_ops</a>(<span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a7a33d4d601423a3d3c23d5ad7072abb6">  176</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a7a33d4d601423a3d3c23d5ad7072abb6">increment_command_buffer_ops</a>(<span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c">  177</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c">commit_command_buffer</a>(<span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#affa682ef612def4890f5152f81ffb7e6">  178</a></span>  <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a>&amp; <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#affa682ef612def4890f5152f81ffb7e6">get_command_encoder</a>(<span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a60689f97347811b27e8c5ca23e0372bf">  179</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a60689f97347811b27e8c5ca23e0372bf">end_encoding</a>(<span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span> </div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a45945f2efcd242d915ffa2171e92bf9d">  181</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a45945f2efcd242d915ffa2171e92bf9d">register_library</a>(</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>      <span class="keyword">const</span> std::string&amp; lib_name,</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>      <span class="keyword">const</span> std::string&amp; lib_path);</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span> </div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>  <span class="comment">// Note, this should remain in the header so that it is not dynamically</span></div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>  <span class="comment">// linked</span></div>
+<div class="foldopen" id="foldopen00187" data-start="{" data-end="}">
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a99ff72689b7beb65ad4541391b0eeabf">  187</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a99ff72689b7beb65ad4541391b0eeabf">register_library</a>(<span class="keyword">const</span> std::string&amp; lib_name) {</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    <span class="keywordflow">if</span> (<span class="keyword">auto</span> it = library_map_.find(lib_name); it == library_map_.end()) {</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>      <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a45945f2efcd242d915ffa2171e92bf9d">register_library</a>(lib_name, <a class="code hl_function" href="namespacemlx_1_1core_1_1metal.html#a5fd6ba2040e53a254b9d71ae7ebd315f">get_colocated_mtllib_path</a>(lib_name));</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>    }</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>  }</div>
 </div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span> </div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a75ed55e73baf48013028796518723ff0">  166</a></span>  MTL::Library* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a75ed55e73baf48013028796518723ff0">get_library</a>(</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>      <span class="keyword">const</span> std::string&amp; name,</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>      <span class="keyword">const</span> std::function&lt;std::string(<span class="keywordtype">void</span>)&gt;&amp; builder);</div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span> </div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a6810c4dcbcfbf93fc51d42aa5ff0fc3a">  170</a></span>  MTL::ComputePipelineState* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a6810c4dcbcfbf93fc51d42aa5ff0fc3a">get_kernel</a>(</div>
-<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>      <span class="keyword">const</span> std::string&amp; base_name,</div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>      MTL::Library* mtl_lib,</div>
-<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>      <span class="keyword">const</span> std::string&amp; hash_name = <span class="stringliteral">&quot;&quot;</span>,</div>
-<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts = {},</div>
-<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; linked_functions = {});</div>
-<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span> </div>
-<div class="line"><a id="l00177" name="l00177"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#afa0cac9d800c21a8a7f6cb224256abaf">  177</a></span>  MTL::ComputePipelineState* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#afa0cac9d800c21a8a7f6cb224256abaf">get_kernel</a>(</div>
-<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>      <span class="keyword">const</span> std::string&amp; base_name,</div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>      <span class="keyword">const</span> std::string&amp; lib_name = <span class="stringliteral">&quot;mlx&quot;</span>,</div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>      <span class="keyword">const</span> std::string&amp; hash_name = <span class="stringliteral">&quot;&quot;</span>,</div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts = {},</div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; linked_functions = {});</div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span> </div>
-<div class="line"><a id="l00184" name="l00184"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a6e33e2b1287324fb4a6575e0da5e5881">  184</a></span>  MTL::ArgumentEncoder* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a6e33e2b1287324fb4a6575e0da5e5881">argument_encoder</a>(</div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>      <span class="keyword">const</span> std::vector&lt;MTL::ArgumentDescriptor*&gt;&amp; arg_descs) <span class="keyword">const</span>;</div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span> </div>
-<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>  <span class="comment">// Record temporary arrays for the given stream index</span></div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#acb90010af0cffe27fd8cc6c253d3a576">  188</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#acb90010af0cffe27fd8cc6c253d3a576">add_temporary</a>(<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> arr, <span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a72ad17c96fc6ce825bc77f0bed657901">  189</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a72ad17c96fc6ce825bc77f0bed657901">add_temporaries</a>(std::vector&lt;array&gt; arrays, <span class="keywordtype">int</span> index);</div>
-<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span> </div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f">  191</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f">set_residency_set</a>(<span class="keyword">const</span> MTL::ResidencySet* residency_set);</div>
 <div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span> </div>
-<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">DeviceStream</a>&amp; get_stream_(<span class="keywordtype">int</span> index) {</div>
-<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    <span class="keywordflow">return</span> stream_map_.find(index)-&gt;second;</div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>  }</div>
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>  MTL::Library* get_library_cache_(<span class="keyword">const</span> std::string&amp; name);</div>
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span> </div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>  MTL::Library* get_library_(<span class="keyword">const</span> std::string&amp; name);</div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>  MTL::Library* build_library_(<span class="keyword">const</span> std::string&amp; source_string);</div>
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span> </div>
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>  MTL::Function* get_function_(<span class="keyword">const</span> std::string&amp; name, MTL::Library* mtl_lib);</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a75ed55e73baf48013028796518723ff0">  193</a></span>  MTL::Library* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a75ed55e73baf48013028796518723ff0">get_library</a>(</div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>      <span class="keyword">const</span> std::string&amp; name,</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>      <span class="keyword">const</span> std::function&lt;std::string(<span class="keywordtype">void</span>)&gt;&amp; builder);</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span> </div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a6810c4dcbcfbf93fc51d42aa5ff0fc3a">  197</a></span>  MTL::ComputePipelineState* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a6810c4dcbcfbf93fc51d42aa5ff0fc3a">get_kernel</a>(</div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>      <span class="keyword">const</span> std::string&amp; base_name,</div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>      MTL::Library* mtl_lib,</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>      <span class="keyword">const</span> std::string&amp; hash_name = <span class="stringliteral">&quot;&quot;</span>,</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts = {},</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; linked_functions = {});</div>
 <div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span> </div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>  MTL::Function* get_function_(</div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      <span class="keyword">const</span> std::string&amp; name,</div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>      <span class="keyword">const</span> std::string&amp; specialized_name,</div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts,</div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>      MTL::Library* mtl_lib);</div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span> </div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>  MTL::LinkedFunctions* get_linked_functions_(</div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; funcs);</div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span> </div>
-<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>  MTL::ComputePipelineState* get_kernel_(</div>
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>      <span class="keyword">const</span> std::string&amp; name,</div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>      <span class="keyword">const</span> MTL::Function* mtl_function);</div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span> </div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>  MTL::ComputePipelineState* get_kernel_(</div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>      <span class="keyword">const</span> std::string&amp; name,</div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>      <span class="keyword">const</span> MTL::Function* mtl_function,</div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>      <span class="keyword">const</span> MTL::LinkedFunctions* linked_functions);</div>
-<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span> </div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>  MTL::ComputePipelineState* get_kernel_(</div>
-<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>      <span class="keyword">const</span> std::string&amp; base_name,</div>
-<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>      MTL::Library* mtl_lib,</div>
-<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>      <span class="keyword">const</span> std::string&amp; hash_name,</div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts = {},</div>
-<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; linked_functions = {});</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#afa0cac9d800c21a8a7f6cb224256abaf">  204</a></span>  MTL::ComputePipelineState* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#afa0cac9d800c21a8a7f6cb224256abaf">get_kernel</a>(</div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      <span class="keyword">const</span> std::string&amp; base_name,</div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>      <span class="keyword">const</span> std::string&amp; lib_name = <span class="stringliteral">&quot;mlx&quot;</span>,</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>      <span class="keyword">const</span> std::string&amp; hash_name = <span class="stringliteral">&quot;&quot;</span>,</div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts = {},</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; linked_functions = {});</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span> </div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a6e33e2b1287324fb4a6575e0da5e5881">  211</a></span>  MTL::ArgumentEncoder* <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a6e33e2b1287324fb4a6575e0da5e5881">argument_encoder</a>(</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>      <span class="keyword">const</span> std::vector&lt;MTL::ArgumentDescriptor*&gt;&amp; arg_descs) <span class="keyword">const</span>;</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span> </div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>  <span class="comment">// Record temporary arrays for the given stream index</span></div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#acb90010af0cffe27fd8cc6c253d3a576">  215</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#acb90010af0cffe27fd8cc6c253d3a576">add_temporary</a>(<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> arr, <span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a72ad17c96fc6ce825bc77f0bed657901">  216</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a72ad17c96fc6ce825bc77f0bed657901">add_temporaries</a>(std::vector&lt;array&gt; arrays, <span class="keywordtype">int</span> index);</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span> </div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f">  218</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f">set_residency_set</a>(<span class="keyword">const</span> MTL::ResidencySet* residency_set);</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span> </div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">DeviceStream</a>&amp; get_stream_(<span class="keywordtype">int</span> index) {</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>    <span class="keywordflow">return</span> stream_map_.find(index)-&gt;second;</div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>  }</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>  MTL::Library* get_library_cache_(<span class="keyword">const</span> std::string&amp; name);</div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span> </div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>  MTL::Library* get_library_(<span class="keyword">const</span> std::string&amp; name);</div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>  MTL::Library* build_library_(<span class="keyword">const</span> std::string&amp; source_string);</div>
 <div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span> </div>
-<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>  MTL::Device* device_;</div>
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>  std::unordered_map&lt;int32_t, DeviceStream&gt; stream_map_;</div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span> </div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>  std::shared_mutex kernel_mtx_;</div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>  std::unordered_map&lt;std::string, MTL::ComputePipelineState*&gt; kernel_map_;</div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span> </div>
-<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>  std::shared_mutex library_mtx_;</div>
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>  std::unordered_map&lt;std::string, MTL::Library*&gt; library_map_;</div>
-<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>  <span class="keyword">const</span> MTL::ResidencySet* residency_set_{<span class="keyword">nullptr</span>};</div>
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>  std::string arch_;</div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>};</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>  MTL::Function* get_function_(<span class="keyword">const</span> std::string&amp; name, MTL::Library* mtl_lib);</div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span> </div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>  MTL::Function* get_function_(</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>      <span class="keyword">const</span> std::string&amp; name,</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>      <span class="keyword">const</span> std::string&amp; specialized_name,</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts,</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>      MTL::Library* mtl_lib);</div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span> </div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>  MTL::LinkedFunctions* get_linked_functions_(</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; funcs);</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span> </div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>  MTL::ComputePipelineState* get_kernel_(</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>      <span class="keyword">const</span> std::string&amp; name,</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>      <span class="keyword">const</span> MTL::Function* mtl_function);</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span> </div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>  MTL::ComputePipelineState* get_kernel_(</div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>      <span class="keyword">const</span> std::string&amp; name,</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>      <span class="keyword">const</span> MTL::Function* mtl_function,</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>      <span class="keyword">const</span> MTL::LinkedFunctions* linked_functions);</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span> </div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>  MTL::ComputePipelineState* get_kernel_(</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>      <span class="keyword">const</span> std::string&amp; base_name,</div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>      MTL::Library* mtl_lib,</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>      <span class="keyword">const</span> std::string&amp; hash_name,</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>      <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">MTLFCList</a>&amp; func_consts = {},</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>      <span class="keyword">const</span> std::vector&lt;MTL::Function*&gt;&amp; linked_functions = {});</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span> </div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>  MTL::Device* device_;</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>  std::unordered_map&lt;int32_t, DeviceStream&gt; stream_map_;</div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span> </div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>  std::shared_mutex kernel_mtx_;</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>  std::unordered_map&lt;std::string, MTL::ComputePipelineState*&gt; kernel_map_;</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span> </div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>  std::shared_mutex library_mtx_;</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>  std::unordered_map&lt;std::string, MTL::Library*&gt; library_map_;</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>  <span class="keyword">const</span> MTL::ResidencySet* residency_set_{<span class="keyword">nullptr</span>};</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>  std::string arch_;</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>};</div>
 </div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span> </div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1metal.html#a910797b74824e6ee576fbb533dee8b57">  241</a></span><a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp; <a class="code hl_function" href="namespacemlx_1_1core_1_1metal.html#a910797b74824e6ee576fbb533dee8b57">device</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_device.html">mlx::core::Device</a>);</div>
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span> </div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>} <span class="comment">// namespace mlx::core::metal</span></div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span> </div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1metal.html#a910797b74824e6ee576fbb533dee8b57">  268</a></span><a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a>&amp; <a class="code hl_function" href="namespacemlx_1_1core_1_1metal.html#a910797b74824e6ee576fbb533dee8b57">device</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_device.html">mlx::core::Device</a>);</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span> </div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>} <span class="comment">// namespace mlx::core::metal</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:131</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:158</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a03a2f0c712660a1bd437cb16e4aba79f"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f">mlx::core::metal::Device::set_residency_set</a></div><div class="ttdeci">void set_residency_set(const MTL::ResidencySet *residency_set)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a064e1cb6a16de7a0619f6447622350f8"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8">mlx::core::metal::Device::get_command_buffer_ops</a></div><div class="ttdeci">int get_command_buffer_ops(int index)</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a31dba377f2be44a746db10d1b9367653"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653">mlx::core::metal::Device::mtl_device</a></div><div class="ttdeci">MTL::Device * mtl_device()</div><div class="ttdef"><b>Definition</b> device.h:138</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a31dba377f2be44a746db10d1b9367653"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653">mlx::core::metal::Device::mtl_device</a></div><div class="ttdeci">MTL::Device * mtl_device()</div><div class="ttdef"><b>Definition</b> device.h:165</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a45945f2efcd242d915ffa2171e92bf9d"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a45945f2efcd242d915ffa2171e92bf9d">mlx::core::metal::Device::register_library</a></div><div class="ttdeci">void register_library(const std::string &amp;lib_name, const std::string &amp;lib_path)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a4f39c28c6cdd1d2da1918f5871bcba6e"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a4f39c28c6cdd1d2da1918f5871bcba6e">mlx::core::metal::Device::~Device</a></div><div class="ttdeci">~Device()</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a5fe3970fbe92ccc55fce4241ffbe5210"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210">mlx::core::metal::Device::get_command_buffer</a></div><div class="ttdeci">MTL::CommandBuffer * get_command_buffer(int index)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a60689f97347811b27e8c5ca23e0372bf"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a60689f97347811b27e8c5ca23e0372bf">mlx::core::metal::Device::end_encoding</a></div><div class="ttdeci">void end_encoding(int index)</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a65f64dd8bafdc704d871fc5be5e7bc0b"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b">mlx::core::metal::Device::get_architecture</a></div><div class="ttdeci">const std::string &amp; get_architecture()</div><div class="ttdef"><b>Definition</b> device.h:142</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a65f64dd8bafdc704d871fc5be5e7bc0b"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b">mlx::core::metal::Device::get_architecture</a></div><div class="ttdeci">const std::string &amp; get_architecture()</div><div class="ttdef"><b>Definition</b> device.h:169</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a6810c4dcbcfbf93fc51d42aa5ff0fc3a"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a6810c4dcbcfbf93fc51d42aa5ff0fc3a">mlx::core::metal::Device::get_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_kernel(const std::string &amp;base_name, MTL::Library *mtl_lib, const std::string &amp;hash_name=&quot;&quot;, const MTLFCList &amp;func_consts={}, const std::vector&lt; MTL::Function * &gt; &amp;linked_functions={})</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a6e33e2b1287324fb4a6575e0da5e5881"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a6e33e2b1287324fb4a6575e0da5e5881">mlx::core::metal::Device::argument_encoder</a></div><div class="ttdeci">MTL::ArgumentEncoder * argument_encoder(const std::vector&lt; MTL::ArgumentDescriptor * &gt; &amp;arg_descs) const</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a72ad17c96fc6ce825bc77f0bed657901"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a72ad17c96fc6ce825bc77f0bed657901">mlx::core::metal::Device::add_temporaries</a></div><div class="ttdeci">void add_temporaries(std::vector&lt; array &gt; arrays, int index)</div></div>
@@ -386,7 +425,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a7a33d4d601423a3d3c23d5ad7072abb6"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a7a33d4d601423a3d3c23d5ad7072abb6">mlx::core::metal::Device::increment_command_buffer_ops</a></div><div class="ttdeci">void increment_command_buffer_ops(int index)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a8135ae2a8c1e6f3861e84d4e60c28b67"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67">mlx::core::metal::Device::new_queue</a></div><div class="ttdeci">void new_queue(int index)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a95248f1387824067fd4fed23ace5ac0c"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c">mlx::core::metal::Device::commit_command_buffer</a></div><div class="ttdeci">void commit_command_buffer(int index)</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a99ff72689b7beb65ad4541391b0eeabf"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a99ff72689b7beb65ad4541391b0eeabf">mlx::core::metal::Device::register_library</a></div><div class="ttdeci">void register_library(const std::string &amp;lib_name)</div><div class="ttdef"><b>Definition</b> device.h:160</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_a99ff72689b7beb65ad4541391b0eeabf"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#a99ff72689b7beb65ad4541391b0eeabf">mlx::core::metal::Device::register_library</a></div><div class="ttdeci">void register_library(const std::string &amp;lib_name)</div><div class="ttdef"><b>Definition</b> device.h:187</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_abf59a4addb5473f9e814e3651ba85f06"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#abf59a4addb5473f9e814e3651ba85f06">mlx::core::metal::Device::Device</a></div><div class="ttdeci">Device(const Device &amp;)=delete</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_acb90010af0cffe27fd8cc6c253d3a576"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#acb90010af0cffe27fd8cc6c253d3a576">mlx::core::metal::Device::add_temporary</a></div><div class="ttdeci">void add_temporary(array arr, int index)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html_ad1d6382fd18a46b1906e1b43e0bd2e73"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73">mlx::core::metal::Device::operator=</a></div><div class="ttdeci">Device &amp; operator=(const Device &amp;)=delete</div></div>
@@ -403,34 +442,40 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context_html_a28bafec56edec3091e8716d8ccfb6ee1"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#a28bafec56edec3091e8716d8ccfb6ee1">mlx::core::metal::CommandEncoder::ConcurrentContext::~ConcurrentContext</a></div><div class="ttdeci">~ConcurrentContext()</div><div class="ttdef"><b>Definition</b> device.h:50</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context_html_aee044d7729739c96e845823f9ecc5174"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#aee044d7729739c96e845823f9ecc5174">mlx::core::metal::CommandEncoder::ConcurrentContext::ConcurrentContext</a></div><div class="ttdeci">ConcurrentContext(CommandEncoder &amp;enc)</div><div class="ttdef"><b>Definition</b> device.h:47</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></div><div class="ttdef"><b>Definition</b> device.h:41</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a1e41477f2f489e38499f7830a91c9810"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810">mlx::core::metal::CommandEncoder::dispatchThreads</a></div><div class="ttdeci">void dispatchThreads(MTL::Size grid_dims, MTL::Size group_dims)</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a0a8501b940e5a347475fa4bc38fb4c05"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05">mlx::core::metal::CommandEncoder::dispatch_threads</a></div><div class="ttdeci">void dispatch_threads(MTL::Size grid_dims, MTL::Size group_dims)</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a2334774486f447213ee997e55c2e52a3"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a2334774486f447213ee997e55c2e52a3">mlx::core::metal::CommandEncoder::CommandEncoder</a></div><div class="ttdeci">CommandEncoder(MTL::CommandBuffer *cbuf)</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a27ded7e54bc1712063c874646b445509"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509">mlx::core::metal::CommandEncoder::inputs</a></div><div class="ttdeci">std::unordered_set&lt; const void * &gt; &amp; inputs()</div><div class="ttdef"><b>Definition</b> device.h:77</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a27ded7e54bc1712063c874646b445509"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509">mlx::core::metal::CommandEncoder::inputs</a></div><div class="ttdeci">std::unordered_set&lt; const void * &gt; &amp; inputs()</div><div class="ttdef"><b>Definition</b> device.h:104</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a3f42a1362b4a513fa89e7b3dcc570a8e"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e">mlx::core::metal::CommandEncoder::operator=</a></div><div class="ttdeci">CommandEncoder &amp; operator=(const CommandEncoder &amp;)=delete</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a48b548a0b15f9d1279c938a1c6167034"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">mlx::core::metal::CommandEncoder::start_concurrent</a></div><div class="ttdeci">ConcurrentContext start_concurrent()</div><div class="ttdef"><b>Definition</b> device.h:71</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a48b548a0b15f9d1279c938a1c6167034"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">mlx::core::metal::CommandEncoder::start_concurrent</a></div><div class="ttdeci">ConcurrentContext start_concurrent()</div><div class="ttdef"><b>Definition</b> device.h:98</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a68c3c6a036e11ec40211c09811bbed1b"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b">mlx::core::metal::CommandEncoder::set_vector_bytes</a></div><div class="ttdeci">void set_vector_bytes(const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)</div><div class="ttdef"><b>Definition</b> device.h:80</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a6a2e28e542eaa2886041bddd51ff6522"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522">mlx::core::metal::CommandEncoder::set_output_array</a></div><div class="ttdeci">void set_output_array(array &amp;a, int idx, int64_t offset=0)</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a74bcd8e35f80f5a62db48c4a2bb0173e"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e">mlx::core::metal::CommandEncoder::dispatchThreadgroups</a></div><div class="ttdeci">void dispatchThreadgroups(MTL::Size grid_dims, MTL::Size group_dims)</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a6d4c03a6585deedb5ccd1a1057d0c6ef"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef">mlx::core::metal::CommandEncoder::set_compute_pipeline_state</a></div><div class="ttdeci">void set_compute_pipeline_state(MTL::ComputePipelineState *kernel)</div><div class="ttdef"><b>Definition</b> device.h:67</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a7375adf9ee5355bcf4b7f5f210efd115"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a7375adf9ee5355bcf4b7f5f210efd115">mlx::core::metal::CommandEncoder::set_vector_bytes</a></div><div class="ttdeci">void set_vector_bytes(const std::vector&lt; T &gt; &amp;vec, int idx)</div><div class="ttdef"><b>Definition</b> device.h:84</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a85796b2bf41dbf347ae0978d4660600d"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d">mlx::core::metal::CommandEncoder::dispatch_threadgroups</a></div><div class="ttdeci">void dispatch_threadgroups(MTL::Size grid_dims, MTL::Size group_dims)</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a9b6dd221ccd2d939d544004cb6279198"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9b6dd221ccd2d939d544004cb6279198">mlx::core::metal::CommandEncoder::~CommandEncoder</a></div><div class="ttdeci">~CommandEncoder()</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_aac45ab0630ea32cf7d15c7ba3e229966"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966">mlx::core::metal::CommandEncoder::operator-&gt;</a></div><div class="ttdeci">MTL::ComputeCommandEncoder * operator-&gt;()</div><div class="ttdef"><b>Definition</b> device.h:61</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_a9c343f791812a45c6c03a5c9f27f74d5"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5">mlx::core::metal::CommandEncoder::set_bytes</a></div><div class="ttdeci">void set_bytes(const T *v, int n, int idx)</div><div class="ttdef"><b>Definition</b> device.h:89</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_ab69ff0d7f14b9b59db4df0608193dce4"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">mlx::core::metal::CommandEncoder::set_input_array</a></div><div class="ttdeci">void set_input_array(const array &amp;a, int idx, int64_t offset=0)</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_abc52d18ea87d213c47fd26062c829849"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#abc52d18ea87d213c47fd26062c829849">mlx::core::metal::CommandEncoder::set_bytes</a></div><div class="ttdeci">void set_bytes(const T &amp;v, int idx)</div><div class="ttdef"><b>Definition</b> device.h:94</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_ac68ca977b5bde5434284ce7979647f14"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ac68ca977b5bde5434284ce7979647f14">mlx::core::metal::CommandEncoder::CommandEncoder</a></div><div class="ttdeci">CommandEncoder(const CommandEncoder &amp;)=delete</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_ad538ae88f90560063f9ba502e2795991"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991">mlx::core::metal::CommandEncoder::maybeInsertBarrier</a></div><div class="ttdeci">void maybeInsertBarrier()</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_aefa48740fdee884f02e2d379bca4e78f"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">mlx::core::metal::CommandEncoder::outputs</a></div><div class="ttdeci">std::unordered_set&lt; const void * &gt; outputs()</div><div class="ttdef"><b>Definition</b> device.h:82</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html">mlx::core::metal::DeviceStream</a></div><div class="ttdef"><b>Definition</b> device.h:105</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a1c4397732f64f5811381dd01e30e020e"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a1c4397732f64f5811381dd01e30e020e">mlx::core::metal::DeviceStream::~DeviceStream</a></div><div class="ttdeci">~DeviceStream()</div><div class="ttdef"><b>Definition</b> device.h:107</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a55a7a92c6abad369c99a5ede7a2521b9"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9">mlx::core::metal::DeviceStream::outputs</a></div><div class="ttdeci">std::unordered_map&lt; const void *, std::shared_ptr&lt; Fence &gt; &gt; outputs</div><div class="ttdef"><b>Definition</b> device.h:115</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a573326bc8b48e39076850c7bf52ad0d7"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7">mlx::core::metal::DeviceStream::DeviceStream</a></div><div class="ttdeci">DeviceStream(MTL::CommandQueue *queue)</div><div class="ttdef"><b>Definition</b> device.h:106</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a58e435217b9922f882507ebf48bfbbdd"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a58e435217b9922f882507ebf48bfbbdd">mlx::core::metal::DeviceStream::encoder</a></div><div class="ttdeci">std::unique_ptr&lt; CommandEncoder &gt; encoder</div><div class="ttdef"><b>Definition</b> device.h:126</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a6fa08cca881fc3798ae45994a11a4fcd"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a6fa08cca881fc3798ae45994a11a4fcd">mlx::core::metal::DeviceStream::fence_mtx</a></div><div class="ttdeci">std::mutex fence_mtx</div><div class="ttdef"><b>Definition</b> device.h:117</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a77c75a63c51ea56815a86bd882ed190d"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">mlx::core::metal::DeviceStream::queue</a></div><div class="ttdeci">MTL::CommandQueue * queue</div><div class="ttdef"><b>Definition</b> device.h:113</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a876199de8da1efa9a362451029638499"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a876199de8da1efa9a362451029638499">mlx::core::metal::DeviceStream::fence</a></div><div class="ttdeci">std::shared_ptr&lt; Fence &gt; fence</div><div class="ttdef"><b>Definition</b> device.h:127</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a99183c92599edfeb75f7fa0f37e1d9eb"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">mlx::core::metal::DeviceStream::buffer</a></div><div class="ttdeci">MTL::CommandBuffer * buffer</div><div class="ttdef"><b>Definition</b> device.h:121</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_ab6048b329e65a59033834f3bdd351782"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">mlx::core::metal::DeviceStream::buffer_ops</a></div><div class="ttdeci">int buffer_ops</div><div class="ttdef"><b>Definition</b> device.h:122</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_aee88009117dfff1ad121eabe28d5f3de"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#aee88009117dfff1ad121eabe28d5f3de">mlx::core::metal::DeviceStream::temporaries</a></div><div class="ttdeci">std::vector&lt; array &gt; temporaries</div><div class="ttdef"><b>Definition</b> device.h:128</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html">mlx::core::metal::Fence</a></div><div class="ttdef"><b>Definition</b> device.h:97</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html_a30bee4957ae595e04922952a8010fc79"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html#a30bee4957ae595e04922952a8010fc79">mlx::core::metal::Fence::Fence</a></div><div class="ttdeci">Fence(MTL::Fence *fence)</div><div class="ttdef"><b>Definition</b> device.h:98</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html_a4940c1aece13814af7727de9abb511f2"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html#a4940c1aece13814af7727de9abb511f2">mlx::core::metal::Fence::~Fence</a></div><div class="ttdeci">~Fence()</div><div class="ttdef"><b>Definition</b> device.h:99</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html_aeccd8f2b81418ae9fc446ae2b6e15b87"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">mlx::core::metal::Fence::fence</a></div><div class="ttdeci">MTL::Fence * fence</div><div class="ttdef"><b>Definition</b> device.h:102</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_aeef08f5f3c015578d40de756a6465aa2"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2">mlx::core::metal::CommandEncoder::update_fence</a></div><div class="ttdeci">void update_fence(MTL::Fence *fence)</div><div class="ttdef"><b>Definition</b> device.h:75</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_aefa48740fdee884f02e2d379bca4e78f"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">mlx::core::metal::CommandEncoder::outputs</a></div><div class="ttdeci">std::unordered_set&lt; const void * &gt; outputs()</div><div class="ttdef"><b>Definition</b> device.h:109</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html_aefdadbff4e003dc6f77506840babc088"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088">mlx::core::metal::CommandEncoder::wait_for_fence</a></div><div class="ttdeci">void wait_for_fence(MTL::Fence *fence)</div><div class="ttdef"><b>Definition</b> device.h:71</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html">mlx::core::metal::DeviceStream</a></div><div class="ttdef"><b>Definition</b> device.h:132</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a1c4397732f64f5811381dd01e30e020e"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a1c4397732f64f5811381dd01e30e020e">mlx::core::metal::DeviceStream::~DeviceStream</a></div><div class="ttdeci">~DeviceStream()</div><div class="ttdef"><b>Definition</b> device.h:134</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a55a7a92c6abad369c99a5ede7a2521b9"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9">mlx::core::metal::DeviceStream::outputs</a></div><div class="ttdeci">std::unordered_map&lt; const void *, std::shared_ptr&lt; Fence &gt; &gt; outputs</div><div class="ttdef"><b>Definition</b> device.h:142</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a573326bc8b48e39076850c7bf52ad0d7"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7">mlx::core::metal::DeviceStream::DeviceStream</a></div><div class="ttdeci">DeviceStream(MTL::CommandQueue *queue)</div><div class="ttdef"><b>Definition</b> device.h:133</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a58e435217b9922f882507ebf48bfbbdd"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a58e435217b9922f882507ebf48bfbbdd">mlx::core::metal::DeviceStream::encoder</a></div><div class="ttdeci">std::unique_ptr&lt; CommandEncoder &gt; encoder</div><div class="ttdef"><b>Definition</b> device.h:153</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a6fa08cca881fc3798ae45994a11a4fcd"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a6fa08cca881fc3798ae45994a11a4fcd">mlx::core::metal::DeviceStream::fence_mtx</a></div><div class="ttdeci">std::mutex fence_mtx</div><div class="ttdef"><b>Definition</b> device.h:144</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a77c75a63c51ea56815a86bd882ed190d"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">mlx::core::metal::DeviceStream::queue</a></div><div class="ttdeci">MTL::CommandQueue * queue</div><div class="ttdef"><b>Definition</b> device.h:140</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a876199de8da1efa9a362451029638499"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a876199de8da1efa9a362451029638499">mlx::core::metal::DeviceStream::fence</a></div><div class="ttdeci">std::shared_ptr&lt; Fence &gt; fence</div><div class="ttdef"><b>Definition</b> device.h:154</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_a99183c92599edfeb75f7fa0f37e1d9eb"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">mlx::core::metal::DeviceStream::buffer</a></div><div class="ttdeci">MTL::CommandBuffer * buffer</div><div class="ttdef"><b>Definition</b> device.h:148</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_ab6048b329e65a59033834f3bdd351782"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">mlx::core::metal::DeviceStream::buffer_ops</a></div><div class="ttdeci">int buffer_ops</div><div class="ttdef"><b>Definition</b> device.h:149</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_device_stream_html_aee88009117dfff1ad121eabe28d5f3de"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_device_stream.html#aee88009117dfff1ad121eabe28d5f3de">mlx::core::metal::DeviceStream::temporaries</a></div><div class="ttdeci">std::vector&lt; array &gt; temporaries</div><div class="ttdef"><b>Definition</b> device.h:155</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html">mlx::core::metal::Fence</a></div><div class="ttdef"><b>Definition</b> device.h:124</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html_a30bee4957ae595e04922952a8010fc79"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html#a30bee4957ae595e04922952a8010fc79">mlx::core::metal::Fence::Fence</a></div><div class="ttdeci">Fence(MTL::Fence *fence)</div><div class="ttdef"><b>Definition</b> device.h:125</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html_a4940c1aece13814af7727de9abb511f2"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html#a4940c1aece13814af7727de9abb511f2">mlx::core::metal::Fence::~Fence</a></div><div class="ttdeci">~Fence()</div><div class="ttdef"><b>Definition</b> device.h:126</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_fence_html_aeccd8f2b81418ae9fc446ae2b6e15b87"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_fence.html#aeccd8f2b81418ae9fc446ae2b6e15b87">mlx::core::metal::Fence::fence</a></div><div class="ttdeci">MTL::Fence * fence</div><div class="ttdef"><b>Definition</b> device.h:129</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/backend_2metal_2kernels_2complex_8h_source.html b/docs/build/html/backend_2metal_2kernels_2complex_8h_source.html
index 87aa7be43..9f5645c71 100644
--- a/docs/build/html/backend_2metal_2kernels_2complex_8h_source.html
+++ b/docs/build/html/backend_2metal_2kernels_2complex_8h_source.html
@@ -268,7 +268,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="abackend_2metal_2kernels_2complex_8h_html_ad6af5c6c5ed4898b49758618e5aee189"><div class="ttname"><a href="backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189">operator+</a></div><div class="ttdeci">constexpr complex64_t operator+(complex64_t a, complex64_t b)</div><div class="ttdef"><b>Definition</b> complex.h:104</div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2complex_8h_html_ae6a708f67d6fd9b0962aa8877cec6d35"><div class="ttname"><a href="backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35">operator/</a></div><div class="ttdeci">constexpr complex64_t operator/(complex64_t a, complex64_t b)</div><div class="ttdef"><b>Definition</b> complex.h:116</div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2complex_8h_html_aee04c9a63c6716a99a027418354debb0"><div class="ttname"><a href="backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0">operator&lt;=</a></div><div class="ttdeci">constexpr bool operator&lt;=(complex64_t a, complex64_t b)</div><div class="ttdef"><b>Definition</b> complex.h:92</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
 <div class="ttc" id="astructcomplex64__t_html"><div class="ttname"><a href="structcomplex64__t.html">complex64_t</a></div><div class="ttdef"><b>Definition</b> complex.h:20</div></div>
 <div class="ttc" id="astructcomplex64__t_html_a0a27a41206400f1e62b60ceb56960c93"><div class="ttname"><a href="structcomplex64__t.html#a0a27a41206400f1e62b60ceb56960c93">complex64_t::complex64_t</a></div><div class="ttdeci">constexpr complex64_t(T x) const ant</div><div class="ttdef"><b>Definition</b> complex.h:48</div></div>
 <div class="ttc" id="astructcomplex64__t_html_a29782289bb90d6294099667b86509cd3"><div class="ttname"><a href="structcomplex64__t.html#a29782289bb90d6294099667b86509cd3">complex64_t::complex64_t</a></div><div class="ttdeci">constexpr complex64_t()</div><div class="ttdef"><b>Definition</b> complex.h:26</div></div>
diff --git a/docs/build/html/backend_2metal_2kernels_2fft_8h_source.html b/docs/build/html/backend_2metal_2kernels_2fft_8h_source.html
index 80b3a9cb8..85d097d05 100644
--- a/docs/build/html/backend_2metal_2kernels_2fft_8h_source.html
+++ b/docs/build/html/backend_2metal_2kernels_2fft_8h_source.html
@@ -629,8 +629,8 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="abackend_2metal_2kernels_2fft_8h_html_afea05e9a7105bafbaafca25042f4d1b4"><div class="ttname"><a href="backend_2metal_2kernels_2fft_8h.html#afea05e9a7105bafbaafca25042f4d1b4">rader_5_steps_</a></div><div class="ttdeci">static constant constexpr const int rader_5_steps_</div><div class="ttdef"><b>Definition</b> fft.h:44</div></div>
 <div class="ttc" id="agroup__ops_html_gaf8913cabeb9fb193ba687aaeb2087764"><div class="ttname"><a href="group__ops.html#gaf8913cabeb9fb193ba687aaeb2087764">mlx::core::real</a></div><div class="ttdeci">array real(const array &amp;a, StreamOrDevice s={})</div></div>
 <div class="ttc" id="ametal_2kernels_2hadamard_8h_html_a590e5366adc78bab4fe44e37885d413f"><div class="ttname"><a href="metal_2kernels_2hadamard_8h.html#a590e5366adc78bab4fe44e37885d413f">radix_func</a></div><div class="ttdeci">METAL_FUNC void radix_func(thread float *x)</div><div class="ttdef"><b>Definition</b> hadamard.h:11</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
 <div class="ttc" id="aradix_8h_html"><div class="ttname"><a href="radix_8h.html">radix.h</a></div></div>
 <div class="ttc" id="aradix_8h_html_a026e6779e6d2ecdef39ff4aad186091e"><div class="ttname"><a href="radix_8h.html#a026e6779e6d2ecdef39ff4aad186091e">radix5</a></div><div class="ttdeci">METAL_FUNC void radix5(thread float2 *x, thread float2 *y)</div><div class="ttdef"><b>Definition</b> radix.h:69</div></div>
 <div class="ttc" id="aradix_8h_html_a12cb26bd3ad635d16a195ccea750256d"><div class="ttname"><a href="radix_8h.html#a12cb26bd3ad635d16a195ccea750256d">radix4</a></div><div class="ttdeci">METAL_FUNC void radix4(thread float2 *x, thread float2 *y)</div><div class="ttdef"><b>Definition</b> radix.h:56</div></div>
diff --git a/docs/build/html/backend_2metal_2kernels_2jit_2bf16_8h.html b/docs/build/html/backend_2metal_2kernels_2jit_2bf16_8h.html
new file mode 100644
index 000000000..563d1b301
--- /dev/null
+++ b/docs/build/html/backend_2metal_2kernels_2jit_2bf16_8h.html
@@ -0,0 +1,160 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/jit/bf16.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_fb5e52e7ad5a84a63db2993d12f7610c.html">jit</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#define-members">Macros</a>  </div>
+  <div class="headertitle"><div class="title">bf16.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<div class="textblock"><code>#include &quot;<a class="el" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html">mlx/backend/metal/kernels/metal_3_1/bf16.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">mlx/backend/metal/kernels/metal_3_0/bf16.h</a>&quot;</code><br />
+</div>
+<p><a href="backend_2metal_2kernels_2jit_2bf16_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="define-members" name="define-members"></a>
+Macros</h2></td></tr>
+<tr class="memitem:aaf5bb88c2349054a6c4c2aefee63d3d2" id="r_aaf5bb88c2349054a6c4c2aefee63d3d2"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaf5bb88c2349054a6c4c2aefee63d3d2">jit_if</a>&#160;&#160;&#160;#if</td></tr>
+<tr class="separator:aaf5bb88c2349054a6c4c2aefee63d3d2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a4b2f08732045407adc7ee181e39e5ae3" id="r_a4b2f08732045407adc7ee181e39e5ae3"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4b2f08732045407adc7ee181e39e5ae3">jit_else</a>&#160;&#160;&#160;#else</td></tr>
+<tr class="separator:a4b2f08732045407adc7ee181e39e5ae3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a5049b44a1fffcb837e0c470ae4cafc56" id="r_a5049b44a1fffcb837e0c470ae4cafc56"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5049b44a1fffcb837e0c470ae4cafc56">jit_endif</a>&#160;&#160;&#160;#endif</td></tr>
+<tr class="separator:a5049b44a1fffcb837e0c470ae4cafc56"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Macro Definition Documentation</h2>
+<a id="a4b2f08732045407adc7ee181e39e5ae3" name="a4b2f08732045407adc7ee181e39e5ae3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4b2f08732045407adc7ee181e39e5ae3">&#9670;&#160;</a></span>jit_else</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define jit_else&#160;&#160;&#160;#else</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a5049b44a1fffcb837e0c470ae4cafc56" name="a5049b44a1fffcb837e0c470ae4cafc56"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a5049b44a1fffcb837e0c470ae4cafc56">&#9670;&#160;</a></span>jit_endif</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define jit_endif&#160;&#160;&#160;#endif</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aaf5bb88c2349054a6c4c2aefee63d3d2" name="aaf5bb88c2349054a6c4c2aefee63d3d2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aaf5bb88c2349054a6c4c2aefee63d3d2">&#9670;&#160;</a></span>jit_if</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">#define jit_if&#160;&#160;&#160;#if</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/backend_2metal_2kernels_2jit_2bf16_8h_source.html b/docs/build/html/backend_2metal_2kernels_2jit_2bf16_8h_source.html
new file mode 100644
index 000000000..fd1428656
--- /dev/null
+++ b/docs/build/html/backend_2metal_2kernels_2jit_2bf16_8h_source.html
@@ -0,0 +1,121 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/jit/bf16.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_fb5e52e7ad5a84a63db2993d12f7610c.html">jit</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">bf16.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="backend_2metal_2kernels_2jit_2bf16_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="comment">// clang-format off</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2">    4</a></span><span class="preprocessor">#define jit_if #if</span></div>
+<div class="line"><a id="l00005" name="l00005"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3">    5</a></span><span class="preprocessor">#define jit_else #else</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56">    6</a></span><span class="preprocessor">#define jit_endif #endif</span></div>
+<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span> </div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><a class="code hl_define" href="backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2">jit_if</a> (__METAL_VERSION__ &gt;= 310)</div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span> </div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="preprocessor">#include &quot;mlx/backend/metal/kernels/metal_3_1/bf16.h&quot;</span></div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span> </div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><a class="code hl_define" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3">jit_else</a></div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span> </div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html">mlx/backend/metal/kernels/metal_3_0/bf16.h</a>&quot;</span></div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span> </div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><a class="code hl_define" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56">jit_endif</a> <span class="comment">// clang-format on</span></div>
+<div class="ttc" id="abackend_2metal_2kernels_2jit_2bf16_8h_html_a4b2f08732045407adc7ee181e39e5ae3"><div class="ttname"><a href="backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3">jit_else</a></div><div class="ttdeci">#define jit_else</div><div class="ttdef"><b>Definition</b> bf16.h:5</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2jit_2bf16_8h_html_a5049b44a1fffcb837e0c470ae4cafc56"><div class="ttname"><a href="backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56">jit_endif</a></div><div class="ttdeci">#define jit_endif</div><div class="ttdef"><b>Definition</b> bf16.h:6</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2jit_2bf16_8h_html_aaf5bb88c2349054a6c4c2aefee63d3d2"><div class="ttname"><a href="backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2">jit_if</a></div><div class="ttdeci">#define jit_if</div><div class="ttdef"><b>Definition</b> bf16.h:4</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html">bf16.h</a></div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/backend_2metal_2kernels_2bf16_8h.html b/docs/build/html/backend_2metal_2kernels_2metal__3__0_2bf16_8h.html
similarity index 99%
rename from docs/build/html/backend_2metal_2kernels_2bf16_8h.html
rename to docs/build/html/backend_2metal_2kernels_2metal__3__0_2bf16_8h.html
index 9d74af68c..76a2309e8 100644
--- a/docs/build/html/backend_2metal_2kernels_2bf16_8h.html
+++ b/docs/build/html/backend_2metal_2kernels_2metal__3__0_2bf16_8h.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=11"/>
 <meta name="generator" content="Doxygen 1.12.0"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: mlx/backend/metal/kernels/bf16.h File Reference</title>
+<title>MLX: mlx/backend/metal/kernels/metal_3_0/bf16.h File Reference</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -83,7 +83,7 @@ $(function(){ initResizable(false); });
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_d36f9e79442ec4bd53287b83bdefe7e5.html">metal_3_0</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div id="doc-content">
@@ -99,9 +99,8 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 <div class="textblock"><code>#include &lt;metal_stdlib&gt;</code><br />
-<code>#include &quot;<a class="el" href="bf16__math_8h_source.html">mlx/backend/metal/kernels/bf16_math.h</a>&quot;</code><br />
 </div>
-<p><a href="backend_2metal_2kernels_2bf16_8h_source.html">Go to the source code of this file.</a></p>
+<p><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">Go to the source code of this file.</a></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
 Classes</h2></td></tr>
@@ -821,6 +820,10 @@ Functions</h2></td></tr>
 <tr class="separator:aa251d6483d3b099d1b5311fbe6f0bce2"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a83320ba983d90dd1fa5847b6940dc0bb" id="r_a83320ba983d90dd1fa5847b6940dc0bb"><td class="memItemLeft" align="right" valign="top">METAL_FUNC bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">metal::isnan</a> (<a class="el" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> x)</td></tr>
 <tr class="separator:a83320ba983d90dd1fa5847b6940dc0bb"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1420e191fa60d707dce327d0938e3088" id="r_a1420e191fa60d707dce327d0938e3088"><td class="memItemLeft" align="right" valign="top">uint16_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a> (const <a class="el" href="#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a1420e191fa60d707dce327d0938e3088"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8d066e48cf3e2a0583c71816fa40f7f4" id="r_a8d066e48cf3e2a0583c71816fa40f7f4"><td class="memItemLeft" align="right" valign="top"><a class="el" href="#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a> (const uint16_t x)</td></tr>
+<tr class="separator:a8d066e48cf3e2a0583c71816fa40f7f4"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="var-members" name="var-members"></a>
 Variables</h2></td></tr>
@@ -858,8 +861,8 @@ Variables</h2></td></tr>
 <div class="line">  bfloat_binop_helper(_op_, _operator_, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>, uint32_t, <span class="keywordtype">float</span>);     \</div>
 <div class="line">  bfloat_binop_helper(_op_, _operator_, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>, int64_t, <span class="keywordtype">float</span>);      \</div>
 <div class="line">  bfloat_binop_helper(_op_, _operator_, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>, uint64_t, <span class="keywordtype">float</span>);</div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a78c92beda4436da9a2e520fa98c59f70"><div class="ttname"><a href="#a78c92beda4436da9a2e520fa98c59f70">bfloat_binop_base</a></div><div class="ttdeci">#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype)</div><div class="ttdef"><b>Definition</b> bf16.h:141</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a78c92beda4436da9a2e520fa98c59f70"><div class="ttname"><a href="#a78c92beda4436da9a2e520fa98c59f70">bfloat_binop_base</a></div><div class="ttdeci">#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype)</div><div class="ttdef"><b>Definition</b> bf16.h:135</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 </div><!-- fragment -->
 </div>
 </div>
@@ -996,7 +999,7 @@ Variables</h2></td></tr>
 <div class="line">  bfloat_inplace_op_addr_space_helper(-, <span class="keyword">operator</span>-=, itype); \</div>
 <div class="line">  bfloat_inplace_op_addr_space_helper(*, <span class="keyword">operator</span>*=, itype); \</div>
 <div class="line">  bfloat_inplace_op_addr_space_helper(/, <span class="keyword">operator</span>/=, itype);</div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_af30a2cbd2c3415516203b83bd21872f8"><div class="ttname"><a href="#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a></div><div class="ttdeci">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype)</div><div class="ttdef"><b>Definition</b> bf16.h:209</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_af30a2cbd2c3415516203b83bd21872f8"><div class="ttname"><a href="#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a></div><div class="ttdeci">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype)</div><div class="ttdef"><b>Definition</b> bf16.h:203</div></div>
 </div><!-- fragment -->
 </div>
 </div>
@@ -1021,7 +1024,7 @@ Variables</h2></td></tr>
 <b>Value:</b><div class="fragment"><div class="line">  <a class="code hl_define" href="#a2846fd11b5e19b435e9f7ef0998c9b1d">bfloat_inplace_op_helper</a>(__op__, __operator__, device);         \</div>
 <div class="line">  bfloat_inplace_op_helper(__op__, __operator__, thread);         \</div>
 <div class="line">  bfloat_inplace_op_helper(__op__, __operator__, threadgroup);</div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a2846fd11b5e19b435e9f7ef0998c9b1d"><div class="ttname"><a href="#a2846fd11b5e19b435e9f7ef0998c9b1d">bfloat_inplace_op_helper</a></div><div class="ttdeci">#define bfloat_inplace_op_helper(__op__, __operator__, itype, addr_space)</div><div class="ttdef"><b>Definition</b> bf16.h:197</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a2846fd11b5e19b435e9f7ef0998c9b1d"><div class="ttname"><a href="#a2846fd11b5e19b435e9f7ef0998c9b1d">bfloat_inplace_op_helper</a></div><div class="ttdeci">#define bfloat_inplace_op_helper(__op__, __operator__, itype, addr_space)</div><div class="ttdef"><b>Definition</b> bf16.h:191</div></div>
 </div><!-- fragment -->
 </div>
 </div>
@@ -1142,6 +1145,31 @@ Variables</h2></td></tr>
 </div>
 </div>
 <h2 class="groupheader">Function Documentation</h2>
+<a id="a1420e191fa60d707dce327d0938e3088" name="a1420e191fa60d707dce327d0938e3088"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1420e191fa60d707dce327d0938e3088">&#9670;&#160;</a></span>bfloat16_to_uint16()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">uint16_t bfloat16_to_uint16 </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
 <a id="a3b33ae338dc4f223d0f3c748de07bad1" name="a3b33ae338dc4f223d0f3c748de07bad1"></a>
 <h2 class="memtitle"><span class="permalink"><a href="#a3b33ae338dc4f223d0f3c748de07bad1">&#9670;&#160;</a></span>bfloat_bits_to_float()</h2>
 
@@ -10901,6 +10929,31 @@ Variables</h2></td></tr>
 </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a8d066e48cf3e2a0583c71816fa40f7f4" name="a8d066e48cf3e2a0583c71816fa40f7f4"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a8d066e48cf3e2a0583c71816fa40f7f4">&#9670;&#160;</a></span>uint16_to_bfloat16()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> uint16_to_bfloat16 </td>
+          <td>(</td>
+          <td class="paramtype">const uint16_t</td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <h2 class="groupheader">Variable Documentation</h2>
diff --git a/docs/build/html/backend_2metal_2kernels_2bf16_8h_source.html b/docs/build/html/backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html
similarity index 60%
rename from docs/build/html/backend_2metal_2kernels_2bf16_8h_source.html
rename to docs/build/html/backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html
index c29e3c420..9c4dd6bcd 100644
--- a/docs/build/html/backend_2metal_2kernels_2bf16_8h_source.html
+++ b/docs/build/html/backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=11"/>
 <meta name="generator" content="Doxygen 1.12.0"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: mlx/backend/metal/kernels/bf16.h Source File</title>
+<title>MLX: mlx/backend/metal/kernels/metal_3_0/bf16.h Source File</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -83,7 +83,7 @@ $(function(){ initResizable(false); });
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_d36f9e79442ec4bd53287b83bdefe7e5.html">metal_3_0</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div id="doc-content">
@@ -91,7 +91,7 @@ $(function(){ initResizable(false); });
   <div class="headertitle"><div class="title">bf16.h</div></div>
 </div><!--header-->
 <div class="contents">
-<a href="backend_2metal_2kernels_2bf16_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2023 Apple Inc.</span></div>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2023 Apple Inc.</span></div>
 <div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
 <div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
 <div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
@@ -99,395 +99,394 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
 <div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="keyword">using namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a>;</div>
 <div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span> </div>
-<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="preprocessor">#if (MLX_METAL_VERSION &gt;= 310) || (__METAL_VERSION__ &gt;= 310)</span></div>
-<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span> </div>
-<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span><span class="keyword">typedef</span> bfloat <a class="code hl_typedef" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>;</div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="comment">// Helpers</span></div>
 <div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span> </div>
-<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="preprocessor">#else</span></div>
-<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span> </div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="comment">// Helpers</span></div>
-<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span> </div>
-<div class="foldopen" id="foldopen00019" data-start="{" data-end="}">
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">   19</a></span><span class="keyword">constexpr</span> METAL_FUNC uint16_t <a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keywordtype">float</span> x) {</div>
-<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>  <span class="comment">// Check for nan</span></div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>  <span class="keywordflow">if</span> ((as_type&lt;uint32_t&gt;(x) &amp; ~_fp_encoding_traits&lt;float&gt;::sign_mask) &gt;</div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>      _fp_encoding_traits&lt;float&gt;::inf_mask) {</div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>    <span class="keywordflow">return</span> uint16_t(as_type&lt;uint32_t&gt;(0x7FC0));</div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>  }</div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>  <span class="comment">// Take bits</span></div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>  uint32_t float_bits = as_type&lt;uint32_t&gt;(x);</div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span> </div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>  <span class="comment">// Round to nearest even</span></div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  float_bits += ((float_bits &gt;&gt; 16) &amp; 1) + as_type&lt;uint32_t&gt;(0x7FFF);</div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span> </div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="comment">// Take upper 16 bits</span></div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>  <span class="keywordflow">return</span> float_bits &gt;&gt; 16;</div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>}</div>
+<div class="foldopen" id="foldopen00013" data-start="{" data-end="}">
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">   13</a></span><span class="keyword">constexpr</span> METAL_FUNC uint16_t <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keywordtype">float</span> x) {</div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span>  <span class="comment">// Check for nan</span></div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>  <span class="keywordflow">if</span> ((as_type&lt;uint32_t&gt;(x) &amp; ~_fp_encoding_traits&lt;float&gt;::sign_mask) &gt;</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>      _fp_encoding_traits&lt;float&gt;::inf_mask) {</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    <span class="keywordflow">return</span> uint16_t(as_type&lt;uint32_t&gt;(0x7FC0));</div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>  }</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>  <span class="comment">// Take bits</span></div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>  uint32_t float_bits = as_type&lt;uint32_t&gt;(x);</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span> </div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>  <span class="comment">// Round to nearest even</span></div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>  float_bits += ((float_bits &gt;&gt; 16) &amp; 1) + as_type&lt;uint32_t&gt;(0x7FFF);</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span> </div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>  <span class="comment">// Take upper 16 bits</span></div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>  <span class="keywordflow">return</span> float_bits &gt;&gt; 16;</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>}</div>
 </div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span> </div>
-<div class="foldopen" id="foldopen00035" data-start="{" data-end="}">
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">   35</a></span><span class="keyword">constexpr</span> METAL_FUNC <span class="keywordtype">float</span> <a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(uint16_t x) {</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="comment">// Upper 16 bits are the data and lower 16 bits are 0s</span></div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keywordflow">return</span> as_type&lt;float&gt;((uint32_t)x &lt;&lt; 16);</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>}</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
+<div class="foldopen" id="foldopen00029" data-start="{" data-end="}">
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">   29</a></span><span class="keyword">constexpr</span> METAL_FUNC <span class="keywordtype">float</span> <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(uint16_t x) {</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  <span class="comment">// Upper 16 bits are the data and lower 16 bits are 0s</span></div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="keywordflow">return</span> as_type&lt;float&gt;((uint32_t)x &lt;&lt; 16);</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>}</div>
 </div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span> </div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span><span class="keyword">struct </span><a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>;</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span> </div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">   37</a></span><span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">can_convert_to_bfloat</a> =</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>    !is_same_v&lt;T, _MLX_BFloat16&gt; &amp;&amp; is_convertible_v&lt;T, float&gt;;</div>
 <div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span> </div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span><span class="keyword">struct </span><a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>;</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span> </div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">   43</a></span><span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">can_convert_to_bfloat</a> =</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>    !is_same_v&lt;T, _MLX_BFloat16&gt; &amp;&amp; is_convertible_v&lt;T, float&gt;;</div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">   47</a></span><span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">can_convert_from_bfloat</a> =</div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>    !is_same_v&lt;T, _MLX_BFloat16&gt; &amp;&amp; is_convertible_v&lt;float, T&gt;;</div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span> </div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span><span class="comment">// Bfloat struct</span></div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span> </div>
-<div class="foldopen" id="foldopen00054" data-start="{" data-end="};">
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html">   54</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> {</div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  <span class="comment">// Constructors</span></div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">   57</a></span>  uint16_t <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>;</div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#ab1af7700f5d1e4ab567da6a34fa84668">   58</a></span>  <a class="code hl_function" href="struct___m_l_x___b_float16.html#ab1af7700f5d1e4ab567da6a34fa84668">_MLX_BFloat16</a>() thread = default;</div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#ae5c51644c3bd7cda6b796cb63c60c0b4">   59</a></span>  <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>() threadgroup = default;</div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a21998a3c852d0e0f52681f8b453172bf">   60</a></span>  <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>() device = default;</div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a64d8fc2e2463d7fa19cd3d5dd1ffdae8">   61</a></span>  <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>() constant = default;</div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span> </div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">   63</a></span>  struct <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a> {};</div>
-<div class="foldopen" id="foldopen00064" data-start="{" data-end="}">
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">   64</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a> <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">bits_to_bfloat</a>() {</div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a>();</div>
-<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>  }</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">   41</a></span><span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">can_convert_from_bfloat</a> =</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>    !is_same_v&lt;T, _MLX_BFloat16&gt; &amp;&amp; is_convertible_v&lt;float, T&gt;;</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span> </div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span><span class="comment">// Bfloat struct</span></div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span> </div>
+<div class="foldopen" id="foldopen00048" data-start="{" data-end="};">
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html">   48</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> {</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>  <span class="comment">// Constructors</span></div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">   51</a></span>  uint16_t <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>;</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#ab1af7700f5d1e4ab567da6a34fa84668">   52</a></span>  <a class="code hl_function" href="struct___m_l_x___b_float16.html#ab1af7700f5d1e4ab567da6a34fa84668">_MLX_BFloat16</a>() thread = default;</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#ae5c51644c3bd7cda6b796cb63c60c0b4">   53</a></span>  <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>() threadgroup = default;</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a21998a3c852d0e0f52681f8b453172bf">   54</a></span>  <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>() device = default;</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a64d8fc2e2463d7fa19cd3d5dd1ffdae8">   55</a></span>  <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>() constant = default;</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span> </div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">   57</a></span>  struct <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a> {};</div>
+<div class="foldopen" id="foldopen00058" data-start="{" data-end="}">
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">   58</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a> <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">bits_to_bfloat</a>() {</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a>();</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>  }</div>
 </div>
-<div class="foldopen" id="foldopen00067" data-start="{" data-end="}">
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a50d825f05a162d0ac133ad8b6f3c3112">   67</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#a50d825f05a162d0ac133ad8b6f3c3112">_MLX_BFloat16</a>(uint16_t bits, <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a>)</div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(bits) {}</div>
+<div class="foldopen" id="foldopen00061" data-start="{" data-end="}">
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a50d825f05a162d0ac133ad8b6f3c3112">   61</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#a50d825f05a162d0ac133ad8b6f3c3112">_MLX_BFloat16</a>(uint16_t bits, <a class="code hl_struct" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">bits_to_bfloat_struct</a>)</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(bits) {}</div>
+</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span> </div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>  <span class="comment">// Conversions to bfloat</span></div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span> </div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>  <span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>      <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_to_bfloat&lt;T&gt;&gt;::type&gt;</div>
+<div class="foldopen" id="foldopen00070" data-start="{" data-end="}">
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a24c5736f234e09a0c82b00c7e44cc547">   70</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#a24c5736f234e09a0c82b00c7e44cc547">_MLX_BFloat16</a>(T x) thread</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
 </div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span> </div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  <span class="comment">// Conversions to bfloat</span></div>
 <div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span> </div>
 <div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  <span class="keyword">template</span> &lt;</div>
 <div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>      <span class="keyword">typename</span> T,</div>
 <div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_to_bfloat&lt;T&gt;&gt;::type&gt;</div>
 <div class="foldopen" id="foldopen00076" data-start="{" data-end="}">
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a24c5736f234e09a0c82b00c7e44cc547">   76</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#a24c5736f234e09a0c82b00c7e44cc547">_MLX_BFloat16</a>(T x) thread</div>
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#adeb880f31121c6dc40ce47765c6c7455">   76</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#adeb880f31121c6dc40ce47765c6c7455">_MLX_BFloat16</a>(T x) threadgroup</div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
 </div>
 <div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span> </div>
 <div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>  <span class="keyword">template</span> &lt;</div>
 <div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>      <span class="keyword">typename</span> T,</div>
 <div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_to_bfloat&lt;T&gt;&gt;::type&gt;</div>
 <div class="foldopen" id="foldopen00082" data-start="{" data-end="}">
-<div class="line"><a id="l00082" name="l00082"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#adeb880f31121c6dc40ce47765c6c7455">   82</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#adeb880f31121c6dc40ce47765c6c7455">_MLX_BFloat16</a>(T x) threadgroup</div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#ad2701d003e8fad168c89abc3907c6e53">   82</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#ad2701d003e8fad168c89abc3907c6e53">_MLX_BFloat16</a>(T x) device</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
 </div>
 <div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span> </div>
 <div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>  <span class="keyword">template</span> &lt;</div>
 <div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>      <span class="keyword">typename</span> T,</div>
 <div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_to_bfloat&lt;T&gt;&gt;::type&gt;</div>
 <div class="foldopen" id="foldopen00088" data-start="{" data-end="}">
-<div class="line"><a id="l00088" name="l00088"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#ad2701d003e8fad168c89abc3907c6e53">   88</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#ad2701d003e8fad168c89abc3907c6e53">_MLX_BFloat16</a>(T x) device</div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#aec7fa716fd621ce1843338027bcb0118">   88</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#aec7fa716fd621ce1843338027bcb0118">_MLX_BFloat16</a>(T x) constant</div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
 </div>
 <div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span> </div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>  <span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>      <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_to_bfloat&lt;T&gt;&gt;::type&gt;</div>
-<div class="foldopen" id="foldopen00094" data-start="{" data-end="}">
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#aec7fa716fd621ce1843338027bcb0118">   94</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_function" href="struct___m_l_x___b_float16.html#aec7fa716fd621ce1843338027bcb0118">_MLX_BFloat16</a>(T x) constant</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>      : <a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a>(<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x))) {}</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>  <span class="comment">// Conversions from bfloat</span></div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span> </div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  <span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>      <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
+<div class="foldopen" id="foldopen00097" data-start="{" data-end="}">
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#aa7dfefdf0d15e102d2b8258c9ab01836">   97</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const thread {</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>  }</div>
 </div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span> </div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  <span class="comment">// Conversions from bfloat</span></div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span> </div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>  <span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>      <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
-<div class="foldopen" id="foldopen00103" data-start="{" data-end="}">
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#aa7dfefdf0d15e102d2b8258c9ab01836">  103</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const thread {</div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>  }</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span> </div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>  <span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>      <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
+<div class="foldopen" id="foldopen00104" data-start="{" data-end="}">
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a2546a8afa77e14ed5b3c5da79a281260">  104</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const threadgroup {</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>  }</div>
 </div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span> </div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>  <span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>      <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
-<div class="foldopen" id="foldopen00110" data-start="{" data-end="}">
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a2546a8afa77e14ed5b3c5da79a281260">  110</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const threadgroup {</div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>  }</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span> </div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  <span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>      <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
+<div class="foldopen" id="foldopen00111" data-start="{" data-end="}">
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a1d523f87740fcb852db6ab57896c245a">  111</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const device {</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>  }</div>
 </div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span> </div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>  <span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>      <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
-<div class="foldopen" id="foldopen00117" data-start="{" data-end="}">
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a1d523f87740fcb852db6ab57896c245a">  117</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const device {</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  }</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span> </div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>  <span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>      <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
+<div class="foldopen" id="foldopen00118" data-start="{" data-end="}">
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a95acd29283024d7093a0bc58c9468a0a">  118</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const constant {</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  }</div>
 </div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span> </div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  <span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>      <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>      <span class="keyword">typename</span> = <span class="keyword">typename</span> enable_if&lt;can_convert_from_bfloat&lt;T&gt;&gt;::type&gt;</div>
-<div class="foldopen" id="foldopen00124" data-start="{" data-end="}">
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno"><a class="line" href="struct___m_l_x___b_float16.html#a95acd29283024d7093a0bc58c9468a0a">  124</a></span>  <span class="keyword">constexpr</span> METAL_FUNC <span class="keyword">operator</span> T() const constant {</div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(<a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a>(<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>));</div>
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>  }</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>};</div>
 </div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>};</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span> </div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span><span class="comment">// Bfloat operators</span></div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span> </div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span><span class="comment">// Unary ops</span></div>
+<div class="foldopen" id="foldopen00129" data-start="{" data-end="}">
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">  129</a></span><span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">operator-</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> x) {</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  <span class="keywordflow">return</span> -<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x);</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>}</div>
 </div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span> </div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span><span class="comment">// Bfloat operators</span></div>
 <div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span> </div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span><span class="comment">// Unary ops</span></div>
-<div class="foldopen" id="foldopen00135" data-start="{" data-end="}">
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">  135</a></span><span class="keyword">constexpr</span> METAL_FUNC <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> <a class="code hl_function" href="backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">operator-</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> x) {</div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>  <span class="keywordflow">return</span> -<span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x);</div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>}</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span><span class="comment">// Binary operators</span></div>
+<div class="foldopen" id="foldopen00135" data-start="" data-end="">
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70">  135</a></span><span class="preprocessor">#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype) \</span></div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span><span class="preprocessor">  constexpr METAL_FUNC otype __operator__(atype lhs, btype rhs) {           \</span></div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span><span class="preprocessor">    return static_cast&lt;ctype&gt;(lhs) __op__ static_cast&lt;ctype&gt;(rhs);          \</span></div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span><span class="preprocessor">  }</span></div>
 </div>
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span> </div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span><span class="comment">// Binary operators</span></div>
-<div class="foldopen" id="foldopen00141" data-start="" data-end="">
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70">  141</a></span><span class="preprocessor">#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype) \</span></div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span><span class="preprocessor">  constexpr METAL_FUNC otype __operator__(atype lhs, btype rhs) {           \</span></div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span><span class="preprocessor">    return static_cast&lt;ctype&gt;(lhs) __op__ static_cast&lt;ctype&gt;(rhs);          \</span></div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span><span class="preprocessor">  }</span></div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span> </div>
+<div class="foldopen" id="foldopen00140" data-start="" data-end="">
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594">  140</a></span><span class="preprocessor">#define bfloat_binop_helper(__op__, __operator__, otype, itype, ctype)    \</span></div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span><span class="preprocessor">  constexpr METAL_FUNC otype __operator__(_MLX_BFloat16 lhs, itype rhs) { \</span></div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span><span class="preprocessor">    return static_cast&lt;ctype&gt;(lhs) __op__ static_cast&lt;ctype&gt;(rhs);        \</span></div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span><span class="preprocessor">  }                                                                       \</span></div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span><span class="preprocessor">  constexpr METAL_FUNC otype __operator__(itype lhs, _MLX_BFloat16 rhs) { \</span></div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span><span class="preprocessor">    return static_cast&lt;ctype&gt;(lhs) __op__ static_cast&lt;ctype&gt;(rhs);        \</span></div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span><span class="preprocessor">  }</span></div>
 </div>
-<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span> </div>
-<div class="foldopen" id="foldopen00146" data-start="" data-end="">
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594">  146</a></span><span class="preprocessor">#define bfloat_binop_helper(__op__, __operator__, otype, itype, ctype)    \</span></div>
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span><span class="preprocessor">  constexpr METAL_FUNC otype __operator__(_MLX_BFloat16 lhs, itype rhs) { \</span></div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span><span class="preprocessor">    return static_cast&lt;ctype&gt;(lhs) __op__ static_cast&lt;ctype&gt;(rhs);        \</span></div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span><span class="preprocessor">  }                                                                       \</span></div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span><span class="preprocessor">  constexpr METAL_FUNC otype __operator__(itype lhs, _MLX_BFloat16 rhs) { \</span></div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span><span class="preprocessor">    return static_cast&lt;ctype&gt;(lhs) __op__ static_cast&lt;ctype&gt;(rhs);        \</span></div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span><span class="preprocessor">  }</span></div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span> </div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span><span class="comment">// Arithmetic Operators</span></div>
+<div class="foldopen" id="foldopen00150" data-start="" data-end="">
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">  150</a></span><span class="preprocessor">#define bfloat_binop(_op_, _operator_)                                       \</span></div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span><span class="preprocessor">  bfloat_binop_base(                                                         \</span></div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span><span class="preprocessor">      _op_, _operator_, _MLX_BFloat16, _MLX_BFloat16, _MLX_BFloat16, float); \</span></div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, float, float, float);                \</span></div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, float, half, float);                 \</span></div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int32_t, float);      \</span></div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint32_t, float);     \</span></div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int64_t, float);      \</span></div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);</span></div>
 </div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span> </div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span><span class="comment">// Arithmetic Operators</span></div>
-<div class="foldopen" id="foldopen00156" data-start="" data-end="">
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">  156</a></span><span class="preprocessor">#define bfloat_binop(_op_, _operator_)                                       \</span></div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span><span class="preprocessor">  bfloat_binop_base(                                                         \</span></div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span><span class="preprocessor">      _op_, _operator_, _MLX_BFloat16, _MLX_BFloat16, _MLX_BFloat16, float); \</span></div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, float, float, float);                \</span></div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, float, half, float);                 \</span></div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int32_t, float);      \</span></div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint32_t, float);     \</span></div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int64_t, float);      \</span></div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);</span></div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span> </div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a09c1a797eb7f43742578680899932f50">  160</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(+, <span class="keyword">operator</span>+);</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85">  161</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(-, <span class="keyword">operator</span>-);</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8f06316063fc91747533105f256b55b5">  162</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(*, <span class="keyword">operator</span>*);</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c">  163</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(/, <span class="keyword">operator</span>/);</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span> </div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span><span class="comment">// Comparison ops</span></div>
+<div class="foldopen" id="foldopen00167" data-start="" data-end="">
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">  167</a></span><span class="preprocessor">#define bfloat_compop(__op__, __operator__)                             \</span></div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span><span class="preprocessor">  bfloat_binop_base(                                                    \</span></div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span><span class="preprocessor">      __op__, __operator__, bool, _MLX_BFloat16, _MLX_BFloat16, float); \</span></div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, float, float);        \</span></div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, half, float);         \</span></div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, int32_t, float);      \</span></div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, uint32_t, float);     \</span></div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, int64_t, float);      \</span></div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, uint64_t, float);</span></div>
 </div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span> </div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a09c1a797eb7f43742578680899932f50">  166</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(+, <span class="keyword">operator</span>+);</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85">  167</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(-, <span class="keyword">operator</span>-);</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a8f06316063fc91747533105f256b55b5">  168</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(*, <span class="keyword">operator</span>*);</div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c">  169</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(/, <span class="keyword">operator</span>/);</div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span> </div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span><span class="comment">// Comparison ops</span></div>
-<div class="foldopen" id="foldopen00173" data-start="" data-end="">
-<div class="line"><a id="l00173" name="l00173"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">  173</a></span><span class="preprocessor">#define bfloat_compop(__op__, __operator__)                             \</span></div>
-<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span><span class="preprocessor">  bfloat_binop_base(                                                    \</span></div>
-<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span><span class="preprocessor">      __op__, __operator__, bool, _MLX_BFloat16, _MLX_BFloat16, float); \</span></div>
-<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, float, float);        \</span></div>
-<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, half, float);         \</span></div>
-<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, int32_t, float);      \</span></div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, uint32_t, float);     \</span></div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, int64_t, float);      \</span></div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, uint64_t, float);</span></div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span> </div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57">  177</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;, <span class="keyword">operator</span>&gt;);</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25">  178</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;, <span class="keyword">operator</span>&lt;);</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f">  179</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;=, <span class="keyword">operator</span>&gt;=);</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05">  180</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;=, <span class="keyword">operator</span>&lt;=);</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065">  181</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(==, <span class="keyword">operator</span>==);</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55">  182</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(!=, <span class="keyword">operator</span>!=);</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span> </div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span><span class="preprocessor">#undef bfloat_compop</span></div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span><span class="preprocessor">#undef bfloat_binop_base</span></div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span><span class="preprocessor">#undef bfloat_binop_helper</span></div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span><span class="preprocessor">#undef bfloat_binop</span></div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span> </div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span><span class="comment">// Inplace Operators</span></div>
+<div class="foldopen" id="foldopen00191" data-start="" data-end="">
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d">  191</a></span><span class="preprocessor">#define bfloat_inplace_op_helper(__op__, __operator__, itype, addr_space) \</span></div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span><span class="preprocessor">  constexpr METAL_FUNC addr_space _MLX_BFloat16&amp; __operator__(            \</span></div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span><span class="preprocessor">      addr_space _MLX_BFloat16&amp; lhs, itype rhs) {                         \</span></div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span><span class="preprocessor">    lhs = static_cast&lt;float&gt;(lhs) __op__ static_cast&lt;float&gt;(rhs);         \</span></div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span><span class="preprocessor">    return lhs;                                                           \</span></div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span><span class="preprocessor">  }                                                                       \</span></div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span><span class="preprocessor">  constexpr METAL_FUNC addr_space itype&amp; __operator__(                    \</span></div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span><span class="preprocessor">      addr_space itype&amp; lhs, _MLX_BFloat16 rhs) {                         \</span></div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span><span class="preprocessor">    lhs = static_cast&lt;float&gt;(lhs) __op__ static_cast&lt;float&gt;(rhs);         \</span></div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span><span class="preprocessor">    return lhs;                                                           \</span></div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span><span class="preprocessor">  }</span></div>
 </div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span> </div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57">  183</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;, <span class="keyword">operator</span>&gt;);</div>
-<div class="line"><a id="l00184" name="l00184"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25">  184</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;, <span class="keyword">operator</span>&lt;);</div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f">  185</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;=, <span class="keyword">operator</span>&gt;=);</div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05">  186</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;=, <span class="keyword">operator</span>&lt;=);</div>
-<div class="line"><a id="l00187" name="l00187"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065">  187</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(==, <span class="keyword">operator</span>==);</div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55">  188</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(!=, <span class="keyword">operator</span>!=);</div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span> </div>
-<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span><span class="preprocessor">#undef bfloat_compop</span></div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span><span class="preprocessor">#undef bfloat_binop_base</span></div>
-<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span><span class="preprocessor">#undef bfloat_binop_helper</span></div>
-<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span><span class="preprocessor">#undef bfloat_binop</span></div>
-<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span> </div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span><span class="comment">// Inplace Operators</span></div>
-<div class="foldopen" id="foldopen00197" data-start="" data-end="">
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d">  197</a></span><span class="preprocessor">#define bfloat_inplace_op_helper(__op__, __operator__, itype, addr_space) \</span></div>
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span><span class="preprocessor">  constexpr METAL_FUNC addr_space _MLX_BFloat16&amp; __operator__(            \</span></div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span><span class="preprocessor">      addr_space _MLX_BFloat16&amp; lhs, itype rhs) {                         \</span></div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span><span class="preprocessor">    lhs = static_cast&lt;float&gt;(lhs) __op__ static_cast&lt;float&gt;(rhs);         \</span></div>
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span><span class="preprocessor">    return lhs;                                                           \</span></div>
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span><span class="preprocessor">  }                                                                       \</span></div>
-<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span><span class="preprocessor">  constexpr METAL_FUNC addr_space itype&amp; __operator__(                    \</span></div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span><span class="preprocessor">      addr_space itype&amp; lhs, _MLX_BFloat16 rhs) {                         \</span></div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span><span class="preprocessor">    lhs = static_cast&lt;float&gt;(lhs) __op__ static_cast&lt;float&gt;(rhs);         \</span></div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span><span class="preprocessor">    return lhs;                                                           \</span></div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span><span class="preprocessor">  }</span></div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span> </div>
+<div class="foldopen" id="foldopen00203" data-start="" data-end="">
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">  203</a></span><span class="preprocessor">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype) \</span></div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, itype, device);         \</span></div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, itype, thread);         \</span></div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, itype, threadgroup);</span></div>
 </div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span> </div>
-<div class="foldopen" id="foldopen00209" data-start="" data-end="">
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">  209</a></span><span class="preprocessor">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype) \</span></div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, itype, device);         \</span></div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, itype, thread);         \</span></div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, itype, threadgroup);</span></div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span> </div>
+<div class="foldopen" id="foldopen00208" data-start="" data-end="">
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">  208</a></span><span class="preprocessor">#define bfloat_inplace_op(itype)                             \</span></div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(+, operator+=, itype); \</span></div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(-, operator-=, itype); \</span></div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(*, operator*=, itype); \</span></div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(/, operator/=, itype);</span></div>
 </div>
 <div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span> </div>
-<div class="foldopen" id="foldopen00214" data-start="" data-end="">
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">  214</a></span><span class="preprocessor">#define bfloat_inplace_op(itype)                             \</span></div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(+, operator+=, itype); \</span></div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(-, operator-=, itype); \</span></div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(*, operator*=, itype); \</span></div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span><span class="preprocessor">  bfloat_inplace_op_addr_space_helper(/, operator/=, itype);</span></div>
-</div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span> </div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419">  220</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(<span class="keywordtype">float</span>);</div>
-<div class="line"><a id="l00221" name="l00221"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#ab706af260b61f735b28464877d02137c">  221</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(half);</div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55">  222</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(int16_t);</div>
-<div class="line"><a id="l00223" name="l00223"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be">  223</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(int32_t);</div>
-<div class="line"><a id="l00224" name="l00224"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a917354f77eac26189da8a2f610a00074">  224</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(int64_t);</div>
-<div class="line"><a id="l00225" name="l00225"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7">  225</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(uint16_t);</div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5">  226</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(uint32_t);</div>
-<div class="line"><a id="l00227" name="l00227"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d">  227</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(uint64_t);</div>
-<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span> </div>
-<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span><span class="preprocessor">#undef bfloat_inplace_op_helper</span></div>
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span><span class="preprocessor">#undef bfloat_inplace_op_addr_space_helper</span></div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span><span class="preprocessor">#undef bfloat_inplace_op</span></div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span> </div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span><span class="preprocessor">#define bfloat_inplace_op_helper(__op__, __operator__, addr_space) \</span></div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span><span class="preprocessor">  constexpr METAL_FUNC addr_space _MLX_BFloat16&amp; __operator__(     \</span></div>
-<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span><span class="preprocessor">      addr_space _MLX_BFloat16&amp; lhs, _MLX_BFloat16 rhs) {          \</span></div>
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span><span class="preprocessor">    lhs = static_cast&lt;float&gt;(lhs) __op__ static_cast&lt;float&gt;(rhs);  \</span></div>
-<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span><span class="preprocessor">    return lhs;                                                    \</span></div>
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span><span class="preprocessor">  }</span></div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span> </div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span><span class="preprocessor">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__) \</span></div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, device);         \</span></div>
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, thread);         \</span></div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, threadgroup);</span></div>
-<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span> </div>
-<div class="line"><a id="l00245" name="l00245"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b">  245</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(+, <span class="keyword">operator</span>+=);</div>
-<div class="line"><a id="l00246" name="l00246"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c">  246</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(-, <span class="keyword">operator</span>-=);</div>
-<div class="line"><a id="l00247" name="l00247"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91">  247</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(*, <span class="keyword">operator</span>*=);</div>
-<div class="line"><a id="l00248" name="l00248"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c">  248</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(/, <span class="keyword">operator</span>/=);</div>
-<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span> </div>
-<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span><span class="preprocessor">#undef bfloat_inplace_op_helper</span></div>
-<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span><span class="preprocessor">#undef bfloat_inplace_op_addr_space_helper</span></div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419">  214</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(<span class="keywordtype">float</span>);</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab706af260b61f735b28464877d02137c">  215</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(half);</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55">  216</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(int16_t);</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be">  217</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(int32_t);</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a917354f77eac26189da8a2f610a00074">  218</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(int64_t);</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7">  219</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(uint16_t);</div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5">  220</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(uint32_t);</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d">  221</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(uint64_t);</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span> </div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span><span class="preprocessor">#undef bfloat_inplace_op_helper</span></div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span><span class="preprocessor">#undef bfloat_inplace_op_addr_space_helper</span></div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span><span class="preprocessor">#undef bfloat_inplace_op</span></div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span> </div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span><span class="preprocessor">#define bfloat_inplace_op_helper(__op__, __operator__, addr_space) \</span></div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span><span class="preprocessor">  constexpr METAL_FUNC addr_space _MLX_BFloat16&amp; __operator__(     \</span></div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span><span class="preprocessor">      addr_space _MLX_BFloat16&amp; lhs, _MLX_BFloat16 rhs) {          \</span></div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span><span class="preprocessor">    lhs = static_cast&lt;float&gt;(lhs) __op__ static_cast&lt;float&gt;(rhs);  \</span></div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span><span class="preprocessor">    return lhs;                                                    \</span></div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span><span class="preprocessor">  }</span></div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span> </div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span><span class="preprocessor">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__) \</span></div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, device);         \</span></div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, thread);         \</span></div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span><span class="preprocessor">  bfloat_inplace_op_helper(__op__, __operator__, threadgroup);</span></div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span> </div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b">  239</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(+, <span class="keyword">operator</span>+=);</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c">  240</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(-, <span class="keyword">operator</span>-=);</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91">  241</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(*, <span class="keyword">operator</span>*=);</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c">  242</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a>(/, <span class="keyword">operator</span>/=);</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span> </div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span><span class="preprocessor">#undef bfloat_inplace_op_helper</span></div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span><span class="preprocessor">#undef bfloat_inplace_op_addr_space_helper</span></div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span> </div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span><span class="comment">// Bfloat typedef</span></div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span> </div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">  251</a></span><span class="keyword">typedef</span> <span class="keyword">struct </span><a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>;</div>
 <div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span> </div>
-<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span><span class="comment">// Bfloat typedef</span></div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span><span class="comment">// Bfloat numeric limits</span></div>
 <div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span> </div>
-<div class="line"><a id="l00257" name="l00257"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">  257</a></span><span class="keyword">typedef</span> <span class="keyword">struct </span><a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>;</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span><span class="preprocessor">#pragma METAL internals : enable</span></div>
 <div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span> </div>
-<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span><span class="comment">// Bfloat numeric limits</span></div>
-<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span> </div>
-<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span><span class="preprocessor">#pragma METAL internals : enable</span></div>
-<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span> </div>
-<div class="foldopen" id="foldopen00265" data-start="{" data-end="}">
-<div class="line"><a id="l00265" name="l00265"></a><span class="lineno"><a class="line" href="namespacemetal.html">  265</a></span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a> {</div>
-<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span> </div>
-<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span><span class="keyword">template</span> &lt;&gt;</div>
-<div class="foldopen" id="foldopen00268" data-start="{" data-end="};">
-<div class="line"><a id="l00268" name="l00268"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html">  268</a></span><span class="keyword">struct </span>_numeric_limits_impl&lt;<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>&gt; : _fp_numeric_limits_impl_base {</div>
-<div class="line"><a id="l00269" name="l00269"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879">  269</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> digits = 8;</div>
-<div class="line"><a id="l00270" name="l00270"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b">  270</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> digits10 = 2;</div>
-<div class="line"><a id="l00271" name="l00271"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a8d3905e6f158379a0c52682266e8d0e2">  271</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> max_digits10 = 4;</div>
-<div class="line"><a id="l00272" name="l00272"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aaefa8c2cadd11ac7e22f7b2c5edbd1cd">  272</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> radix = 2;</div>
-<div class="line"><a id="l00273" name="l00273"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a13829f8c7a7c0efdc8946eff5d3c9470">  273</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> min_exponent = -125;</div>
-<div class="line"><a id="l00274" name="l00274"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aeaed172780720e06b8731cef3177e277">  274</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> min_exponent10 = -37;</div>
-<div class="line"><a id="l00275" name="l00275"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61bb136f819fa392c50bdf3c38f3aad2">  275</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> max_exponent = 128;</div>
-<div class="line"><a id="l00276" name="l00276"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a76bfb2deb0e0afc011f77bf5a6d0ed94">  276</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> max_exponent10 = 38;</div>
-<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span> </div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a> {</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span> </div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span><span class="keyword">template</span> &lt;&gt;</div>
+<div class="foldopen" id="foldopen00262" data-start="{" data-end="};">
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html">  262</a></span><span class="keyword">struct </span>_numeric_limits_impl&lt;<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>&gt; : _fp_numeric_limits_impl_base {</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879">  263</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> digits = 8;</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b">  264</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> digits10 = 2;</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a8d3905e6f158379a0c52682266e8d0e2">  265</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> max_digits10 = 4;</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aaefa8c2cadd11ac7e22f7b2c5edbd1cd">  266</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> radix = 2;</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a13829f8c7a7c0efdc8946eff5d3c9470">  267</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> min_exponent = -125;</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aeaed172780720e06b8731cef3177e277">  268</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> min_exponent10 = -37;</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61bb136f819fa392c50bdf3c38f3aad2">  269</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> max_exponent = 128;</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a76bfb2deb0e0afc011f77bf5a6d0ed94">  270</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">int</span> max_exponent10 = 38;</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span> </div>
+<div class="foldopen" id="foldopen00272" data-start="{" data-end="}">
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f">  272</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f">min</a>() {</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x0080, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>  }</div>
+</div>
+<div class="foldopen" id="foldopen00275" data-start="{" data-end="}">
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">  275</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">lowest</a>() {</div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0xFF7F, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>  }</div>
+</div>
 <div class="foldopen" id="foldopen00278" data-start="{" data-end="}">
-<div class="line"><a id="l00278" name="l00278"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f">  278</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f">min</a>() {</div>
-<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x0080, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50">  278</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50">max</a>() {</div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x7F7F, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
 <div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>  }</div>
 </div>
 <div class="foldopen" id="foldopen00281" data-start="{" data-end="}">
-<div class="line"><a id="l00281" name="l00281"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">  281</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">lowest</a>() {</div>
-<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0xFF7F, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a96c4197e3076f0aa9065370b8ece49ca">  281</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a96c4197e3076f0aa9065370b8ece49ca">epsilon</a>() {</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x3C00, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
 <div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>  }</div>
 </div>
 <div class="foldopen" id="foldopen00284" data-start="{" data-end="}">
-<div class="line"><a id="l00284" name="l00284"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50">  284</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50">max</a>() {</div>
-<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x7F7F, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">  284</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">round_error</a>() {</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x3F00, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
 <div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>  }</div>
 </div>
 <div class="foldopen" id="foldopen00287" data-start="{" data-end="}">
-<div class="line"><a id="l00287" name="l00287"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a96c4197e3076f0aa9065370b8ece49ca">  287</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a96c4197e3076f0aa9065370b8ece49ca">epsilon</a>() {</div>
-<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x3C00, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61eb741e7af49046beb863abf023b206">  287</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61eb741e7af49046beb863abf023b206">infinity</a>() {</div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x7F80, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
 <div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>  }</div>
 </div>
 <div class="foldopen" id="foldopen00290" data-start="{" data-end="}">
-<div class="line"><a id="l00290" name="l00290"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">  290</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">round_error</a>() {</div>
-<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x3F00, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">  290</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">quiet_NaN</a>() {</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x7FC0, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
 <div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>  }</div>
 </div>
 <div class="foldopen" id="foldopen00293" data-start="{" data-end="}">
-<div class="line"><a id="l00293" name="l00293"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61eb741e7af49046beb863abf023b206">  293</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61eb741e7af49046beb863abf023b206">infinity</a>() {</div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80">  293</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80">signaling_NaN</a>() {</div>
 <div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x7F80, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
 <div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>  }</div>
 </div>
 <div class="foldopen" id="foldopen00296" data-start="{" data-end="}">
-<div class="line"><a id="l00296" name="l00296"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">  296</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">quiet_NaN</a>() {</div>
-<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x7FC0, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed">  296</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed">denorm_min</a>() {</div>
+<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x0001, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
 <div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>  }</div>
 </div>
-<div class="foldopen" id="foldopen00299" data-start="{" data-end="}">
-<div class="line"><a id="l00299" name="l00299"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80">  299</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80">signaling_NaN</a>() {</div>
-<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x7F80, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
-<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>  }</div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>};</div>
 </div>
-<div class="foldopen" id="foldopen00302" data-start="{" data-end="}">
-<div class="line"><a id="l00302" name="l00302"></a><span class="lineno"><a class="line" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed">  302</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed">denorm_min</a>() {</div>
-<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(0x0001, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
-<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>  }</div>
-</div>
-<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>};</div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span> </div>
+<div class="foldopen" id="foldopen00301" data-start="{" data-end="}">
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno"><a class="line" href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">  301</a></span>METAL_FUNC <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">isnan</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> x) {</div>
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>  <span class="keywordflow">return</span> x != x;</div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>}</div>
 </div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span> </div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>} <span class="comment">// namespace metal</span></div>
 <div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span> </div>
-<div class="foldopen" id="foldopen00307" data-start="{" data-end="}">
-<div class="line"><a id="l00307" name="l00307"></a><span class="lineno"><a class="line" href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">  307</a></span>METAL_FUNC <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">isnan</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> x) {</div>
-<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>  <span class="keywordflow">return</span> x != x;</div>
-<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>}</div>
+<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span><span class="preprocessor">#pragma METAL internals : disable</span></div>
+<div class="foldopen" id="foldopen00308" data-start="{" data-end="}">
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">  308</a></span><span class="keyword">inline</span> uint16_t <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a>(<span class="keyword">const</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> x) {</div>
+<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>  <span class="keywordflow">return</span> x.<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>;</div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>}</div>
 </div>
-<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span> </div>
-<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>} <span class="comment">// namespace metal</span></div>
+<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span> </div>
+<div class="foldopen" id="foldopen00312" data-start="{" data-end="}">
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">  312</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a>(<span class="keyword">const</span> uint16_t x) {</div>
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(x, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>());</div>
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>}</div>
 </div>
-<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span> </div>
-<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span><span class="preprocessor">#pragma METAL internals : disable</span></div>
-<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span> </div>
-<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span><span class="preprocessor">#endif</span></div>
-<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span> </div>
-<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span><span class="preprocessor">#include &quot;<a class="code" href="bf16__math_8h.html">mlx/backend/metal/kernels/bf16_math.h</a>&quot;</span></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a31ce5e8e860295fa236e0d4b0befeae1"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a></div><div class="ttdeci">constexpr METAL_FUNC uint16_t float_to_bfloat_bits(float x)</div><div class="ttdef"><b>Definition</b> bf16.h:19</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a330a0883503cb640f1cf628a7ca50239"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a></div><div class="ttdeci">#define bfloat_compop(__op__, __operator__)</div><div class="ttdef"><b>Definition</b> bf16.h:173</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a3b33ae338dc4f223d0f3c748de07bad1"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a></div><div class="ttdeci">constexpr METAL_FUNC float bfloat_bits_to_float(uint16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:35</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a4ac82467fbc674e990090f482b9c1e5c"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a></div><div class="ttdeci">#define bfloat_inplace_op(itype)</div><div class="ttdef"><b>Definition</b> bf16.h:214</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a6aedc8d6d0980134ac69b96f22d9a855"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">operator-</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16 operator-(_MLX_BFloat16 x)</div><div class="ttdef"><b>Definition</b> bf16.h:135</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a7694892a131c0e31e5153c088cccb707"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a></div><div class="ttdeci">#define bfloat_binop(_op_, _operator_)</div><div class="ttdef"><b>Definition</b> bf16.h:156</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a7782de82393104dd4ad754ce3b316e82"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></div><div class="ttdeci">struct _MLX_BFloat16 bfloat16_t</div><div class="ttdef"><b>Definition</b> bf16.h:257</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a7e5992f7fcd8f2cdadcc1d7f6aefbb5a"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">can_convert_from_bfloat</a></div><div class="ttdeci">static constexpr constant bool can_convert_from_bfloat</div><div class="ttdef"><b>Definition</b> bf16.h:47</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_aae77817d261452b2f001f4d947a3e04e"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">can_convert_to_bfloat</a></div><div class="ttdeci">static constexpr constant bool can_convert_to_bfloat</div><div class="ttdef"><b>Definition</b> bf16.h:43</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_af30a2cbd2c3415516203b83bd21872f8"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a></div><div class="ttdeci">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype)</div><div class="ttdef"><b>Definition</b> bf16.h:209</div></div>
-<div class="ttc" id="abf16__math_8h_html"><div class="ttname"><a href="bf16__math_8h.html">bf16_math.h</a></div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a83320ba983d90dd1fa5847b6940dc0bb"><div class="ttname"><a href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">metal::isnan</a></div><div class="ttdeci">METAL_FUNC bool isnan(_MLX_BFloat16 x)</div><div class="ttdef"><b>Definition</b> bf16.h:307</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_1_1bits__to__bfloat__struct_html"><div class="ttname"><a href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">_MLX_BFloat16::bits_to_bfloat_struct</a></div><div class="ttdef"><b>Definition</b> bf16.h:63</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_a24c5736f234e09a0c82b00c7e44cc547"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a24c5736f234e09a0c82b00c7e44cc547">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) thread</div><div class="ttdef"><b>Definition</b> bf16.h:76</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_a4113263b63e3757ea8334cc4f0f5c3c8"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">_MLX_BFloat16::bits_</a></div><div class="ttdeci">uint16_t bits_</div><div class="ttdef"><b>Definition</b> bf16.h:57</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_a50d825f05a162d0ac133ad8b6f3c3112"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a50d825f05a162d0ac133ad8b6f3c3112">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(uint16_t bits, bits_to_bfloat_struct)</div><div class="ttdef"><b>Definition</b> bf16.h:67</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_a91ccb774773b65f8d4c1aea3f1c6e1ca"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a></div><div class="ttdeci">static constexpr METAL_FUNC bits_to_bfloat_struct bits_to_bfloat()</div><div class="ttdef"><b>Definition</b> bf16.h:64</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a1420e191fa60d707dce327d0938e3088"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a></div><div class="ttdeci">uint16_t bfloat16_to_uint16(const bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:308</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a31ce5e8e860295fa236e0d4b0befeae1"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">float_to_bfloat_bits</a></div><div class="ttdeci">constexpr METAL_FUNC uint16_t float_to_bfloat_bits(float x)</div><div class="ttdef"><b>Definition</b> bf16.h:13</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a330a0883503cb640f1cf628a7ca50239"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a></div><div class="ttdeci">#define bfloat_compop(__op__, __operator__)</div><div class="ttdef"><b>Definition</b> bf16.h:167</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a3b33ae338dc4f223d0f3c748de07bad1"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bfloat_bits_to_float</a></div><div class="ttdeci">constexpr METAL_FUNC float bfloat_bits_to_float(uint16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:29</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a4ac82467fbc674e990090f482b9c1e5c"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a></div><div class="ttdeci">#define bfloat_inplace_op(itype)</div><div class="ttdef"><b>Definition</b> bf16.h:208</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a6aedc8d6d0980134ac69b96f22d9a855"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">operator-</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16 operator-(_MLX_BFloat16 x)</div><div class="ttdef"><b>Definition</b> bf16.h:129</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a7694892a131c0e31e5153c088cccb707"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a></div><div class="ttdeci">#define bfloat_binop(_op_, _operator_)</div><div class="ttdef"><b>Definition</b> bf16.h:150</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a7e5992f7fcd8f2cdadcc1d7f6aefbb5a"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">can_convert_from_bfloat</a></div><div class="ttdeci">static constexpr constant bool can_convert_from_bfloat</div><div class="ttdef"><b>Definition</b> bf16.h:41</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a8d066e48cf3e2a0583c71816fa40f7f4"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a></div><div class="ttdeci">bfloat16_t uint16_to_bfloat16(const uint16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:312</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_aae77817d261452b2f001f4d947a3e04e"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">can_convert_to_bfloat</a></div><div class="ttdeci">static constexpr constant bool can_convert_to_bfloat</div><div class="ttdef"><b>Definition</b> bf16.h:37</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_af30a2cbd2c3415516203b83bd21872f8"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bfloat_inplace_op_addr_space_helper</a></div><div class="ttdeci">#define bfloat_inplace_op_addr_space_helper(__op__, __operator__, itype)</div><div class="ttdef"><b>Definition</b> bf16.h:203</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a83320ba983d90dd1fa5847b6940dc0bb"><div class="ttname"><a href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">metal::isnan</a></div><div class="ttdeci">METAL_FUNC bool isnan(_MLX_BFloat16 x)</div><div class="ttdef"><b>Definition</b> bf16.h:301</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_1_1bits__to__bfloat__struct_html"><div class="ttname"><a href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">_MLX_BFloat16::bits_to_bfloat_struct</a></div><div class="ttdef"><b>Definition</b> bf16.h:57</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_a24c5736f234e09a0c82b00c7e44cc547"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a24c5736f234e09a0c82b00c7e44cc547">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) thread</div><div class="ttdef"><b>Definition</b> bf16.h:70</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_a4113263b63e3757ea8334cc4f0f5c3c8"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">_MLX_BFloat16::bits_</a></div><div class="ttdeci">uint16_t bits_</div><div class="ttdef"><b>Definition</b> bf16.h:51</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_a50d825f05a162d0ac133ad8b6f3c3112"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a50d825f05a162d0ac133ad8b6f3c3112">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(uint16_t bits, bits_to_bfloat_struct)</div><div class="ttdef"><b>Definition</b> bf16.h:61</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_a91ccb774773b65f8d4c1aea3f1c6e1ca"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a></div><div class="ttdeci">static constexpr METAL_FUNC bits_to_bfloat_struct bits_to_bfloat()</div><div class="ttdef"><b>Definition</b> bf16.h:58</div></div>
 <div class="ttc" id="astruct___m_l_x___b_float16_html_ab1af7700f5d1e4ab567da6a34fa84668"><div class="ttname"><a href="struct___m_l_x___b_float16.html#ab1af7700f5d1e4ab567da6a34fa84668">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">_MLX_BFloat16() thread=default</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_ad2701d003e8fad168c89abc3907c6e53"><div class="ttname"><a href="struct___m_l_x___b_float16.html#ad2701d003e8fad168c89abc3907c6e53">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) device</div><div class="ttdef"><b>Definition</b> bf16.h:88</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_adeb880f31121c6dc40ce47765c6c7455"><div class="ttname"><a href="struct___m_l_x___b_float16.html#adeb880f31121c6dc40ce47765c6c7455">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) threadgroup</div><div class="ttdef"><b>Definition</b> bf16.h:82</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_aec7fa716fd621ce1843338027bcb0118"><div class="ttname"><a href="struct___m_l_x___b_float16.html#aec7fa716fd621ce1843338027bcb0118">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) const ant</div><div class="ttdef"><b>Definition</b> bf16.h:94</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a61eb741e7af49046beb863abf023b206"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61eb741e7af49046beb863abf023b206">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::infinity</a></div><div class="ttdeci">static constexpr bfloat16_t infinity()</div><div class="ttdef"><b>Definition</b> bf16.h:293</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a6a9dbcba4dd79cad50876dda506b9eed"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::denorm_min</a></div><div class="ttdeci">static constexpr bfloat16_t denorm_min()</div><div class="ttdef"><b>Definition</b> bf16.h:302</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a92320d40a58218e40cc414986ac95c50"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::max</a></div><div class="ttdeci">static constexpr bfloat16_t max()</div><div class="ttdef"><b>Definition</b> bf16.h:284</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a96c4197e3076f0aa9065370b8ece49ca"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a96c4197e3076f0aa9065370b8ece49ca">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::epsilon</a></div><div class="ttdeci">static constexpr bfloat16_t epsilon()</div><div class="ttdef"><b>Definition</b> bf16.h:287</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_ad1f76a43c7d51a3765174aa6e0dd9f80"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::signaling_NaN</a></div><div class="ttdeci">static constexpr bfloat16_t signaling_NaN()</div><div class="ttdef"><b>Definition</b> bf16.h:299</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_adaed80031f5ca0ff69d30ec4c5d0c98f"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::min</a></div><div class="ttdeci">static constexpr bfloat16_t min()</div><div class="ttdef"><b>Definition</b> bf16.h:278</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_ae81c58b8223e504965183c99d19a2116"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::lowest</a></div><div class="ttdeci">static constexpr bfloat16_t lowest()</div><div class="ttdef"><b>Definition</b> bf16.h:281</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_aebeb07c01984be246bc2d1b8f8e4ac7b"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::quiet_NaN</a></div><div class="ttdeci">static constexpr bfloat16_t quiet_NaN()</div><div class="ttdef"><b>Definition</b> bf16.h:296</div></div>
-<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_afa223448fa4f04c1113a85345dd720c3"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::round_error</a></div><div class="ttdeci">static constexpr bfloat16_t round_error()</div><div class="ttdef"><b>Definition</b> bf16.h:290</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_ad2701d003e8fad168c89abc3907c6e53"><div class="ttname"><a href="struct___m_l_x___b_float16.html#ad2701d003e8fad168c89abc3907c6e53">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) device</div><div class="ttdef"><b>Definition</b> bf16.h:82</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_adeb880f31121c6dc40ce47765c6c7455"><div class="ttname"><a href="struct___m_l_x___b_float16.html#adeb880f31121c6dc40ce47765c6c7455">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) threadgroup</div><div class="ttdef"><b>Definition</b> bf16.h:76</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_aec7fa716fd621ce1843338027bcb0118"><div class="ttname"><a href="struct___m_l_x___b_float16.html#aec7fa716fd621ce1843338027bcb0118">_MLX_BFloat16::_MLX_BFloat16</a></div><div class="ttdeci">constexpr METAL_FUNC _MLX_BFloat16(T x) const ant</div><div class="ttdef"><b>Definition</b> bf16.h:88</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a61eb741e7af49046beb863abf023b206"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61eb741e7af49046beb863abf023b206">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::infinity</a></div><div class="ttdeci">static constexpr bfloat16_t infinity()</div><div class="ttdef"><b>Definition</b> bf16.h:287</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a6a9dbcba4dd79cad50876dda506b9eed"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::denorm_min</a></div><div class="ttdeci">static constexpr bfloat16_t denorm_min()</div><div class="ttdef"><b>Definition</b> bf16.h:296</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a92320d40a58218e40cc414986ac95c50"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::max</a></div><div class="ttdeci">static constexpr bfloat16_t max()</div><div class="ttdef"><b>Definition</b> bf16.h:278</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_a96c4197e3076f0aa9065370b8ece49ca"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a96c4197e3076f0aa9065370b8ece49ca">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::epsilon</a></div><div class="ttdeci">static constexpr bfloat16_t epsilon()</div><div class="ttdef"><b>Definition</b> bf16.h:281</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_ad1f76a43c7d51a3765174aa6e0dd9f80"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::signaling_NaN</a></div><div class="ttdeci">static constexpr bfloat16_t signaling_NaN()</div><div class="ttdef"><b>Definition</b> bf16.h:293</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_adaed80031f5ca0ff69d30ec4c5d0c98f"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::min</a></div><div class="ttdeci">static constexpr bfloat16_t min()</div><div class="ttdef"><b>Definition</b> bf16.h:272</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_ae81c58b8223e504965183c99d19a2116"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::lowest</a></div><div class="ttdeci">static constexpr bfloat16_t lowest()</div><div class="ttdef"><b>Definition</b> bf16.h:275</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_aebeb07c01984be246bc2d1b8f8e4ac7b"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::quiet_NaN</a></div><div class="ttdeci">static constexpr bfloat16_t quiet_NaN()</div><div class="ttdef"><b>Definition</b> bf16.h:290</div></div>
+<div class="ttc" id="astructmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4_html_afa223448fa4f04c1113a85345dd720c3"><div class="ttname"><a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">metal::_numeric_limits_impl&lt; bfloat16_t &gt;::round_error</a></div><div class="ttdeci">static constexpr bfloat16_t round_error()</div><div class="ttdef"><b>Definition</b> bf16.h:284</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/backend_2metal_2kernels_2metal__3__1_2bf16_8h.html b/docs/build/html/backend_2metal_2kernels_2metal__3__1_2bf16_8h.html
new file mode 100644
index 000000000..628c1311f
--- /dev/null
+++ b/docs/build/html/backend_2metal_2kernels_2metal__3__1_2bf16_8h.html
@@ -0,0 +1,186 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/metal_3_1/bf16.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_83367edb60e23ad59b1a493d8c883287.html">metal_3_1</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#typedef-members">Typedefs</a> &#124;
+<a href="#func-members">Functions</a>  </div>
+  <div class="headertitle"><div class="title">bf16.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<div class="textblock"><code>#include &lt;metal_stdlib&gt;</code><br />
+</div>
+<p><a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="typedef-members" name="typedef-members"></a>
+Typedefs</h2></td></tr>
+<tr class="memitem:a58e15a77da988b9104fee00cdf8b280e" id="r_a58e15a77da988b9104fee00cdf8b280e"><td class="memItemLeft" align="right" valign="top">typedef bfloat&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a58e15a77da988b9104fee00cdf8b280e">bfloat16_t</a></td></tr>
+<tr class="separator:a58e15a77da988b9104fee00cdf8b280e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
+Functions</h2></td></tr>
+<tr class="memitem:a1420e191fa60d707dce327d0938e3088" id="r_a1420e191fa60d707dce327d0938e3088"><td class="memItemLeft" align="right" valign="top">uint16_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a> (const <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a1420e191fa60d707dce327d0938e3088"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8d066e48cf3e2a0583c71816fa40f7f4" id="r_a8d066e48cf3e2a0583c71816fa40f7f4"><td class="memItemLeft" align="right" valign="top"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a> (const uint16_t x)</td></tr>
+<tr class="separator:a8d066e48cf3e2a0583c71816fa40f7f4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Typedef Documentation</h2>
+<a id="a58e15a77da988b9104fee00cdf8b280e" name="a58e15a77da988b9104fee00cdf8b280e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a58e15a77da988b9104fee00cdf8b280e">&#9670;&#160;</a></span>bfloat16_t</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">typedef bfloat <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<h2 class="groupheader">Function Documentation</h2>
+<a id="a1420e191fa60d707dce327d0938e3088" name="a1420e191fa60d707dce327d0938e3088"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1420e191fa60d707dce327d0938e3088">&#9670;&#160;</a></span>bfloat16_to_uint16()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">uint16_t bfloat16_to_uint16 </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a8d066e48cf3e2a0583c71816fa40f7f4" name="a8d066e48cf3e2a0583c71816fa40f7f4"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a8d066e48cf3e2a0583c71816fa40f7f4">&#9670;&#160;</a></span>uint16_to_bfloat16()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> uint16_to_bfloat16 </td>
+          <td>(</td>
+          <td class="paramtype">const uint16_t</td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html b/docs/build/html/backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html
new file mode 100644
index 000000000..8f9809902
--- /dev/null
+++ b/docs/build/html/backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html
@@ -0,0 +1,126 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/metal_3_1/bf16.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_83367edb60e23ad59b1a493d8c883287.html">metal_3_1</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">bf16.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2023 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
+<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &lt;metal_stdlib&gt;</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
+<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="keyword">using namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a>;</div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span> </div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a58e15a77da988b9104fee00cdf8b280e">    9</a></span><span class="keyword">typedef</span> bfloat <a class="code hl_typedef" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a58e15a77da988b9104fee00cdf8b280e">bfloat16_t</a>;</div>
+<div class="foldopen" id="foldopen00010" data-start="{" data-end="}">
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">   10</a></span><span class="keyword">inline</span> uint16_t <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a>(<span class="keyword">const</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> x) {</div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span>  <span class="keywordflow">return</span> as_type&lt;uint16_t&gt;(x);</div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span>}</div>
+</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span> </div>
+<div class="foldopen" id="foldopen00014" data-start="{" data-end="}">
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">   14</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a>(<span class="keyword">const</span> uint16_t x) {</div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>  <span class="keywordflow">return</span> as_type&lt;bfloat16_t&gt;(x);</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>}</div>
+</div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__1_2bf16_8h_html_a1420e191fa60d707dce327d0938e3088"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a></div><div class="ttdeci">uint16_t bfloat16_to_uint16(const bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:10</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__1_2bf16_8h_html_a58e15a77da988b9104fee00cdf8b280e"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a58e15a77da988b9104fee00cdf8b280e">bfloat16_t</a></div><div class="ttdeci">bfloat bfloat16_t</div><div class="ttdef"><b>Definition</b> bf16.h:9</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__1_2bf16_8h_html_a8d066e48cf3e2a0583c71816fa40f7f4"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a></div><div class="ttdeci">bfloat16_t uint16_to_bfloat16(const uint16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:14</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h.html b/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h.html
index e22c96954..7feb28464 100644
--- a/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h.html
+++ b/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h.html
@@ -157,7 +157,7 @@ Variables</h2></td></tr>
 <div class="line">    <span class="keywordflow">return</span> val;                                                          \</div>
 <div class="line">  }</div>
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3"><div class="ttname"><a href="#a515b75d563a93d3c09ee677948dc83e3">simd_size</a></div><div class="ttdeci">static constant constexpr const uint8_t simd_size</div><div class="ttdef"><b>Definition</b> ops.h:22</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_aba6279624b1d30c525efee856a222b5c"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c">simd_shuffle_down</a></div><div class="ttdeci">uint64_t simd_shuffle_down(uint64_t data, uint16_t delta)</div><div class="ttdef"><b>Definition</b> utils.h:305</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_aba6279624b1d30c525efee856a222b5c"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c">simd_shuffle_down</a></div><div class="ttdeci">uint64_t simd_shuffle_down(uint64_t data, uint16_t delta)</div><div class="ttdef"><b>Definition</b> utils.h:346</div></div>
 </div><!-- fragment -->
 </div>
 </div>
diff --git a/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h_source.html b/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h_source.html
index 3bd0aa0c8..35b73c8b1 100644
--- a/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h_source.html
+++ b/docs/build/html/backend_2metal_2kernels_2reduction_2ops_8h_source.html
@@ -325,7 +325,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3"><div class="ttname"><a href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a></div><div class="ttdeci">static constant constexpr const uint8_t simd_size</div><div class="ttdef"><b>Definition</b> ops.h:22</div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_acacf99e0ba629ed062ccc3c2eba89b05"><div class="ttname"><a href="backend_2metal_2kernels_2reduction_2ops_8h.html#acacf99e0ba629ed062ccc3c2eba89b05">DEFINE_SIMD_REDUCE</a></div><div class="ttdeci">#define DEFINE_SIMD_REDUCE()</div><div class="ttdef"><b>Definition</b> ops.h:8</div></div>
 <div class="ttc" id="astruct_and_html"><div class="ttname"><a href="struct_and.html">And</a></div><div class="ttdef"><b>Definition</b> ops.h:37</div></div>
-<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:17</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
 <div class="ttc" id="astruct_max_html"><div class="ttname"><a href="struct_max.html">Max</a></div><div class="ttdef"><b>Definition</b> ops.h:185</div></div>
 <div class="ttc" id="astruct_max_html_adfee65117dbf49404241861d374b9c4d"><div class="ttname"><a href="struct_max.html#adfee65117dbf49404241861d374b9c4d">Max::a</a></div><div class="ttdeci">b a</div><div class="ttdef"><b>Definition</b> ops.h:202</div></div>
 <div class="ttc" id="astruct_min_html"><div class="ttname"><a href="struct_min.html">Min</a></div><div class="ttdef"><b>Definition</b> ops.h:163</div></div>
diff --git a/docs/build/html/backend_2metal_2kernels_2steel_2attn_2transforms_8h.html b/docs/build/html/backend_2metal_2kernels_2steel_2attn_2transforms_8h.html
new file mode 100644
index 000000000..9e8b79ebe
--- /dev/null
+++ b/docs/build/html/backend_2metal_2kernels_2steel_2attn_2transforms_8h.html
@@ -0,0 +1,128 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/transforms.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#nested-classes">Classes</a> &#124;
+<a href="#namespaces">Namespaces</a>  </div>
+  <div class="headertitle"><div class="title">transforms.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<div class="textblock"><code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2utils_8h_source.html">mlx/backend/metal/kernels/steel/utils.h</a>&quot;</code><br />
+</div>
+<p><a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
+Classes</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone&lt; OutT, InT &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">mlx::steel::AccumHelper&lt; T &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html">mlx::steel::BlockSwizzle</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="namespaces" name="namespaces"></a>
+Namespaces</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx.html">mlx</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html b/docs/build/html/backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html
new file mode 100644
index 000000000..38d8120ab
--- /dev/null
+++ b/docs/build/html/backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html
@@ -0,0 +1,201 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/transforms.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">transforms.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
+<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2steel_2utils_8h.html">mlx/backend/metal/kernels/steel/utils.h</a>&quot;</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="comment">// Transforms and Epilogues</span></div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span> </div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx.html">mlx</a> {</div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><span class="keyword">namespace </span>steel {</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span> </div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> OutT, <span class="keyword">typename</span> InT&gt;</div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span><span class="keyword">struct </span>TransformNone {</div>
+<div class="foldopen" id="foldopen00016" data-start="{" data-end="}">
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">   16</a></span>  <span class="keyword">static</span> METAL_FUNC OutT <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">apply</a>(InT x) {</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>OutT<span class="keyword">&gt;</span>(x);</div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>  }</div>
+</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span> </div>
+<div class="foldopen" id="foldopen00020" data-start="{" data-end="}">
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90">   20</a></span>  <span class="keyword">static</span> METAL_FUNC OutT <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90">apply</a>(InT x, OutT) {</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>OutT<span class="keyword">&gt;</span>(x);</div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>  }</div>
+</div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>};</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span> </div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> OutT, <span class="keyword">typename</span> InT&gt;</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span><span class="keyword">struct </span>TransformAdd {</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae">   27</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae">TransformAdd</a>(<span class="keyword">const</span> <span class="keywordtype">float</span>, <span class="keyword">const</span> <span class="keywordtype">float</span>) {}</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
+<div class="foldopen" id="foldopen00029" data-start="{" data-end="}">
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">   29</a></span>  <span class="keyword">static</span> METAL_FUNC OutT <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">apply</a>(InT x) {</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>OutT<span class="keyword">&gt;</span>(x);</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  }</div>
+</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span> </div>
+<div class="foldopen" id="foldopen00033" data-start="{" data-end="}">
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19">   33</a></span>  <span class="keyword">static</span> METAL_FUNC OutT <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19">apply</a>(InT x, OutT c) {</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>OutT<span class="keyword">&gt;</span>(x) + c;</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  }</div>
+</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>};</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span> </div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> OutT, <span class="keyword">typename</span> InT&gt;</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span><span class="keyword">struct </span>TransformAxpby {</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">   40</a></span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">alpha</a>;</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">   41</a></span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">beta</a>;</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span> </div>
+<div class="foldopen" id="foldopen00043" data-start="{" data-end="}">
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">   43</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">TransformAxpby</a>(<span class="keyword">const</span> <span class="keywordtype">float</span> alpha_, <span class="keyword">const</span> <span class="keywordtype">float</span> beta_)</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>      : <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">alpha</a>(alpha_), <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">beta</a>(beta_) {}</div>
+</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
+<div class="foldopen" id="foldopen00046" data-start="{" data-end="}">
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">   46</a></span>  <span class="keyword">static</span> METAL_FUNC OutT <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">apply</a>(InT x) {</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>OutT<span class="keyword">&gt;</span>(x);</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>  }</div>
+</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span> </div>
+<div class="foldopen" id="foldopen00050" data-start="{" data-end="}">
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba">   50</a></span>  METAL_FUNC OutT <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba">apply</a>(InT x, OutT c)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span>OutT<span class="keyword">&gt;</span>(x * <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">alpha</a> + (<a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">beta</a> * c));</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>  }</div>
+</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>};</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span> </div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span><span class="keyword">struct </span>AccumHelper {</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">   57</a></span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> <a class="code hl_typedef" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">accum_type</a>;</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>};</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span> </div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_swizzle.html">BlockSwizzle</a> {</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>  <span class="keyword">static</span> METAL_FUNC int2</div>
+<div class="foldopen" id="foldopen00062" data-start="{" data-end="}">
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760">   62</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760">swizzle</a>(uint3 tid [[threadgroup_position_in_grid]], <span class="keyword">const</span> <span class="keywordtype">int</span> swizzle_log) {</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> tid_x = (tid.x) &gt;&gt; swizzle_log;</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> tid_y =</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>        ((tid.y) &lt;&lt; swizzle_log) + ((tid.x) &amp; ((1 &lt;&lt; swizzle_log) - 1));</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>    <span class="keywordflow">return</span> int2(tid_x, tid_y);</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>  }</div>
+</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>};</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span> </div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>} <span class="comment">// namespace steel</span></div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>} <span class="comment">// namespace mlx</span></div>
+<div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html">utils.h</a></div></div>
+<div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_accum_helper_html_ae52abf69e7ba6af1a73d65d57182ed26"><div class="ttname"><a href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">mlx::steel::AccumHelper::accum_type</a></div><div class="ttdeci">float accum_type</div><div class="ttdef"><b>Definition</b> transforms.h:57</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_swizzle_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_swizzle.html">mlx::steel::BlockSwizzle</a></div><div class="ttdef"><b>Definition</b> transforms.h:60</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_swizzle_html_a98e558d63826d2aaa06d3e65a06d2760"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760">mlx::steel::BlockSwizzle::swizzle</a></div><div class="ttdeci">static METAL_FUNC int2 swizzle(uint3 tid, const int swizzle_log)</div><div class="ttdef"><b>Definition</b> transforms.h:62</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_add_html_a4923b0059d88099b2739f2cf0273ea19"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19">mlx::steel::TransformAdd::apply</a></div><div class="ttdeci">static METAL_FUNC OutT apply(InT x, OutT c)</div><div class="ttdef"><b>Definition</b> transforms.h:33</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_add_html_a7c1b7292910b74281e5296b3dac157ae"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae">mlx::steel::TransformAdd::TransformAdd</a></div><div class="ttdeci">TransformAdd(const float, const float)</div><div class="ttdef"><b>Definition</b> transforms.h:27</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_add_html_afbb688d84443fd622b4dd2768cfe0acf"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">mlx::steel::TransformAdd::apply</a></div><div class="ttdeci">static METAL_FUNC OutT apply(InT x)</div><div class="ttdef"><b>Definition</b> transforms.h:29</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_axpby_html_a14ad48b0189d6bdde06c66f1b567ae87"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">mlx::steel::TransformAxpby::apply</a></div><div class="ttdeci">static METAL_FUNC OutT apply(InT x)</div><div class="ttdef"><b>Definition</b> transforms.h:46</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_axpby_html_a5fc726f085bafd1acbc391886f7fb8b6"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">mlx::steel::TransformAxpby::beta</a></div><div class="ttdeci">const float beta</div><div class="ttdef"><b>Definition</b> transforms.h:41</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_axpby_html_aaf3a45e25d7abf7a34b48cc612e631ba"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba">mlx::steel::TransformAxpby::apply</a></div><div class="ttdeci">METAL_FUNC OutT apply(InT x, OutT c) const</div><div class="ttdef"><b>Definition</b> transforms.h:50</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_axpby_html_ab3223b49c6b3b7f89eba91aeaff9dcff"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">mlx::steel::TransformAxpby::alpha</a></div><div class="ttdeci">const float alpha</div><div class="ttdef"><b>Definition</b> transforms.h:40</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_axpby_html_ad7d11c53de13646b725921391d15bbe9"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">mlx::steel::TransformAxpby::TransformAxpby</a></div><div class="ttdeci">TransformAxpby(const float alpha_, const float beta_)</div><div class="ttdef"><b>Definition</b> transforms.h:43</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_none_html_a84daa89be5b3348b5715bf8c5a01da75"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">mlx::steel::TransformNone::apply</a></div><div class="ttdeci">static METAL_FUNC OutT apply(InT x)</div><div class="ttdef"><b>Definition</b> transforms.h:16</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_transform_none_html_ae4c397038f386b13eaa386638a0fce90"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90">mlx::steel::TransformNone::apply</a></div><div class="ttdeci">static METAL_FUNC OutT apply(InT x, OutT)</div><div class="ttdef"><b>Definition</b> transforms.h:20</div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html b/docs/build/html/backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html
index e5a0a3b9d..c8295ad7f 100644
--- a/docs/build/html/backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html
+++ b/docs/build/html/backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html
@@ -141,8 +141,8 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> OutT, <span class="keyword">typename</span> InT&gt;</div>
 <div class="foldopen" id="foldopen00039" data-start="{" data-end="};">
 <div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html">   39</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_transform_axpby.html">TransformAxpby</a> {</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">   40</a></span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">alpha</a>;</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">   41</a></span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">beta</a>;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">alpha</a>;</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">beta</a>;</div>
 <div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span> </div>
 <div class="foldopen" id="foldopen00043" data-start="{" data-end="}">
 <div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">   43</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">TransformAxpby</a>(<span class="keyword">const</span> <span class="keywordtype">float</span> alpha_, <span class="keyword">const</span> <span class="keywordtype">float</span> beta_)</div>
@@ -166,7 +166,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
 <div class="foldopen" id="foldopen00056" data-start="{" data-end="};">
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_accum_helper.html">   56</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_accum_helper.html">AccumHelper</a> {</div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">   57</a></span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> <a class="code hl_typedef" href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">accum_type</a>;</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> <a class="code hl_typedef" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">accum_type</a>;</div>
 <div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>};</div>
 </div>
 <div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span> </div>
@@ -189,7 +189,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html">utils.h</a></div></div>
 <div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_accum_helper_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_accum_helper.html">mlx::steel::AccumHelper</a></div><div class="ttdef"><b>Definition</b> transforms.h:56</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_accum_helper_html_ab594958b88746f759aa7ca573f1903da"><div class="ttname"><a href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">mlx::steel::AccumHelper::accum_type</a></div><div class="ttdeci">float accum_type</div><div class="ttdef"><b>Definition</b> transforms.h:57</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_accum_helper_html_ae52abf69e7ba6af1a73d65d57182ed26"><div class="ttname"><a href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">mlx::steel::AccumHelper::accum_type</a></div><div class="ttdeci">float accum_type</div><div class="ttdef"><b>Definition</b> transforms.h:57</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_swizzle_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_swizzle.html">mlx::steel::BlockSwizzle</a></div><div class="ttdef"><b>Definition</b> transforms.h:60</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_swizzle_html_a98e558d63826d2aaa06d3e65a06d2760"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760">mlx::steel::BlockSwizzle::swizzle</a></div><div class="ttdeci">static METAL_FUNC int2 swizzle(uint3 tid, const int swizzle_log)</div><div class="ttdef"><b>Definition</b> transforms.h:62</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_transform_add_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd</a></div><div class="ttdef"><b>Definition</b> transforms.h:26</div></div>
diff --git a/docs/build/html/backend_2metal_2kernels_2utils_8h.html b/docs/build/html/backend_2metal_2kernels_2utils_8h.html
index 390670848..90774282d 100644
--- a/docs/build/html/backend_2metal_2kernels_2utils_8h.html
+++ b/docs/build/html/backend_2metal_2kernels_2utils_8h.html
@@ -97,7 +97,8 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 <div class="textblock"><code>#include &lt;metal_math&gt;</code><br />
-<code>#include &quot;<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">mlx/backend/metal/kernels/bf16.h</a>&quot;</code><br />
+<code>#include &quot;bf16.h&quot;</code><br />
+<code>#include &quot;<a class="el" href="bf16__math_8h_source.html">mlx/backend/metal/kernels/bf16_math.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="backend_2metal_2kernels_2complex_8h_source.html">mlx/backend/metal/kernels/complex.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="defines_8h_source.html">mlx/backend/metal/kernels/defines.h</a>&quot;</code><br />
 </div>
@@ -133,11 +134,11 @@ Classes</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_limits_3_01complex64__t_01_4.html">Limits&lt; complex64_t &gt;</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 1, offset_t &gt;</a></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 0, offset_t &gt;</a></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="define-members" name="define-members"></a>
@@ -156,36 +157,37 @@ Typedefs</h2></td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a8fd0c8fc6058e650fc99bca8b6acd7d1" id="r_a8fd0c8fc6058e650fc99bca8b6acd7d1"><td class="memTemplParams" colspan="2">template&lt;typename stride_t &gt; </td></tr>
-<tr class="memitem:a8fd0c8fc6058e650fc99bca8b6acd7d1"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC stride_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a> (uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</td></tr>
-<tr class="separator:a8fd0c8fc6058e650fc99bca8b6acd7d1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa6b041005351293e68e19b5abf1286cd" id="r_aa6b041005351293e68e19b5abf1286cd"><td class="memTemplParams" colspan="2">template&lt;typename stride_t &gt; </td></tr>
-<tr class="memitem:aa6b041005351293e68e19b5abf1286cd"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC stride_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa6b041005351293e68e19b5abf1286cd">elem_to_loc</a> (stride_t elem, constant const int *shape, constant const stride_t *strides, int ndim)</td></tr>
-<tr class="separator:aa6b041005351293e68e19b5abf1286cd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a37e00d94751710e81c9632bca2f91e51" id="r_a37e00d94751710e81c9632bca2f91e51"><td class="memTemplParams" colspan="2">template&lt;typename stride_t &gt; </td></tr>
-<tr class="memitem:a37e00d94751710e81c9632bca2f91e51"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC stride_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a37e00d94751710e81c9632bca2f91e51">elem_to_loc</a> (uint3 elem, constant const int *shape, constant const stride_t *strides, int ndim)</td></tr>
-<tr class="separator:a37e00d94751710e81c9632bca2f91e51"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a196a07022b812b241d4c06192c0fa83d" id="r_a196a07022b812b241d4c06192c0fa83d"><td class="memTemplParams" colspan="2">template&lt;typename stride_t &gt; </td></tr>
-<tr class="memitem:a196a07022b812b241d4c06192c0fa83d"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC stride_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a> (uint elem, constant const stride_t &amp;stride)</td></tr>
-<tr class="separator:a196a07022b812b241d4c06192c0fa83d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad6c45cacca97899cd362df49c06fea79" id="r_ad6c45cacca97899cd362df49c06fea79"><td class="memTemplParams" colspan="2">template&lt;typename stride_t &gt; </td></tr>
-<tr class="memitem:ad6c45cacca97899cd362df49c06fea79"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC stride_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a> (uint2 elem, constant const stride_t strides[2])</td></tr>
-<tr class="separator:ad6c45cacca97899cd362df49c06fea79"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2c34ed54714c69e6e1b44344f9e6e330" id="r_a2c34ed54714c69e6e1b44344f9e6e330"><td class="memTemplParams" colspan="2">template&lt;typename stride_t &gt; </td></tr>
-<tr class="memitem:a2c34ed54714c69e6e1b44344f9e6e330"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC stride_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a> (uint3 elem, constant const stride_t strides[3])</td></tr>
-<tr class="separator:a2c34ed54714c69e6e1b44344f9e6e330"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a01c9309978a6c12f79b6e4108728a953" id="r_a01c9309978a6c12f79b6e4108728a953"><td class="memTemplParams" colspan="2">template&lt;typename stride_t &gt; </td></tr>
-<tr class="memitem:a01c9309978a6c12f79b6e4108728a953"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC ulong2&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a> (uint3 elem, constant const int *shape, constant const stride_t *a_strides, constant const stride_t *b_strides, int ndim)</td></tr>
-<tr class="separator:a01c9309978a6c12f79b6e4108728a953"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a66940b1cc3d64651d24634bc696d528b" id="r_a66940b1cc3d64651d24634bc696d528b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC ulong3&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a66940b1cc3d64651d24634bc696d528b">elem_to_loc_3_nd</a> (uint3 elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim)</td></tr>
-<tr class="separator:a66940b1cc3d64651d24634bc696d528b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a22eaa505dbc7dd2a63a895f2e16712f5" id="r_a22eaa505dbc7dd2a63a895f2e16712f5"><td class="memTemplParams" colspan="2">template&lt;typename StrideT , typename IdxT  = StrideT&gt; </td></tr>
+<tr class="memitem:a22eaa505dbc7dd2a63a895f2e16712f5"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC IdxT&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a> (uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</td></tr>
+<tr class="separator:a22eaa505dbc7dd2a63a895f2e16712f5"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a4b53fb0679f67f9063deba94753d4185" id="r_a4b53fb0679f67f9063deba94753d4185"><td class="memTemplParams" colspan="2">template&lt;typename StrideT , typename IdxT  = StrideT&gt; </td></tr>
+<tr class="memitem:a4b53fb0679f67f9063deba94753d4185"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC IdxT&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a4b53fb0679f67f9063deba94753d4185">elem_to_loc</a> (StrideT elem, constant const int *shape, constant const StrideT *strides, int ndim)</td></tr>
+<tr class="separator:a4b53fb0679f67f9063deba94753d4185"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aec82f4bf0e22b8d1b89ad654ad8d8753" id="r_aec82f4bf0e22b8d1b89ad654ad8d8753"><td class="memTemplParams" colspan="2">template&lt;typename StrideT , typename IdxT  = StrideT&gt; </td></tr>
+<tr class="memitem:aec82f4bf0e22b8d1b89ad654ad8d8753"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC IdxT&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aec82f4bf0e22b8d1b89ad654ad8d8753">elem_to_loc</a> (uint3 elem, constant const int *shape, constant const StrideT *strides, int ndim)</td></tr>
+<tr class="separator:aec82f4bf0e22b8d1b89ad654ad8d8753"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac612d0ae30b8257198339debe04916a3" id="r_ac612d0ae30b8257198339debe04916a3"><td class="memTemplParams" colspan="2">template&lt;typename StrideT , typename IdxT  = StrideT&gt; </td></tr>
+<tr class="memitem:ac612d0ae30b8257198339debe04916a3"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC IdxT&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1</a> (uint elem, constant const StrideT &amp;stride)</td></tr>
+<tr class="separator:ac612d0ae30b8257198339debe04916a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a43f33efc000962d6de881a3aab7458de" id="r_a43f33efc000962d6de881a3aab7458de"><td class="memTemplParams" colspan="2">template&lt;typename StrideT , typename IdxT  = StrideT&gt; </td></tr>
+<tr class="memitem:a43f33efc000962d6de881a3aab7458de"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC IdxT&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2</a> (uint2 elem, constant const StrideT strides[2])</td></tr>
+<tr class="separator:a43f33efc000962d6de881a3aab7458de"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a650f8ea8cf9f9519da9e301aad0308dc" id="r_a650f8ea8cf9f9519da9e301aad0308dc"><td class="memTemplParams" colspan="2">template&lt;typename StrideT , typename IdxT  = StrideT&gt; </td></tr>
+<tr class="memitem:a650f8ea8cf9f9519da9e301aad0308dc"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC IdxT&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3</a> (uint3 elem, constant const StrideT strides[3])</td></tr>
+<tr class="separator:a650f8ea8cf9f9519da9e301aad0308dc"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a66a2d7eec0262b12db16cd6c781ccf9a" id="r_a66a2d7eec0262b12db16cd6c781ccf9a"><td class="memTemplParams" colspan="2">template&lt;typename StrideT , typename IdxT  = StrideT&gt; </td></tr>
+<tr class="memitem:a66a2d7eec0262b12db16cd6c781ccf9a"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC vec&lt; IdxT, 2 &gt;&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd</a> (uint3 elem, constant const int *shape, constant const StrideT *a_strides, constant const StrideT *b_strides, int ndim)</td></tr>
+<tr class="separator:a66a2d7eec0262b12db16cd6c781ccf9a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a65d87b425e1f8ca19df97c15049f8733" id="r_a65d87b425e1f8ca19df97c15049f8733"><td class="memTemplParams" colspan="2">template&lt;typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:a65d87b425e1f8ca19df97c15049f8733"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC vec&lt; IdxT, 3 &gt;&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a65d87b425e1f8ca19df97c15049f8733">elem_to_loc_3_nd</a> (uint3 elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim)</td></tr>
+<tr class="separator:a65d87b425e1f8ca19df97c15049f8733"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a8e5a4b0fb5d018d7b078d147efe4f1e3" id="r_a8e5a4b0fb5d018d7b078d147efe4f1e3"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U &gt; </td></tr>
 <tr class="memitem:a8e5a4b0fb5d018d7b078d147efe4f1e3"><td class="memTemplItemLeft" align="right" valign="top">T&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a> (T N, U M)</td></tr>
 <tr class="memdesc:a8e5a4b0fb5d018d7b078d147efe4f1e3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compute ceil((float)N/(float)M)  <br /></td></tr>
 <tr class="separator:a8e5a4b0fb5d018d7b078d147efe4f1e3"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a27c03f2f90ab56db2e4d59559a3d2e9a" id="r_a27c03f2f90ab56db2e4d59559a3d2e9a"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a> (float x)</td></tr>
 <tr class="separator:a27c03f2f90ab56db2e4d59559a3d2e9a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3501b665c8837eabf9789ea27a7d6946" id="r_a3501b665c8837eabf9789ea27a7d6946"><td class="memItemLeft" align="right" valign="top"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3501b665c8837eabf9789ea27a7d6946">log1p</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a3501b665c8837eabf9789ea27a7d6946" id="r_a3501b665c8837eabf9789ea27a7d6946"><td class="memItemLeft" align="right" valign="top"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3501b665c8837eabf9789ea27a7d6946">log1p</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a3501b665c8837eabf9789ea27a7d6946"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aba6279624b1d30c525efee856a222b5c" id="r_aba6279624b1d30c525efee856a222b5c"><td class="memItemLeft" align="right" valign="top">uint64_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aba6279624b1d30c525efee856a222b5c">simd_shuffle_down</a> (uint64_t data, uint16_t delta)</td></tr>
 <tr class="separator:aba6279624b1d30c525efee856a222b5c"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -244,11 +246,11 @@ Functions</h2></td></tr>
 <div class="line">    <span class="keyword">static</span> <span class="keyword">constexpr</span> constant type <a class="code hl_variable" href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">finite_min</a> =                              \</div>
 <div class="line">        metal::numeric_limits&lt;type&gt;::min();                                  \</div>
 <div class="line">  };</div>
-<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:17</div></div>
-<div class="ttc" id="astruct_limits_html_a2f0673b6f9da89ce1d64f9f3d74f50a8"><div class="ttname"><a href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">Limits::max</a></div><div class="ttdeci">static const constant U max</div><div class="ttdef"><b>Definition</b> utils.h:18</div></div>
-<div class="ttc" id="astruct_limits_html_a5a3eae6d244fbea2aa7b9200001463e5"><div class="ttname"><a href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">Limits::finite_max</a></div><div class="ttdeci">static const constant U finite_max</div><div class="ttdef"><b>Definition</b> utils.h:20</div></div>
-<div class="ttc" id="astruct_limits_html_a6e81584ba65a4dc6ff9366b458e3a20e"><div class="ttname"><a href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">Limits::min</a></div><div class="ttdeci">static const constant U min</div><div class="ttdef"><b>Definition</b> utils.h:19</div></div>
-<div class="ttc" id="astruct_limits_html_ae7469d21f2688797ca3e388d919ef05e"><div class="ttname"><a href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">Limits::finite_min</a></div><div class="ttdeci">static const constant U finite_min</div><div class="ttdef"><b>Definition</b> utils.h:21</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
+<div class="ttc" id="astruct_limits_html_a2f0673b6f9da89ce1d64f9f3d74f50a8"><div class="ttname"><a href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">Limits::max</a></div><div class="ttdeci">static const constant U max</div><div class="ttdef"><b>Definition</b> utils.h:24</div></div>
+<div class="ttc" id="astruct_limits_html_a5a3eae6d244fbea2aa7b9200001463e5"><div class="ttname"><a href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">Limits::finite_max</a></div><div class="ttdeci">static const constant U finite_max</div><div class="ttdef"><b>Definition</b> utils.h:26</div></div>
+<div class="ttc" id="astruct_limits_html_a6e81584ba65a4dc6ff9366b458e3a20e"><div class="ttname"><a href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">Limits::min</a></div><div class="ttdeci">static const constant U min</div><div class="ttdef"><b>Definition</b> utils.h:25</div></div>
+<div class="ttc" id="astruct_limits_html_ae7469d21f2688797ca3e388d919ef05e"><div class="ttname"><a href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">Limits::finite_min</a></div><div class="ttdeci">static const constant U finite_min</div><div class="ttdef"><b>Definition</b> utils.h:27</div></div>
 </div><!-- fragment -->
 </div>
 </div>
@@ -343,18 +345,18 @@ template&lt;typename T , typename U &gt; </div>
 
 </div>
 </div>
-<a id="aa6b041005351293e68e19b5abf1286cd" name="aa6b041005351293e68e19b5abf1286cd"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa6b041005351293e68e19b5abf1286cd">&#9670;&#160;</a></span>elem_to_loc() <span class="overload">[1/3]</span></h2>
+<a id="a4b53fb0679f67f9063deba94753d4185" name="a4b53fb0679f67f9063deba94753d4185"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4b53fb0679f67f9063deba94753d4185">&#9670;&#160;</a></span>elem_to_loc() <span class="overload">[1/3]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename stride_t &gt; </div>
+template&lt;typename StrideT , typename IdxT  = StrideT&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC stride_t elem_to_loc </td>
+          <td class="memname">METAL_FUNC IdxT elem_to_loc </td>
           <td>(</td>
-          <td class="paramtype">stride_t</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
+          <td class="paramtype">StrideT</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -364,7 +366,7 @@ template&lt;typename stride_t &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
+          <td class="paramtype">constant const StrideT *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -376,16 +378,16 @@ template&lt;typename stride_t &gt; </div>
 
 </div>
 </div>
-<a id="a8fd0c8fc6058e650fc99bca8b6acd7d1" name="a8fd0c8fc6058e650fc99bca8b6acd7d1"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a8fd0c8fc6058e650fc99bca8b6acd7d1">&#9670;&#160;</a></span>elem_to_loc() <span class="overload">[2/3]</span></h2>
+<a id="a22eaa505dbc7dd2a63a895f2e16712f5" name="a22eaa505dbc7dd2a63a895f2e16712f5"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a22eaa505dbc7dd2a63a895f2e16712f5">&#9670;&#160;</a></span>elem_to_loc() <span class="overload">[2/3]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename stride_t &gt; </div>
+template&lt;typename StrideT , typename IdxT  = StrideT&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC stride_t elem_to_loc </td>
+          <td class="memname">METAL_FUNC IdxT elem_to_loc </td>
           <td>(</td>
           <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
@@ -397,7 +399,7 @@ template&lt;typename stride_t &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
+          <td class="paramtype">constant const StrideT *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -409,16 +411,16 @@ template&lt;typename stride_t &gt; </div>
 
 </div>
 </div>
-<a id="a37e00d94751710e81c9632bca2f91e51" name="a37e00d94751710e81c9632bca2f91e51"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a37e00d94751710e81c9632bca2f91e51">&#9670;&#160;</a></span>elem_to_loc() <span class="overload">[3/3]</span></h2>
+<a id="aec82f4bf0e22b8d1b89ad654ad8d8753" name="aec82f4bf0e22b8d1b89ad654ad8d8753"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aec82f4bf0e22b8d1b89ad654ad8d8753">&#9670;&#160;</a></span>elem_to_loc() <span class="overload">[3/3]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename stride_t &gt; </div>
+template&lt;typename StrideT , typename IdxT  = StrideT&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC stride_t elem_to_loc </td>
+          <td class="memname">METAL_FUNC IdxT elem_to_loc </td>
           <td>(</td>
           <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
@@ -430,7 +432,7 @@ template&lt;typename stride_t &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
+          <td class="paramtype">constant const StrideT *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -442,62 +444,62 @@ template&lt;typename stride_t &gt; </div>
 
 </div>
 </div>
-<a id="a196a07022b812b241d4c06192c0fa83d" name="a196a07022b812b241d4c06192c0fa83d"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a196a07022b812b241d4c06192c0fa83d">&#9670;&#160;</a></span>elem_to_loc_1()</h2>
+<a id="ac612d0ae30b8257198339debe04916a3" name="ac612d0ae30b8257198339debe04916a3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ac612d0ae30b8257198339debe04916a3">&#9670;&#160;</a></span>elem_to_loc_1()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename stride_t &gt; </div>
+template&lt;typename StrideT , typename IdxT  = StrideT&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC stride_t elem_to_loc_1 </td>
+          <td class="memname">METAL_FUNC IdxT elem_to_loc_1 </td>
           <td>(</td>
           <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t &amp;</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>&#160;)</td>
+          <td class="paramtype">constant const StrideT &amp;</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="ad6c45cacca97899cd362df49c06fea79" name="ad6c45cacca97899cd362df49c06fea79"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ad6c45cacca97899cd362df49c06fea79">&#9670;&#160;</a></span>elem_to_loc_2()</h2>
+<a id="a43f33efc000962d6de881a3aab7458de" name="a43f33efc000962d6de881a3aab7458de"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a43f33efc000962d6de881a3aab7458de">&#9670;&#160;</a></span>elem_to_loc_2()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename stride_t &gt; </div>
+template&lt;typename StrideT , typename IdxT  = StrideT&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC stride_t elem_to_loc_2 </td>
+          <td class="memname">METAL_FUNC IdxT elem_to_loc_2 </td>
           <td>(</td>
           <td class="paramtype">uint2</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>[2]&#160;)</td>
+          <td class="paramtype">constant const StrideT</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>[2]&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a01c9309978a6c12f79b6e4108728a953" name="a01c9309978a6c12f79b6e4108728a953"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a01c9309978a6c12f79b6e4108728a953">&#9670;&#160;</a></span>elem_to_loc_2_nd()</h2>
+<a id="a66a2d7eec0262b12db16cd6c781ccf9a" name="a66a2d7eec0262b12db16cd6c781ccf9a"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a66a2d7eec0262b12db16cd6c781ccf9a">&#9670;&#160;</a></span>elem_to_loc_2_nd()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename stride_t &gt; </div>
+template&lt;typename StrideT , typename IdxT  = StrideT&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC ulong2 elem_to_loc_2_nd </td>
+          <td class="memname">METAL_FUNC vec&lt; IdxT, 2 &gt; elem_to_loc_2_nd </td>
           <td>(</td>
           <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
@@ -509,12 +511,12 @@ template&lt;typename stride_t &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t *</td>          <td class="paramname"><span class="paramname"><em>a_strides</em></span>, </td>
+          <td class="paramtype">constant const StrideT *</td>          <td class="paramname"><span class="paramname"><em>a_strides</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t *</td>          <td class="paramname"><span class="paramname"><em>b_strides</em></span>, </td>
+          <td class="paramtype">constant const StrideT *</td>          <td class="paramname"><span class="paramname"><em>b_strides</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -526,37 +528,39 @@ template&lt;typename stride_t &gt; </div>
 
 </div>
 </div>
-<a id="a2c34ed54714c69e6e1b44344f9e6e330" name="a2c34ed54714c69e6e1b44344f9e6e330"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a2c34ed54714c69e6e1b44344f9e6e330">&#9670;&#160;</a></span>elem_to_loc_3()</h2>
+<a id="a650f8ea8cf9f9519da9e301aad0308dc" name="a650f8ea8cf9f9519da9e301aad0308dc"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a650f8ea8cf9f9519da9e301aad0308dc">&#9670;&#160;</a></span>elem_to_loc_3()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename stride_t &gt; </div>
+template&lt;typename StrideT , typename IdxT  = StrideT&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC stride_t elem_to_loc_3 </td>
+          <td class="memname">METAL_FUNC IdxT elem_to_loc_3 </td>
           <td>(</td>
           <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">constant const stride_t</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>[3]&#160;)</td>
+          <td class="paramtype">constant const StrideT</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>[3]&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a66940b1cc3d64651d24634bc696d528b" name="a66940b1cc3d64651d24634bc696d528b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a66940b1cc3d64651d24634bc696d528b">&#9670;&#160;</a></span>elem_to_loc_3_nd()</h2>
+<a id="a65d87b425e1f8ca19df97c15049f8733" name="a65d87b425e1f8ca19df97c15049f8733"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a65d87b425e1f8ca19df97c15049f8733">&#9670;&#160;</a></span>elem_to_loc_3_nd()</h2>
 
 <div class="memitem">
 <div class="memproto">
+<div class="memtemplate">
+template&lt;typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC ulong3 elem_to_loc_3_nd </td>
+          <td class="memname">METAL_FUNC vec&lt; IdxT, 3 &gt; elem_to_loc_3_nd </td>
           <td>(</td>
           <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>elem</em></span>, </td>
         </tr>
@@ -600,9 +604,9 @@ template&lt;typename stride_t &gt; </div>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> log1p </td>
+          <td class="memname"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> log1p </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
diff --git a/docs/build/html/backend_2metal_2kernels_2utils_8h_source.html b/docs/build/html/backend_2metal_2kernels_2utils_8h_source.html
index ddb1325ce..66c8011c3 100644
--- a/docs/build/html/backend_2metal_2kernels_2utils_8h_source.html
+++ b/docs/build/html/backend_2metal_2kernels_2utils_8h_source.html
@@ -96,497 +96,546 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
 <div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
 <div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &lt;metal_math&gt;</span></div>
-<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2bf16_8h.html">mlx/backend/metal/kernels/bf16.h</a>&quot;</span></div>
-<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2complex_8h.html">mlx/backend/metal/kernels/complex.h</a>&quot;</span></div>
-<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="preprocessor">#include &quot;<a class="code" href="defines_8h.html">mlx/backend/metal/kernels/defines.h</a>&quot;</span></div>
-<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span> </div>
-<div class="line"><a id="l00010" name="l00010"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">   10</a></span><span class="keyword">typedef</span> half <a class="code hl_typedef" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a>;</div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
+<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="comment">// The correct bf16.h is included based on the metal version</span></div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="comment">// by giving the correct path to -I during compilation</span></div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="comment">// e.g. mlx/backend/metal/kernels/metal_3_0/ for Metal 3.0</span></div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="preprocessor">#include &quot;bf16.h&quot;</span></div>
 <div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span> </div>
-<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="comment">// Type limits utils</span></div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><span class="preprocessor">#include &quot;<a class="code" href="bf16__math_8h.html">mlx/backend/metal/kernels/bf16_math.h</a>&quot;</span></div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2complex_8h.html">mlx/backend/metal/kernels/complex.h</a>&quot;</span></div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="preprocessor">#include &quot;<a class="code" href="defines_8h.html">mlx/backend/metal/kernels/defines.h</a>&quot;</span></div>
 <div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span> </div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U&gt;</div>
-<div class="foldopen" id="foldopen00017" data-start="{" data-end="};">
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno"><a class="line" href="struct_limits.html">   17</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_limits.html">Limits</a> {</div>
-<div class="line"><a id="l00018" name="l00018"></a><span class="lineno"><a class="line" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">   18</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">max</a> = metal::numeric_limits&lt;U&gt;::max();</div>
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno"><a class="line" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">   19</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">min</a> = metal::numeric_limits&lt;U&gt;::min();</div>
-<div class="line"><a id="l00020" name="l00020"></a><span class="lineno"><a class="line" href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">   20</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">finite_max</a> = metal::numeric_limits&lt;U&gt;::max();</div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno"><a class="line" href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">   21</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">finite_min</a> = metal::numeric_limits&lt;U&gt;::min();</div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>};</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">   16</a></span><span class="keyword">typedef</span> half <a class="code hl_typedef" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a>;</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span> </div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span><span class="comment">// Type limits utils</span></div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span> </div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U&gt;</div>
+<div class="foldopen" id="foldopen00023" data-start="{" data-end="};">
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno"><a class="line" href="struct_limits.html">   23</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_limits.html">Limits</a> {</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno"><a class="line" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">   24</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">max</a> = metal::numeric_limits&lt;U&gt;::max();</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno"><a class="line" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">   25</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">min</a> = metal::numeric_limits&lt;U&gt;::min();</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno"><a class="line" href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">   26</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">finite_max</a> = metal::numeric_limits&lt;U&gt;::max();</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">   27</a></span>  <span class="keyword">static</span> <span class="keyword">const</span> constant U <a class="code hl_variable" href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">finite_min</a> = metal::numeric_limits&lt;U&gt;::min();</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>};</div>
 </div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span> </div>
-<div class="foldopen" id="foldopen00024" data-start="" data-end="">
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">   24</a></span><span class="preprocessor">#define instantiate_default_limit(type)                                      \</span></div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span><span class="preprocessor">  template &lt;&gt;                                                                \</span></div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span><span class="preprocessor">  struct Limits&lt;type&gt; {                                                      \</span></div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span><span class="preprocessor">    static constexpr constant type max = metal::numeric_limits&lt;type&gt;::max(); \</span></div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span><span class="preprocessor">    static constexpr constant type min = metal::numeric_limits&lt;type&gt;::min(); \</span></div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span><span class="preprocessor">    static constexpr constant type finite_max =                              \</span></div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::max();                                  \</span></div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span><span class="preprocessor">    static constexpr constant type finite_min =                              \</span></div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::min();                                  \</span></div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span><span class="preprocessor">  };</span></div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span> </div>
+<div class="foldopen" id="foldopen00030" data-start="" data-end="">
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">   30</a></span><span class="preprocessor">#define instantiate_default_limit(type)                                      \</span></div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span><span class="preprocessor">  template &lt;&gt;                                                                \</span></div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span><span class="preprocessor">  struct Limits&lt;type&gt; {                                                      \</span></div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span><span class="preprocessor">    static constexpr constant type max = metal::numeric_limits&lt;type&gt;::max(); \</span></div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span><span class="preprocessor">    static constexpr constant type min = metal::numeric_limits&lt;type&gt;::min(); \</span></div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span><span class="preprocessor">    static constexpr constant type finite_max =                              \</span></div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::max();                                  \</span></div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span><span class="preprocessor">    static constexpr constant type finite_min =                              \</span></div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::min();                                  \</span></div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span><span class="preprocessor">  };</span></div>
 </div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span> </div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint8__t_01_4.html#a55f48b89033e8c8683f8540ec6b23f02">   35</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint8_t);</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint16__t_01_4.html#a9d517d8265ea1898b6b16e91b8595146">   36</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint16_t);</div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint32__t_01_4.html#a0698139f3fe440d7aa08ac5029d72235">   37</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint32_t);</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint64__t_01_4.html#aff101ff38be5ccdbb9790aecb3069071">   38</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint64_t);</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="struct_limits_3_01int8__t_01_4.html#a24cdab873e0fb778393c69f1dc9ecf73">   39</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int8_t);</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="struct_limits_3_01int16__t_01_4.html#acb2936d1cdbf347a9a014c8e036a5782">   40</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int16_t);</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="struct_limits_3_01int32__t_01_4.html#aa9ed9f0e8c7400d8fc92e1cba9588794">   41</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int32_t);</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno"><a class="line" href="struct_limits_3_01int64__t_01_4.html#a6c7254b641878fa0fb9538814c45457a">   42</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int64_t);</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span> </div>
-<div class="foldopen" id="foldopen00044" data-start="" data-end="">
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">   44</a></span><span class="preprocessor">#define instantiate_float_limit(type)             \</span></div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span><span class="preprocessor">  template &lt;&gt;                                     \</span></div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span><span class="preprocessor">  struct Limits&lt;type&gt; {                           \</span></div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span><span class="preprocessor">    static constexpr constant type max =          \</span></div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::infinity();  \</span></div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span><span class="preprocessor">    static constexpr constant type min =          \</span></div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span><span class="preprocessor">        -metal::numeric_limits&lt;type&gt;::infinity(); \</span></div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span><span class="preprocessor">    static constexpr constant type finite_max =   \</span></div>
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::max();       \</span></div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span><span class="preprocessor">    static constexpr constant type finite_min =   \</span></div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span><span class="preprocessor">        -metal::numeric_limits&lt;type&gt;::max();      \</span></div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="preprocessor">  };</span></div>
-</div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span> </div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno"><a class="line" href="struct_limits_3_01half_01_4.html#aedaf0190aabf23da20510e558e2690b4">   57</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a>(half);</div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno"><a class="line" href="struct_limits_3_01float_01_4.html#a291eea590113fc1858b7f83f2e0c977d">   58</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a>(<span class="keywordtype">float</span>);</div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno"><a class="line" href="struct_limits_3_01bfloat16__t_01_4.html#a6337dc35207b3f6f7185cd73eabac211">   59</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>);</div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span> </div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span><span class="keyword">template</span> &lt;&gt;</div>
-<div class="foldopen" id="foldopen00062" data-start="{" data-end="};">
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno"><a class="line" href="struct_limits_3_01bool_01_4.html">   62</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_limits.html">Limits</a>&lt;bool&gt; {</div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="struct_limits_3_01bool_01_4.html#acbd2132145888d51220558a101ffcff4">   63</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">max</a> = <span class="keyword">true</span>;</div>
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno"><a class="line" href="struct_limits_3_01bool_01_4.html#a139f787b57536d455490b8ef801d37cc">   64</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">min</a> = <span class="keyword">false</span>;</div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>};</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span> </div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint8__t_01_4.html#a55f48b89033e8c8683f8540ec6b23f02">   41</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint8_t);</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint16__t_01_4.html#a9d517d8265ea1898b6b16e91b8595146">   42</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint16_t);</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint32__t_01_4.html#a0698139f3fe440d7aa08ac5029d72235">   43</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint32_t);</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno"><a class="line" href="struct_limits_3_01uint64__t_01_4.html#aff101ff38be5ccdbb9790aecb3069071">   44</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(uint64_t);</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno"><a class="line" href="struct_limits_3_01int8__t_01_4.html#a24cdab873e0fb778393c69f1dc9ecf73">   45</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int8_t);</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno"><a class="line" href="struct_limits_3_01int16__t_01_4.html#acb2936d1cdbf347a9a014c8e036a5782">   46</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int16_t);</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno"><a class="line" href="struct_limits_3_01int32__t_01_4.html#aa9ed9f0e8c7400d8fc92e1cba9588794">   47</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int32_t);</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno"><a class="line" href="struct_limits_3_01int64__t_01_4.html#a6c7254b641878fa0fb9538814c45457a">   48</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a>(int64_t);</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span> </div>
+<div class="foldopen" id="foldopen00050" data-start="" data-end="">
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">   50</a></span><span class="preprocessor">#define instantiate_float_limit(type)             \</span></div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span><span class="preprocessor">  template &lt;&gt;                                     \</span></div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span><span class="preprocessor">  struct Limits&lt;type&gt; {                           \</span></div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span><span class="preprocessor">    static constexpr constant type max =          \</span></div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::infinity();  \</span></div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="preprocessor">    static constexpr constant type min =          \</span></div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span><span class="preprocessor">        -metal::numeric_limits&lt;type&gt;::infinity(); \</span></div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span><span class="preprocessor">    static constexpr constant type finite_max =   \</span></div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span><span class="preprocessor">        metal::numeric_limits&lt;type&gt;::max();       \</span></div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span><span class="preprocessor">    static constexpr constant type finite_min =   \</span></div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span><span class="preprocessor">        -metal::numeric_limits&lt;type&gt;::max();      \</span></div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span><span class="preprocessor">  };</span></div>
 </div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span> </div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="struct_limits_3_01half_01_4.html#aedaf0190aabf23da20510e558e2690b4">   63</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a>(half);</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno"><a class="line" href="struct_limits_3_01float_01_4.html#a291eea590113fc1858b7f83f2e0c977d">   64</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a>(<span class="keywordtype">float</span>);</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno"><a class="line" href="struct_limits_3_01bfloat16__t_01_4.html#a6337dc35207b3f6f7185cd73eabac211">   65</a></span><a class="code hl_define" href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>);</div>
 <div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span> </div>
 <div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span><span class="keyword">template</span> &lt;&gt;</div>
 <div class="foldopen" id="foldopen00068" data-start="{" data-end="};">
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno"><a class="line" href="struct_limits_3_01complex64__t_01_4.html">   68</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_limits.html">Limits</a>&lt;<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>&gt; {</div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno"><a class="line" href="struct_limits_3_01complex64__t_01_4.html#ac01c274b224b90f5210b675a484f4607">   69</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_variable" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">max</a> = <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>      metal::numeric_limits&lt;float&gt;::infinity(),</div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>      metal::numeric_limits&lt;float&gt;::infinity());</div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno"><a class="line" href="struct_limits_3_01complex64__t_01_4.html#aa67b04aa7abcd67f7af0808737ab8e14">   72</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_variable" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">min</a> = <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
-<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>      -metal::numeric_limits&lt;float&gt;::infinity(),</div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>      -metal::numeric_limits&lt;float&gt;::infinity());</div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>};</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno"><a class="line" href="struct_limits_3_01bool_01_4.html">   68</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_limits.html">Limits</a>&lt;bool&gt; {</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno"><a class="line" href="struct_limits_3_01bool_01_4.html#acbd2132145888d51220558a101ffcff4">   69</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">max</a> = <span class="keyword">true</span>;</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno"><a class="line" href="struct_limits_3_01bool_01_4.html#a139f787b57536d455490b8ef801d37cc">   70</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">min</a> = <span class="keyword">false</span>;</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>};</div>
+</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span> </div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span><span class="keyword">template</span> &lt;&gt;</div>
+<div class="foldopen" id="foldopen00074" data-start="{" data-end="};">
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno"><a class="line" href="struct_limits_3_01complex64__t_01_4.html">   74</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_limits.html">Limits</a>&lt;<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>&gt; {</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno"><a class="line" href="struct_limits_3_01complex64__t_01_4.html#ac01c274b224b90f5210b675a484f4607">   75</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_variable" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">max</a> = <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>      metal::numeric_limits&lt;float&gt;::infinity(),</div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>      metal::numeric_limits&lt;float&gt;::infinity());</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno"><a class="line" href="struct_limits_3_01complex64__t_01_4.html#aa67b04aa7abcd67f7af0808737ab8e14">   78</a></span>  <span class="keyword">static</span> <span class="keyword">constexpr</span> constant <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_variable" href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">min</a> = <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>      -metal::numeric_limits&lt;float&gt;::infinity(),</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>      -metal::numeric_limits&lt;float&gt;::infinity());</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>};</div>
 </div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span> </div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span><span class="comment">// Indexing utils</span></div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span> </div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a069b682d7d21827461544817d722bfd3">   81</a></span><span class="preprocessor">#define MLX_MTL_PRAGMA_UNROLL _Pragma(&quot;clang loop unroll(full)&quot;)</span></div>
 <div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span> </div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span><span class="comment">// Single Array with generic dims</span></div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span> </div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> str<span class="keywordtype">id</span>e_t&gt;</div>
-<div class="foldopen" id="foldopen00087" data-start="{" data-end="}">
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">   87</a></span>METAL_FUNC stride_t <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
-<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>    uint elem,</div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>    constant <span class="keyword">const</span> stride_t* strides,</div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>    <span class="keywordtype">int</span> ndim) {</div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>  stride_t loc = 0;</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = ndim - 1; i &gt;= 0 &amp;&amp; elem &gt; 0; --i) {</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>    loc += (elem % shape[i]) * strides[i];</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>    elem /= shape[i];</div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>  }</div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>  <span class="keywordflow">return</span> loc;</div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>}</div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span><span class="comment">// Indexing utils</span></div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span> </div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a069b682d7d21827461544817d722bfd3">   87</a></span><span class="preprocessor">#define MLX_MTL_PRAGMA_UNROLL _Pragma(&quot;clang loop unroll(full)&quot;)</span></div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span> </div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span><span class="comment">// Single Array with generic dims</span></div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span> </div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Str<span class="keywordtype">id</span>eT, <span class="keyword">typename</span> IdxT = Str<span class="keywordtype">id</span>eT&gt;</div>
+<div class="foldopen" id="foldopen00093" data-start="{" data-end="}">
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">   93</a></span>METAL_FUNC IdxT <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>    uint elem,</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>    constant <span class="keyword">const</span> StrideT* strides,</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>    <span class="keywordtype">int</span> ndim) {</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  IdxT loc = 0;</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = ndim - 1; i &gt;= 0 &amp;&amp; elem &gt; 0; --i) {</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    loc += (elem % shape[i]) * IdxT(strides[i]);</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>    elem /= shape[i];</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>  }</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>  <span class="keywordflow">return</span> loc;</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>}</div>
 </div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span> </div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> str<span class="keywordtype">id</span>e_t&gt;</div>
-<div class="foldopen" id="foldopen00101" data-start="{" data-end="}">
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#aa6b041005351293e68e19b5abf1286cd">  101</a></span>METAL_FUNC stride_t <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    stride_t elem,</div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    constant <span class="keyword">const</span> stride_t* strides,</div>
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    <span class="keywordtype">int</span> ndim) {</div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>  stride_t loc = 0;</div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = ndim - 1; i &gt;= 0 &amp;&amp; elem &gt; 0; --i) {</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>    loc += (elem % shape[i]) * strides[i];</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>    elem /= shape[i];</div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  }</div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  <span class="keywordflow">return</span> loc;</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>}</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span> </div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Str<span class="keywordtype">id</span>eT, <span class="keyword">typename</span> IdxT = Str<span class="keywordtype">id</span>eT&gt;</div>
+<div class="foldopen" id="foldopen00107" data-start="{" data-end="}">
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a4b53fb0679f67f9063deba94753d4185">  107</a></span>METAL_FUNC IdxT <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>    StrideT elem,</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>    constant <span class="keyword">const</span> StrideT* strides,</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>    <span class="keywordtype">int</span> ndim) {</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>  IdxT loc = 0;</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = ndim - 1; i &gt;= 0 &amp;&amp; elem &gt; 0; --i) {</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>    loc += (elem % shape[i]) * IdxT(strides[i]);</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>    elem /= shape[i];</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>  }</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>  <span class="keywordflow">return</span> loc;</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>}</div>
 </div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span> </div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span><span class="comment">// Non templated version to handle arbitrary dims</span></div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> str<span class="keywordtype">id</span>e_t&gt;</div>
-<div class="foldopen" id="foldopen00116" data-start="{" data-end="}">
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a37e00d94751710e81c9632bca2f91e51">  116</a></span>METAL_FUNC stride_t <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    uint3 elem,</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    constant <span class="keyword">const</span> stride_t* strides,</div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>    <span class="keywordtype">int</span> ndim) {</div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  stride_t loc = elem.x * strides[ndim - 1] + elem.y * strides[ndim - 2];</div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> d = ndim - 3; d &gt;= 0; --d) {</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    loc += (elem.z % shape[d]) * strides[d];</div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    elem.z /= shape[d];</div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>  }</div>
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>  <span class="keywordflow">return</span> loc;</div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>}</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span> </div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span><span class="comment">// Non templated version to handle arbitrary dims</span></div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Str<span class="keywordtype">id</span>eT, <span class="keyword">typename</span> IdxT = Str<span class="keywordtype">id</span>eT&gt;</div>
+<div class="foldopen" id="foldopen00122" data-start="{" data-end="}">
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#aec82f4bf0e22b8d1b89ad654ad8d8753">  122</a></span>METAL_FUNC IdxT <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    uint3 elem,</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>    constant <span class="keyword">const</span> StrideT* strides,</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>    <span class="keywordtype">int</span> ndim) {</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>  IdxT loc =</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>      elem.x * IdxT(strides[ndim - 1]) + elem.y * IdxT(strides[ndim - 2]);</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> d = ndim - 3; d &gt;= 0; --d) {</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    loc += (elem.z % shape[d]) * IdxT(strides[d]);</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    elem.z /= shape[d];</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>  }</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  <span class="keywordflow">return</span> loc;</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>}</div>
 </div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span> </div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span><span class="comment">// Single Array with fixed N dims</span></div>
-<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span> </div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> str<span class="keywordtype">id</span>e_t&gt;</div>
-<div class="foldopen" id="foldopen00133" data-start="{" data-end="}">
-<div class="line"><a id="l00133" name="l00133"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">  133</a></span>METAL_FUNC stride_t <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(uint elem, constant <span class="keyword">const</span> stride_t&amp; stride) {</div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>  <span class="keywordflow">return</span> elem * stride;</div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>}</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span> </div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span><span class="comment">// Single Array with fixed N dims</span></div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span> </div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Str<span class="keywordtype">id</span>eT, <span class="keyword">typename</span> IdxT = Str<span class="keywordtype">id</span>eT&gt;</div>
+<div class="foldopen" id="foldopen00140" data-start="{" data-end="}">
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">  140</a></span>METAL_FUNC IdxT <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1</a>(uint elem, constant <span class="keyword">const</span> StrideT&amp; stride) {</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  <span class="keywordflow">return</span> elem * IdxT(stride);</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>}</div>
 </div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span> </div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> str<span class="keywordtype">id</span>e_t&gt;</div>
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>METAL_FUNC <a class="code hl_typedef" href="namespacepocketfft_1_1detail.html#afb987c919e9424a996d0fc8b3c23cc84">stride_t</a></div>
-<div class="foldopen" id="foldopen00139" data-start="{" data-end="}">
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">  139</a></span><a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(uint2 elem, constant <span class="keyword">const</span> stride_t strides[2]) {</div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>  <span class="keywordflow">return</span> elem.x * strides[1] + elem.y * strides[0];</div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>}</div>
-</div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span> </div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> str<span class="keywordtype">id</span>e_t&gt;</div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>METAL_FUNC <a class="code hl_typedef" href="namespacepocketfft_1_1detail.html#afb987c919e9424a996d0fc8b3c23cc84">stride_t</a></div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span> </div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Str<span class="keywordtype">id</span>eT, <span class="keyword">typename</span> IdxT = Str<span class="keywordtype">id</span>eT&gt;</div>
 <div class="foldopen" id="foldopen00145" data-start="{" data-end="}">
-<div class="line"><a id="l00145" name="l00145"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">  145</a></span><a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(uint3 elem, constant <span class="keyword">const</span> stride_t strides[3]) {</div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>  <span class="keywordflow">return</span> elem.x * strides[2] + elem.y * strides[1] + elem.z * strides[0];</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">  145</a></span>METAL_FUNC IdxT <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2</a>(uint2 elem, constant <span class="keyword">const</span> StrideT strides[2]) {</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>  <span class="keywordflow">return</span> elem.x * IdxT(strides[1]) + elem.y * IdxT(strides[0]);</div>
 <div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>}</div>
 </div>
 <div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span> </div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span><span class="comment">// Multiple Arrays with generic dims</span></div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span> </div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> str<span class="keywordtype">id</span>e_t&gt;</div>
-<div class="foldopen" id="foldopen00153" data-start="{" data-end="}">
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">  153</a></span>METAL_FUNC ulong2 <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a>(</div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>    uint3 elem,</div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    constant <span class="keyword">const</span> stride_t* a_strides,</div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    constant <span class="keyword">const</span> stride_t* b_strides,</div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    <span class="keywordtype">int</span> ndim) {</div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>  ulong2 loc = {</div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>      ulong(elem.x * a_strides[ndim - 1] + elem.y * a_strides[ndim - 2]),</div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>      ulong(elem.x * b_strides[ndim - 1] + elem.y * b_strides[ndim - 2])};</div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> d = ndim - 3; d &gt;= 0; --d) {</div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    uint l = elem.z % shape[d];</div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>    loc.x += l * a_strides[d];</div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>    loc.y += l * b_strides[d];</div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    elem.z /= shape[d];</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>  }</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>  <span class="keywordflow">return</span> loc;</div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>}</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Str<span class="keywordtype">id</span>eT, <span class="keyword">typename</span> IdxT = Str<span class="keywordtype">id</span>eT&gt;</div>
+<div class="foldopen" id="foldopen00150" data-start="{" data-end="}">
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">  150</a></span>METAL_FUNC IdxT <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3</a>(uint3 elem, constant <span class="keyword">const</span> StrideT strides[3]) {</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>  <span class="keywordflow">return</span> elem.x * IdxT(strides[2]) + elem.y * IdxT(strides[1]) +</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>      elem.z * IdxT(strides[0]);</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>}</div>
 </div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span> </div>
-<div class="foldopen" id="foldopen00171" data-start="{" data-end="}">
-<div class="line"><a id="l00171" name="l00171"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b">  171</a></span>METAL_FUNC ulong3 <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b">elem_to_loc_3_nd</a>(</div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>    uint3 elem,</div>
-<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
-<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
-<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* c_strides,</div>
-<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    <span class="keywordtype">int</span> ndim) {</div>
-<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>  ulong3 loc = {</div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>      elem.x * a_strides[ndim - 1] + elem.y * a_strides[ndim - 2],</div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>      elem.x * b_strides[ndim - 1] + elem.y * b_strides[ndim - 2],</div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>      elem.x * c_strides[ndim - 1] + elem.y * c_strides[ndim - 2]};</div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> d = ndim - 3; d &gt;= 0; --d) {</div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>    uint l = elem.z % shape[d];</div>
-<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>    loc.x += l * a_strides[d];</div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>    loc.y += l * b_strides[d];</div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>    loc.z += l * c_strides[d];</div>
-<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>    elem.z /= shape[d];</div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  }</div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>  <span class="keywordflow">return</span> loc;</div>
-<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>}</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span> </div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span><span class="comment">// Multiple Arrays with generic dims</span></div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span> </div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> Str<span class="keywordtype">id</span>eT, <span class="keyword">typename</span> IdxT = Str<span class="keywordtype">id</span>eT&gt;</div>
+<div class="foldopen" id="foldopen00159" data-start="{" data-end="}">
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">  159</a></span>METAL_FUNC vec&lt;IdxT, 2&gt; <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd</a>(</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>    uint3 elem,</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>    constant <span class="keyword">const</span> StrideT* a_strides,</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    constant <span class="keyword">const</span> StrideT* b_strides,</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>    <span class="keywordtype">int</span> ndim) {</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>  vec&lt;IdxT, 2&gt; loc = {</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>      IdxT(</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>          elem.x * IdxT(a_strides[ndim - 1]) +</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>          IdxT(elem.y) * IdxT(a_strides[ndim - 2])),</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>      IdxT(</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>          elem.x * IdxT(b_strides[ndim - 1]) +</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>          elem.y * IdxT(b_strides[ndim - 2]))};</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> d = ndim - 3; d &gt;= 0; --d) {</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    uint l = elem.z % shape[d];</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>    loc.x += l * IdxT(a_strides[d]);</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>    loc.y += l * IdxT(b_strides[d]);</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>    elem.z /= shape[d];</div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>  }</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>  <span class="keywordflow">return</span> loc;</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>}</div>
 </div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span> </div>
-<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span><span class="comment">// Elem to loc in a loop utils</span></div>
-<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span> </div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span><span class="keyword">template</span> &lt;<span class="keywordtype">int</span> dim, <span class="keyword">typename</span> offset_t = <span class="keywordtype">size_t</span>&gt;</div>
-<div class="foldopen" id="foldopen00197" data-start="{" data-end="};">
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc.html">  197</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a> {</div>
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">  198</a></span>  <a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt;dim - 1, offset_t&gt; <a class="code hl_variable" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">inner_looper</a>;</div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">  199</a></span>  offset_t <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a>{0};</div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">  200</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a>{0};</div>
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span> </div>
-<div class="foldopen" id="foldopen00202" data-start="{" data-end="}">
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">  202</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(<span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
-<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>    <a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a>++;</div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>    <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a> += strides[dim - 1];</div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span> </div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a> &gt;= shape[dim - 1]) {</div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>      <a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a> = 0;</div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>      <a class="code hl_variable" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">inner_looper</a>.next(shape, strides);</div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>      <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a> = <a class="code hl_variable" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">inner_looper</a>.offset;</div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>    }</div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>  }</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span> </div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
+<div class="foldopen" id="foldopen00182" data-start="{" data-end="}">
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733">  182</a></span>METAL_FUNC vec&lt;IdxT, 3&gt; <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733">elem_to_loc_3_nd</a>(</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>    uint3 elem,</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* c_strides,</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    <span class="keywordtype">int</span> ndim) {</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>  vec&lt;IdxT, 3&gt; loc = {</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>      elem.x * IdxT(a_strides[ndim - 1]) + elem.y * IdxT(a_strides[ndim - 2]),</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>      elem.x * IdxT(b_strides[ndim - 1]) + elem.y * IdxT(b_strides[ndim - 2]),</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>      elem.x * IdxT(c_strides[ndim - 1]) + elem.y * IdxT(c_strides[ndim - 2])};</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> d = ndim - 3; d &gt;= 0; --d) {</div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>    uint l = elem.z % shape[d];</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    loc.x += l * IdxT(a_strides[d]);</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>    loc.y += l * IdxT(b_strides[d]);</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>    loc.z += l * IdxT(c_strides[d]);</div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>    elem.z /= shape[d];</div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>  }</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>  <span class="keywordflow">return</span> loc;</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>}</div>
 </div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span> </div>
-<div class="foldopen" id="foldopen00213" data-start="{" data-end="}">
-<div class="line"><a id="l00213" name="l00213"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc.html#add610f331ef8d7d2d1917050890f82b2">  213</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structlooped__elem__to__loc.html#add610f331ef8d7d2d1917050890f82b2">next</a>(<span class="keywordtype">int</span> n, <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>    <a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a> += n;</div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>    <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a> += n * strides[dim - 1];</div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span> </div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a> &gt;= shape[dim - 1]) {</div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>      <span class="keywordtype">int</span> extra = <a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a> - shape[dim - 1];</div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>      <a class="code hl_variable" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a> = 0;</div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>      <a class="code hl_variable" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">inner_looper</a>.next(shape, strides);</div>
-<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>      <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a> = <a class="code hl_variable" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">inner_looper</a>.offset;</div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>      <span class="keywordflow">if</span> (extra &gt; 0) {</div>
-<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>        <a class="code hl_variable" href="backend_2metal_2allocator_8h.html#ae704ab07eac590091daa5fc4aec7bddb">next</a>(extra, shape, strides);</div>
-<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>      }</div>
-<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>    }</div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>  }</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span> </div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span><span class="comment">// Elem to loc in a loop utils</span></div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span> </div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span><span class="keyword">template</span> &lt;<span class="keywordtype">int</span> DIM, <span class="keyword">typename</span> OffsetT = <span class="keywordtype">size_t</span>, <span class="keywordtype">bool</span> General = true&gt;</div>
+<div class="foldopen" id="foldopen00208" data-start="{" data-end="};">
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html">  208</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a> {</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">  209</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>;</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">  210</a></span>  <a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;DIM - 1, OffsetT, General&gt; <a class="code hl_variable" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a>;</div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">  211</a></span>  OffsetT <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a>{0};</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">  212</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a>{0};</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span> </div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b">  214</a></span>  <a class="code hl_function" href="struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b">LoopedElemToLoc</a>(<span class="keywordtype">int</span> <a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>) : <a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>(<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>), <a class="code hl_variable" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a>(<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1) {}</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span> </div>
+<div class="foldopen" id="foldopen00216" data-start="{" data-end="}">
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205">  216</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205">next</a>(<span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> == 0) {</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>    }</div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a>++;</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> += OffsetT(strides[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1]);</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> &gt;= shape[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1]) {</div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> = 0;</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a>.next(shape, strides);</div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> = <a class="code hl_variable" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a>.offset;</div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    }</div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>  }</div>
 </div>
-<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span> </div>
-<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>  offset_t</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span> </div>
 <div class="foldopen" id="foldopen00229" data-start="{" data-end="}">
-<div class="line"><a id="l00229" name="l00229"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">  229</a></span>  <a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(offset_t, <span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>*, <span class="keywordtype">int</span>) {</div>
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a>;</div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>  }</div>
-</div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>};</div>
-</div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span> </div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> offset_t&gt;</div>
-<div class="foldopen" id="foldopen00235" data-start="{" data-end="};">
-<div class="line"><a id="l00235" name="l00235"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">  235</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt;1, offset_t&gt; {</div>
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a7aebc0b0656e3a55d0dbca27a57d600e">  236</a></span>  offset_t <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a>{0};</div>
-<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span> </div>
-<div class="foldopen" id="foldopen00238" data-start="{" data-end="}">
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4">  238</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4">next</a>(<span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a> += strides[0];</div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>  }</div>
-</div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span> </div>
-<div class="foldopen" id="foldopen00242" data-start="{" data-end="}">
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#af2984b35f7d7300d4812e7872b3c8851">  242</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#af2984b35f7d7300d4812e7872b3c8851">next</a>(<span class="keywordtype">int</span> n, <span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>    <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a> += n * strides[0];</div>
-<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>  }</div>
-</div>
-<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span> </div>
-<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>  offset_t</div>
-<div class="foldopen" id="foldopen00247" data-start="{" data-end="}">
-<div class="line"><a id="l00247" name="l00247"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90">  247</a></span>  <a class="code hl_function" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90">location</a>(offset_t, <span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>*, <span class="keywordtype">int</span>) {</div>
-<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a>;</div>
-<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>  }</div>
-</div>
-<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>};</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#a7da7bd04e79ba86f71c535b5a6ec1a2d">  229</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_looped_elem_to_loc.html#a7da7bd04e79ba86f71c535b5a6ec1a2d">next</a>(<span class="keywordtype">int</span> n, <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> == 0) {</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>    }</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> += n;</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> += n * OffsetT(strides[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1]);</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span> </div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> &gt;= shape[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1]) {</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>      <span class="keywordtype">int</span> extra = <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> - shape[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1];</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>      <span class="keywordflow">if</span> (extra &gt;= shape[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1]) {</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>        <a class="code hl_variable" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a>.next(1 + extra / shape[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1], shape, strides);</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>        extra = extra % shape[<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> - 1];</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>      } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>        <a class="code hl_variable" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a>.next(shape, strides);</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>      }</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> = 0;</div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> = <a class="code hl_variable" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a>.offset;</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>      <span class="keywordflow">if</span> (extra &gt; 0) {</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>        <a class="code hl_variable" href="backend_2metal_2allocator_8h.html#ae704ab07eac590091daa5fc4aec7bddb">next</a>(extra, shape, strides);</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>      }</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    }</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>  }</div>
 </div>
 <div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span> </div>
-<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> offset_t&gt;</div>
-<div class="foldopen" id="foldopen00253" data-start="{" data-end="};">
-<div class="line"><a id="l00253" name="l00253"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html">  253</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt;0, offset_t&gt; {</div>
-<div class="line"><a id="l00254" name="l00254"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0">  254</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0">next</a>(<span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>*) {}</div>
-<div class="line"><a id="l00255" name="l00255"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a1064cdfdcef779b5628ce5357a6fe4f0">  255</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a1064cdfdcef779b5628ce5357a6fe4f0">next</a>(<span class="keywordtype">int</span>, <span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>*) {}</div>
+<div class="foldopen" id="foldopen00252" data-start="{" data-end="}">
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e">  252</a></span>  OffsetT <a class="code hl_function" href="struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e">location</a>() {</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a>;</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>  }</div>
+</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>};</div>
+</div>
 <div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span> </div>
-<div class="foldopen" id="foldopen00257" data-start="{" data-end="}">
-<div class="line"><a id="l00257" name="l00257"></a><span class="lineno"><a class="line" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2">  257</a></span>  offset_t <a class="code hl_function" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2">location</a>(</div>
-<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>      offset_t idx,</div>
-<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>      <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>      <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides,</div>
-<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>      <span class="keywordtype">int</span> ndim) {</div>
-<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>    <span class="keywordflow">return</span> <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(idx, shape, strides, ndim);</div>
-<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>  }</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> OffsetT&gt;</div>
+<div class="foldopen" id="foldopen00258" data-start="{" data-end="};">
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">  258</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;1, OffsetT, true&gt; {</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e">  259</a></span>  <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e">dim</a>;</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a3a18944c158e2747a6ddebb420299a3b">  260</a></span>  OffsetT <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a>{0};</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a2cd3b616739b3d5b41e5b46ae335957d">  261</a></span>  uint <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a>{0};</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span> </div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86">  263</a></span>  <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86">LoopedElemToLoc</a>(<span class="keywordtype">int</span> <a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>) : <a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>(<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>) {}</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span> </div>
+<div class="foldopen" id="foldopen00265" data-start="{" data-end="}">
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c">  265</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c">next</a>(<span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a>++;</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> &gt; 1) {</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, OffsetT&gt;</a>(<a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a>, shape, strides, <a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>);</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> += OffsetT(strides[0]);</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>    }</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>  }</div>
 </div>
-<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>};</div>
-</div>
-<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span> </div>
-<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span><span class="comment">// Calculation utils</span></div>
-<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span> </div>
-<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
-<div class="foldopen" id="foldopen00272" data-start="{" data-end="}">
-<div class="line"><a id="l00272" name="l00272"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">  272</a></span><span class="keyword">inline</span> T <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a>(T N, U M) {</div>
-<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>  <span class="keywordflow">return</span> (N + M - 1) / M;</div>
-<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>}</div>
-</div>
-<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span> </div>
-<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span><span class="comment">// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202</span></div>
-<div class="foldopen" id="foldopen00277" data-start="{" data-end="}">
-<div class="line"><a id="l00277" name="l00277"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">  277</a></span><span class="keyword">inline</span> <span class="keywordtype">float</span> <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a>(<span class="keywordtype">float</span> x) {</div>
-<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>  <span class="keywordtype">float</span> xp1 = 1.0f + x;</div>
-<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>  <span class="keywordflow">if</span> (xp1 == <a class="code hl_struct" href="struct_limits.html">Limits&lt;float&gt;::max</a>) {</div>
-<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct_limits.html">Limits&lt;float&gt;::max</a>;</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span> </div>
+<div class="foldopen" id="foldopen00274" data-start="{" data-end="}">
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a8fe55b3a2fa8cd35af568085faed785d">  274</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a8fe55b3a2fa8cd35af568085faed785d">next</a>(<span class="keywordtype">int</span> n, <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> += n;</div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a> &gt; 1) {</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, OffsetT&gt;</a>(<a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a>, shape, strides, <a class="code hl_variable" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a>);</div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>      <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> = <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a> * OffsetT(strides[0]);</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>    }</div>
 <div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>  }</div>
-<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>  <span class="keywordflow">if</span> (xp1 == 1.0f) {</div>
-<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>    <span class="keywordflow">return</span> x;</div>
-<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>  }</div>
-<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span> </div>
-<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>  <span class="keywordflow">return</span> x * (<a class="code hl_function" href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a>(xp1) / (xp1 - 1.0f));</div>
-<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>}</div>
 </div>
-<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span> </div>
-<div class="foldopen" id="foldopen00289" data-start="{" data-end="}">
-<div class="line"><a id="l00289" name="l00289"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a3501b665c8837eabf9789ea27a7d6946">  289</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> x) {</div>
-<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>  <span class="keywordtype">float</span> xp1 = 1.0f + <span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x);</div>
-<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>  <span class="keywordflow">if</span> (xp1 == <a class="code hl_struct" href="struct_limits.html">Limits&lt;float&gt;::max</a>) {</div>
-<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct_limits.html">Limits&lt;bfloat16_t&gt;::max</a>;</div>
-<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>  }</div>
-<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span>  <span class="keywordflow">if</span> (xp1 == 1.0f) {</div>
-<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>    <span class="keywordflow">return</span> x;</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span> </div>
+<div class="foldopen" id="foldopen00283" data-start="{" data-end="}">
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f">  283</a></span>  OffsetT <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f">location</a>() {</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a>;</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>  }</div>
+</div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>};</div>
+</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span> </div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> OffsetT&gt;</div>
+<div class="foldopen" id="foldopen00289" data-start="{" data-end="};">
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">  289</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;1, OffsetT, false&gt; {</div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af792b1fd4e8286f97b9b863c127a2d9a">  290</a></span>  OffsetT <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a>{0};</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span> </div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de">  292</a></span>  <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de">LoopedElemToLoc</a>(<span class="keywordtype">int</span>) {}</div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span> </div>
+<div class="foldopen" id="foldopen00294" data-start="{" data-end="}">
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15">  294</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15">next</a>(<span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> += OffsetT(strides[0]);</div>
 <div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>  }</div>
+</div>
 <div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span> </div>
-<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>  <span class="keywordflow">return</span> <a class="code hl_typedef" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>(x * (<a class="code hl_function" href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a>(xp1) / (xp1 - 1.0f)));</div>
-<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>}</div>
+<div class="foldopen" id="foldopen00298" data-start="{" data-end="}">
+<div class="line"><a id="l00298" name="l00298"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af8f2b29946324756c09951b69e170dd8">  298</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af8f2b29946324756c09951b69e170dd8">next</a>(<span class="keywordtype">int</span> n, <span class="keyword">const</span> constant <span class="keywordtype">int</span>*, <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides) {</div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>    <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a> += n * OffsetT(strides[0]);</div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>  }</div>
 </div>
-<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span> </div>
-<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span><span class="comment">// SIMD shuffle ops</span></div>
-<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span> </div>
-<div class="foldopen" id="foldopen00305" data-start="{" data-end="}">
-<div class="line"><a id="l00305" name="l00305"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c">  305</a></span><span class="keyword">inline</span> uint64_t <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(uint64_t data, uint16_t delta) {</div>
-<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(</div>
-<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span>      <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a>(as_type&lt;uint2&gt;(data), delta));</div>
-<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>}</div>
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span> </div>
+<div class="foldopen" id="foldopen00302" data-start="{" data-end="}">
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno"><a class="line" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef">  302</a></span>  OffsetT <a class="code hl_function" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef">location</a>() {</div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a>;</div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>  }</div>
 </div>
-<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span> </div>
-<div class="foldopen" id="foldopen00310" data-start="{" data-end="}">
-<div class="line"><a id="l00310" name="l00310"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a0c1e4d782fcc56e1ab5565cef12430dd">  310</a></span><span class="keyword">inline</span> int64_t <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(int64_t data, uint16_t delta) {</div>
-<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(</div>
-<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>      <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a>(as_type&lt;uint2&gt;(data), delta));</div>
-<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>}</div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>};</div>
 </div>
-<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span> </div>
-<div class="foldopen" id="foldopen00315" data-start="{" data-end="}">
-<div class="line"><a id="l00315" name="l00315"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a48ae83a8caf5c74810df60b6c6cdb062">  315</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(<span class="keywordtype">bool</span> data, uint16_t delta) {</div>
-<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(<span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), delta);</div>
-<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>}</div>
+<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span> </div>
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span><span class="comment">// Calculation utils</span></div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span> </div>
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
+<div class="foldopen" id="foldopen00313" data-start="{" data-end="}">
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">  313</a></span><span class="keyword">inline</span> T <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a>(T N, U M) {</div>
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>  <span class="keywordflow">return</span> (N + M - 1) / M;</div>
+<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>}</div>
 </div>
-<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span> </div>
-<div class="foldopen" id="foldopen00319" data-start="{" data-end="}">
-<div class="line"><a id="l00319" name="l00319"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ad9a671a5f9aaa729ae7a77026f16bcb0">  319</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data, uint16_t delta) {</div>
-<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
-<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>      <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, delta), <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, delta));</div>
-<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>}</div>
+<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span> </div>
+<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span><span class="comment">// https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#1202</span></div>
+<div class="foldopen" id="foldopen00318" data-start="{" data-end="}">
+<div class="line"><a id="l00318" name="l00318"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">  318</a></span><span class="keyword">inline</span> <span class="keywordtype">float</span> <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a>(<span class="keywordtype">float</span> x) {</div>
+<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>  <span class="keywordtype">float</span> xp1 = 1.0f + x;</div>
+<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>  <span class="keywordflow">if</span> (xp1 == <a class="code hl_struct" href="struct_limits.html">Limits&lt;float&gt;::max</a>) {</div>
+<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct_limits.html">Limits&lt;float&gt;::max</a>;</div>
+<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>  }</div>
+<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>  <span class="keywordflow">if</span> (xp1 == 1.0f) {</div>
+<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>    <span class="keywordflow">return</span> x;</div>
+<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>  }</div>
+<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span> </div>
+<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span>  <span class="keywordflow">return</span> x * (<a class="code hl_function" href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a>(xp1) / (xp1 - 1.0f));</div>
+<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>}</div>
 </div>
-<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span> </div>
-<div class="foldopen" id="foldopen00324" data-start="{" data-end="}">
-<div class="line"><a id="l00324" name="l00324"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a39e436e0a942912266aae7e0bd82d7c0">  324</a></span><span class="keyword">inline</span> uint64_t <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(uint64_t data, uint16_t delta) {</div>
-<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(<a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a>(as_type&lt;uint2&gt;(data), delta));</div>
-<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span>}</div>
+<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span> </div>
+<div class="foldopen" id="foldopen00330" data-start="{" data-end="}">
+<div class="line"><a id="l00330" name="l00330"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a3501b665c8837eabf9789ea27a7d6946">  330</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> x) {</div>
+<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>  <span class="keywordtype">float</span> xp1 = 1.0f + <span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(x);</div>
+<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span>  <span class="keywordflow">if</span> (xp1 == <a class="code hl_struct" href="struct_limits.html">Limits&lt;float&gt;::max</a>) {</div>
+<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>    <span class="keywordflow">return</span> <a class="code hl_struct" href="struct_limits.html">Limits&lt;bfloat16_t&gt;::max</a>;</div>
+<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>  }</div>
+<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>  <span class="keywordflow">if</span> (xp1 == 1.0f) {</div>
+<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>    <span class="keywordflow">return</span> x;</div>
+<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>  }</div>
+<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span> </div>
+<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>  <span class="keywordflow">return</span> <a class="code hl_typedef" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>(x * (<a class="code hl_function" href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a>(xp1) / (xp1 - 1.0f)));</div>
+<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span>}</div>
 </div>
-<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span> </div>
-<div class="foldopen" id="foldopen00328" data-start="{" data-end="}">
-<div class="line"><a id="l00328" name="l00328"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a617f3857caf33c569afa6148135f8b7a">  328</a></span><span class="keyword">inline</span> int64_t <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(int64_t data, uint16_t delta) {</div>
-<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(<a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a>(as_type&lt;uint2&gt;(data), delta));</div>
-<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>}</div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span> </div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span><span class="comment">// SIMD shuffle ops</span></div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span> </div>
+<div class="foldopen" id="foldopen00346" data-start="{" data-end="}">
+<div class="line"><a id="l00346" name="l00346"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c">  346</a></span><span class="keyword">inline</span> uint64_t <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(uint64_t data, uint16_t delta) {</div>
+<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(</div>
+<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>      <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a>(as_type&lt;uint2&gt;(data), delta));</div>
+<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>}</div>
 </div>
-<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span> </div>
-<div class="foldopen" id="foldopen00332" data-start="{" data-end="}">
-<div class="line"><a id="l00332" name="l00332"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ae0f5c42020275a588234e69f1eb7a485">  332</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(<span class="keywordtype">bool</span> data, uint16_t delta) {</div>
-<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(<span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), delta);</div>
-<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>}</div>
+<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span> </div>
+<div class="foldopen" id="foldopen00351" data-start="{" data-end="}">
+<div class="line"><a id="l00351" name="l00351"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a0c1e4d782fcc56e1ab5565cef12430dd">  351</a></span><span class="keyword">inline</span> int64_t <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(int64_t data, uint16_t delta) {</div>
+<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(</div>
+<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>      <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a>(as_type&lt;uint2&gt;(data), delta));</div>
+<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>}</div>
 </div>
-<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span> </div>
-<div class="foldopen" id="foldopen00336" data-start="{" data-end="}">
-<div class="line"><a id="l00336" name="l00336"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a92b455bac6a23af51c35ea83de2383eb">  336</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data, uint16_t delta) {</div>
-<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
-<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span>      <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, delta), <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, delta));</div>
-<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>}</div>
+<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span> </div>
+<div class="foldopen" id="foldopen00356" data-start="{" data-end="}">
+<div class="line"><a id="l00356" name="l00356"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a48ae83a8caf5c74810df60b6c6cdb062">  356</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(<span class="keywordtype">bool</span> data, uint16_t delta) {</div>
+<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(<span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), delta);</div>
+<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>}</div>
 </div>
-<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span> </div>
-<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span><span class="keyword">inline</span> uint64_t</div>
-<div class="foldopen" id="foldopen00342" data-start="{" data-end="}">
-<div class="line"><a id="l00342" name="l00342"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">  342</a></span><a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(uint64_t data, uint64_t filling, uint16_t delta) {</div>
-<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a>(</div>
-<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>      as_type&lt;uint2&gt;(data), as_type&lt;uint2&gt;(filling), delta));</div>
-<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>}</div>
+<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span> </div>
+<div class="foldopen" id="foldopen00360" data-start="{" data-end="}">
+<div class="line"><a id="l00360" name="l00360"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ad9a671a5f9aaa729ae7a77026f16bcb0">  360</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data, uint16_t delta) {</div>
+<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
+<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>      <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, delta), <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, delta));</div>
+<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>}</div>
 </div>
-<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span> </div>
-<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span><span class="keyword">inline</span> int64_t</div>
-<div class="foldopen" id="foldopen00348" data-start="{" data-end="}">
-<div class="line"><a id="l00348" name="l00348"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a7bb56415c5412a6a26f70a990915f064">  348</a></span><a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(int64_t data, int64_t filling, uint16_t delta) {</div>
-<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a>(</div>
-<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>      as_type&lt;uint2&gt;(data), as_type&lt;uint2&gt;(filling), delta));</div>
-<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>}</div>
+<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span> </div>
+<div class="foldopen" id="foldopen00365" data-start="{" data-end="}">
+<div class="line"><a id="l00365" name="l00365"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a39e436e0a942912266aae7e0bd82d7c0">  365</a></span><span class="keyword">inline</span> uint64_t <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(uint64_t data, uint16_t delta) {</div>
+<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(<a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a>(as_type&lt;uint2&gt;(data), delta));</div>
+<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span>}</div>
 </div>
-<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span> </div>
-<div class="foldopen" id="foldopen00353" data-start="{" data-end="}">
-<div class="line"><a id="l00353" name="l00353"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ad55bd473647f2c6c68e65e5312c132d1">  353</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(<span class="keywordtype">bool</span> data, <span class="keywordtype">bool</span> filling, uint16_t delta) {</div>
-<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(</div>
-<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>      <span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), <span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(filling), delta);</div>
-<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>}</div>
+<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span> </div>
+<div class="foldopen" id="foldopen00369" data-start="{" data-end="}">
+<div class="line"><a id="l00369" name="l00369"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a617f3857caf33c569afa6148135f8b7a">  369</a></span><span class="keyword">inline</span> int64_t <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(int64_t data, uint16_t delta) {</div>
+<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(<a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a>(as_type&lt;uint2&gt;(data), delta));</div>
+<div class="line"><a id="l00371" name="l00371"></a><span class="lineno">  371</span>}</div>
 </div>
-<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span> </div>
-<div class="foldopen" id="foldopen00358" data-start="{" data-end="}">
-<div class="line"><a id="l00358" name="l00358"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a94e02a6ae8c39cbf4cb23aa44df9dbd5">  358</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(</div>
-<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>    <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data,</div>
-<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>    <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> filling,</div>
-<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>    uint16_t delta) {</div>
-<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
-<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>      <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, filling.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, delta),</div>
-<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>      <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, filling.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, delta));</div>
-<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span>}</div>
+<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span> </div>
+<div class="foldopen" id="foldopen00373" data-start="{" data-end="}">
+<div class="line"><a id="l00373" name="l00373"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ae0f5c42020275a588234e69f1eb7a485">  373</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(<span class="keywordtype">bool</span> data, uint16_t delta) {</div>
+<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(<span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), delta);</div>
+<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span>}</div>
 </div>
-<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span> </div>
-<div class="foldopen" id="foldopen00367" data-start="{" data-end="}">
-<div class="line"><a id="l00367" name="l00367"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2">  367</a></span><span class="keyword">inline</span> uint64_t <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(uint64_t data, uint16_t lane) {</div>
-<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a>(as_type&lt;uint2&gt;(data), lane));</div>
-<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span>}</div>
+<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span> </div>
+<div class="foldopen" id="foldopen00377" data-start="{" data-end="}">
+<div class="line"><a id="l00377" name="l00377"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a92b455bac6a23af51c35ea83de2383eb">  377</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data, uint16_t delta) {</div>
+<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
+<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span>      <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, delta), <a class="code hl_function" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, delta));</div>
+<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>}</div>
 </div>
-<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span> </div>
-<div class="foldopen" id="foldopen00371" data-start="{" data-end="}">
-<div class="line"><a id="l00371" name="l00371"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a3bdbdfeb7a1dde40cd3ce1df8d9213b5">  371</a></span><span class="keyword">inline</span> int64_t <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(int64_t data, uint16_t lane) {</div>
-<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a>(as_type&lt;uint2&gt;(data), lane));</div>
-<div class="line"><a id="l00373" name="l00373"></a><span class="lineno">  373</span>}</div>
+<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span> </div>
+<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span><span class="keyword">inline</span> uint64_t</div>
+<div class="foldopen" id="foldopen00383" data-start="{" data-end="}">
+<div class="line"><a id="l00383" name="l00383"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">  383</a></span><a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(uint64_t data, uint64_t filling, uint16_t delta) {</div>
+<div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a>(</div>
+<div class="line"><a id="l00385" name="l00385"></a><span class="lineno">  385</span>      as_type&lt;uint2&gt;(data), as_type&lt;uint2&gt;(filling), delta));</div>
+<div class="line"><a id="l00386" name="l00386"></a><span class="lineno">  386</span>}</div>
 </div>
-<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span> </div>
-<div class="foldopen" id="foldopen00375" data-start="{" data-end="}">
-<div class="line"><a id="l00375" name="l00375"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ab4cbcdb054f9165130da91a3334da0cf">  375</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(<span class="keywordtype">bool</span> data, uint16_t lane) {</div>
-<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(<span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), lane);</div>
-<div class="line"><a id="l00377" name="l00377"></a><span class="lineno">  377</span>}</div>
+<div class="line"><a id="l00387" name="l00387"></a><span class="lineno">  387</span> </div>
+<div class="line"><a id="l00388" name="l00388"></a><span class="lineno">  388</span><span class="keyword">inline</span> int64_t</div>
+<div class="foldopen" id="foldopen00389" data-start="{" data-end="}">
+<div class="line"><a id="l00389" name="l00389"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a7bb56415c5412a6a26f70a990915f064">  389</a></span><a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(int64_t data, int64_t filling, uint16_t delta) {</div>
+<div class="line"><a id="l00390" name="l00390"></a><span class="lineno">  390</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a>(</div>
+<div class="line"><a id="l00391" name="l00391"></a><span class="lineno">  391</span>      as_type&lt;uint2&gt;(data), as_type&lt;uint2&gt;(filling), delta));</div>
+<div class="line"><a id="l00392" name="l00392"></a><span class="lineno">  392</span>}</div>
 </div>
-<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span> </div>
-<div class="foldopen" id="foldopen00379" data-start="{" data-end="}">
-<div class="line"><a id="l00379" name="l00379"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ab8175b66bcc080fb89f738143568c30b">  379</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data, uint16_t lane) {</div>
-<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
-<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span>      <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, lane), <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, lane));</div>
-<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span>}</div>
+<div class="line"><a id="l00393" name="l00393"></a><span class="lineno">  393</span> </div>
+<div class="foldopen" id="foldopen00394" data-start="{" data-end="}">
+<div class="line"><a id="l00394" name="l00394"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ad55bd473647f2c6c68e65e5312c132d1">  394</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(<span class="keywordtype">bool</span> data, <span class="keywordtype">bool</span> filling, uint16_t delta) {</div>
+<div class="line"><a id="l00395" name="l00395"></a><span class="lineno">  395</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(</div>
+<div class="line"><a id="l00396" name="l00396"></a><span class="lineno">  396</span>      <span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), <span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(filling), delta);</div>
+<div class="line"><a id="l00397" name="l00397"></a><span class="lineno">  397</span>}</div>
+</div>
+<div class="line"><a id="l00398" name="l00398"></a><span class="lineno">  398</span> </div>
+<div class="foldopen" id="foldopen00399" data-start="{" data-end="}">
+<div class="line"><a id="l00399" name="l00399"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a94e02a6ae8c39cbf4cb23aa44df9dbd5">  399</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(</div>
+<div class="line"><a id="l00400" name="l00400"></a><span class="lineno">  400</span>    <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data,</div>
+<div class="line"><a id="l00401" name="l00401"></a><span class="lineno">  401</span>    <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> filling,</div>
+<div class="line"><a id="l00402" name="l00402"></a><span class="lineno">  402</span>    uint16_t delta) {</div>
+<div class="line"><a id="l00403" name="l00403"></a><span class="lineno">  403</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
+<div class="line"><a id="l00404" name="l00404"></a><span class="lineno">  404</span>      <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, filling.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, delta),</div>
+<div class="line"><a id="l00405" name="l00405"></a><span class="lineno">  405</span>      <a class="code hl_function" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, filling.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, delta));</div>
+<div class="line"><a id="l00406" name="l00406"></a><span class="lineno">  406</span>}</div>
+</div>
+<div class="line"><a id="l00407" name="l00407"></a><span class="lineno">  407</span> </div>
+<div class="foldopen" id="foldopen00408" data-start="{" data-end="}">
+<div class="line"><a id="l00408" name="l00408"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2">  408</a></span><span class="keyword">inline</span> uint64_t <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(uint64_t data, uint16_t lane) {</div>
+<div class="line"><a id="l00409" name="l00409"></a><span class="lineno">  409</span>  <span class="keywordflow">return</span> as_type&lt;uint64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a>(as_type&lt;uint2&gt;(data), lane));</div>
+<div class="line"><a id="l00410" name="l00410"></a><span class="lineno">  410</span>}</div>
+</div>
+<div class="line"><a id="l00411" name="l00411"></a><span class="lineno">  411</span> </div>
+<div class="foldopen" id="foldopen00412" data-start="{" data-end="}">
+<div class="line"><a id="l00412" name="l00412"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#a3bdbdfeb7a1dde40cd3ce1df8d9213b5">  412</a></span><span class="keyword">inline</span> int64_t <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(int64_t data, uint16_t lane) {</div>
+<div class="line"><a id="l00413" name="l00413"></a><span class="lineno">  413</span>  <span class="keywordflow">return</span> as_type&lt;int64_t&gt;(<a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a>(as_type&lt;uint2&gt;(data), lane));</div>
+<div class="line"><a id="l00414" name="l00414"></a><span class="lineno">  414</span>}</div>
+</div>
+<div class="line"><a id="l00415" name="l00415"></a><span class="lineno">  415</span> </div>
+<div class="foldopen" id="foldopen00416" data-start="{" data-end="}">
+<div class="line"><a id="l00416" name="l00416"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ab4cbcdb054f9165130da91a3334da0cf">  416</a></span><span class="keyword">inline</span> <span class="keywordtype">bool</span> <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(<span class="keywordtype">bool</span> data, uint16_t lane) {</div>
+<div class="line"><a id="l00417" name="l00417"></a><span class="lineno">  417</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(<span class="keyword">static_cast&lt;</span>uint32_t<span class="keyword">&gt;</span>(data), lane);</div>
+<div class="line"><a id="l00418" name="l00418"></a><span class="lineno">  418</span>}</div>
+</div>
+<div class="line"><a id="l00419" name="l00419"></a><span class="lineno">  419</span> </div>
+<div class="foldopen" id="foldopen00420" data-start="{" data-end="}">
+<div class="line"><a id="l00420" name="l00420"></a><span class="lineno"><a class="line" href="backend_2metal_2kernels_2utils_8h.html#ab8175b66bcc080fb89f738143568c30b">  420</a></span><span class="keyword">inline</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(<a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a> data, uint16_t lane) {</div>
+<div class="line"><a id="l00421" name="l00421"></a><span class="lineno">  421</span>  <span class="keywordflow">return</span> <a class="code hl_struct" href="structcomplex64__t.html">complex64_t</a>(</div>
+<div class="line"><a id="l00422" name="l00422"></a><span class="lineno">  422</span>      <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">real</a>, lane), <a class="code hl_function" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a>(data.<a class="code hl_variable" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">imag</a>, lane));</div>
+<div class="line"><a id="l00423" name="l00423"></a><span class="lineno">  423</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2allocator_8h_html_ae704ab07eac590091daa5fc4aec7bddb"><div class="ttname"><a href="backend_2metal_2allocator_8h.html#ae704ab07eac590091daa5fc4aec7bddb">next</a></div><div class="ttdeci">BufferHolder * next</div><div class="ttdef"><b>Definition</b> allocator.h:38</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html">bf16.h</a></div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a7782de82393104dd4ad754ce3b316e82"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></div><div class="ttdeci">struct _MLX_BFloat16 bfloat16_t</div><div class="ttdef"><b>Definition</b> bf16.h:257</div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2complex_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2complex_8h.html">complex.h</a></div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a01c9309978a6c12f79b6e4108728a953"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const stride_t *a_strides, constant const stride_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:153</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a196a07022b812b241d4c06192c0fa83d"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_1(uint elem, constant const stride_t &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:133</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a1e520e23f58ca645dea1ac20998d987a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a></div><div class="ttdeci">#define instantiate_float_limit(type)</div><div class="ttdef"><b>Definition</b> utils.h:44</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a27c03f2f90ab56db2e4d59559a3d2e9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a></div><div class="ttdeci">float log1p(float x)</div><div class="ttdef"><b>Definition</b> utils.h:277</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a2c34ed54714c69e6e1b44344f9e6e330"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_3(uint3 elem, constant const stride_t strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a66940b1cc3d64651d24634bc696d528b"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b">elem_to_loc_3_nd</a></div><div class="ttdeci">METAL_FUNC ulong3 elem_to_loc_3_nd(uint3 elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:171</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8e5a4b0fb5d018d7b078d147efe4f1e3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a></div><div class="ttdeci">T ceildiv(T N, U M)</div><div class="ttdoc">Compute ceil((float)N/(float)M)</div><div class="ttdef"><b>Definition</b> utils.h:272</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_abedffa358e7ba7782cc78d6772064c7c"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a></div><div class="ttdeci">#define instantiate_default_limit(type)</div><div class="ttdef"><b>Definition</b> utils.h:24</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_acb8ddf4a29129846b673c50ba7078773"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a></div><div class="ttdeci">half float16_t</div><div class="ttdef"><b>Definition</b> utils.h:10</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ad6c45cacca97899cd362df49c06fea79"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_2(uint2 elem, constant const stride_t strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:139</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a7782de82393104dd4ad754ce3b316e82"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></div><div class="ttdeci">struct _MLX_BFloat16 bfloat16_t</div><div class="ttdef"><b>Definition</b> bf16.h:251</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a1e520e23f58ca645dea1ac20998d987a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a">instantiate_float_limit</a></div><div class="ttdeci">#define instantiate_float_limit(type)</div><div class="ttdef"><b>Definition</b> utils.h:50</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a27c03f2f90ab56db2e4d59559a3d2e9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a></div><div class="ttdeci">float log1p(float x)</div><div class="ttdef"><b>Definition</b> utils.h:318</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a43f33efc000962d6de881a3aab7458de"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const StrideT strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a650f8ea8cf9f9519da9e301aad0308dc"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const StrideT strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:150</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a65d87b425e1f8ca19df97c15049f8733"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733">elem_to_loc_3_nd</a></div><div class="ttdeci">METAL_FUNC vec&lt; IdxT, 3 &gt; elem_to_loc_3_nd(uint3 elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:182</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a66a2d7eec0262b12db16cd6c781ccf9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC vec&lt; IdxT, 2 &gt; elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const StrideT *a_strides, constant const StrideT *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:159</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8e5a4b0fb5d018d7b078d147efe4f1e3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a></div><div class="ttdeci">T ceildiv(T N, U M)</div><div class="ttdoc">Compute ceil((float)N/(float)M)</div><div class="ttdef"><b>Definition</b> utils.h:313</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_abedffa358e7ba7782cc78d6772064c7c"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c">instantiate_default_limit</a></div><div class="ttdeci">#define instantiate_default_limit(type)</div><div class="ttdef"><b>Definition</b> utils.h:30</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ac612d0ae30b8257198339debe04916a3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const StrideT &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:140</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_acb8ddf4a29129846b673c50ba7078773"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a></div><div class="ttdeci">half float16_t</div><div class="ttdef"><b>Definition</b> utils.h:16</div></div>
+<div class="ttc" id="abf16__math_8h_html"><div class="ttname"><a href="bf16__math_8h.html">bf16_math.h</a></div></div>
 <div class="ttc" id="adefines_8h_html"><div class="ttname"><a href="defines_8h.html">defines.h</a></div></div>
-<div class="ttc" id="anamespacemetal_html_a1ca14116bf50639b214d8414b5bbaaa6"><div class="ttname"><a href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)</div><div class="ttdef"><b>Definition</b> bf16_math.h:391</div></div>
-<div class="ttc" id="anamespacemetal_html_a259ed115bc3c58f88eb35830916b26d4"><div class="ttname"><a href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle(bfloat16_t data, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> bf16_math.h:391</div></div>
-<div class="ttc" id="anamespacemetal_html_a423a9f4f2fc7ef5ec7eda061277b51b6"><div class="ttname"><a href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_af6e2dd7ae087aba6abac4f0350b7611c"><div class="ttname"><a href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:391</div></div>
-<div class="ttc" id="anamespacemetal_html_afe81c5fbde3f4890458b081909242c55"><div class="ttname"><a href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_up(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:391</div></div>
-<div class="ttc" id="anamespacepocketfft_1_1detail_html_afb987c919e9424a996d0fc8b3c23cc84"><div class="ttname"><a href="namespacepocketfft_1_1detail.html#afb987c919e9424a996d0fc8b3c23cc84">pocketfft::detail::stride_t</a></div><div class="ttdeci">std::vector&lt; ptrdiff_t &gt; stride_t</div><div class="ttdef"><b>Definition</b> pocketfft.h:103</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
-<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:17</div></div>
-<div class="ttc" id="astruct_limits_html_a2f0673b6f9da89ce1d64f9f3d74f50a8"><div class="ttname"><a href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">Limits::max</a></div><div class="ttdeci">static const constant U max</div><div class="ttdef"><b>Definition</b> utils.h:18</div></div>
-<div class="ttc" id="astruct_limits_html_a5a3eae6d244fbea2aa7b9200001463e5"><div class="ttname"><a href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">Limits::finite_max</a></div><div class="ttdeci">static const constant U finite_max</div><div class="ttdef"><b>Definition</b> utils.h:20</div></div>
-<div class="ttc" id="astruct_limits_html_a6e81584ba65a4dc6ff9366b458e3a20e"><div class="ttname"><a href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">Limits::min</a></div><div class="ttdeci">static const constant U min</div><div class="ttdef"><b>Definition</b> utils.h:19</div></div>
-<div class="ttc" id="astruct_limits_html_ae7469d21f2688797ca3e388d919ef05e"><div class="ttname"><a href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">Limits::finite_min</a></div><div class="ttdeci">static const constant U finite_min</div><div class="ttdef"><b>Definition</b> utils.h:21</div></div>
+<div class="ttc" id="anamespacemetal_html_a1ca14116bf50639b214d8414b5bbaaa6"><div class="ttname"><a href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)</div><div class="ttdef"><b>Definition</b> bf16_math.h:377</div></div>
+<div class="ttc" id="anamespacemetal_html_a259ed115bc3c58f88eb35830916b26d4"><div class="ttname"><a href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle(bfloat16_t data, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> bf16_math.h:377</div></div>
+<div class="ttc" id="anamespacemetal_html_a423a9f4f2fc7ef5ec7eda061277b51b6"><div class="ttname"><a href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_af6e2dd7ae087aba6abac4f0350b7611c"><div class="ttname"><a href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:377</div></div>
+<div class="ttc" id="anamespacemetal_html_afe81c5fbde3f4890458b081909242c55"><div class="ttname"><a href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_up(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:377</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
+<div class="ttc" id="astruct_limits_html_a2f0673b6f9da89ce1d64f9f3d74f50a8"><div class="ttname"><a href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">Limits::max</a></div><div class="ttdeci">static const constant U max</div><div class="ttdef"><b>Definition</b> utils.h:24</div></div>
+<div class="ttc" id="astruct_limits_html_a5a3eae6d244fbea2aa7b9200001463e5"><div class="ttname"><a href="struct_limits.html#a5a3eae6d244fbea2aa7b9200001463e5">Limits::finite_max</a></div><div class="ttdeci">static const constant U finite_max</div><div class="ttdef"><b>Definition</b> utils.h:26</div></div>
+<div class="ttc" id="astruct_limits_html_a6e81584ba65a4dc6ff9366b458e3a20e"><div class="ttname"><a href="struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e">Limits::min</a></div><div class="ttdeci">static const constant U min</div><div class="ttdef"><b>Definition</b> utils.h:25</div></div>
+<div class="ttc" id="astruct_limits_html_ae7469d21f2688797ca3e388d919ef05e"><div class="ttname"><a href="struct_limits.html#ae7469d21f2688797ca3e388d919ef05e">Limits::finite_min</a></div><div class="ttdeci">static const constant U finite_min</div><div class="ttdef"><b>Definition</b> utils.h:27</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4_html_a03f3ca7a60bb85e36d7eba75e0e08b15"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15">LoopedElemToLoc&lt; 1, OffsetT, false &gt;::next</a></div><div class="ttdeci">void next(const constant int *, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:294</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4_html_a0e21977d9f23b6994773e8e4f3ee70de"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de">LoopedElemToLoc&lt; 1, OffsetT, false &gt;::LoopedElemToLoc</a></div><div class="ttdeci">LoopedElemToLoc(int)</div><div class="ttdef"><b>Definition</b> utils.h:292</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4_html_a89d9ec4dc2f2f0d77e27aa0c05f261ef"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef">LoopedElemToLoc&lt; 1, OffsetT, false &gt;::location</a></div><div class="ttdeci">OffsetT location()</div><div class="ttdef"><b>Definition</b> utils.h:302</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4_html_af8f2b29946324756c09951b69e170dd8"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af8f2b29946324756c09951b69e170dd8">LoopedElemToLoc&lt; 1, OffsetT, false &gt;::next</a></div><div class="ttdeci">void next(int n, const constant int *, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:298</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4_html_a66b84b12f6c1494e5908989ed2849a9f"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f">LoopedElemToLoc&lt; 1, OffsetT, true &gt;::location</a></div><div class="ttdeci">OffsetT location()</div><div class="ttdef"><b>Definition</b> utils.h:283</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4_html_a7be6bf560080472d61e74b522979ef1e"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e">LoopedElemToLoc&lt; 1, OffsetT, true &gt;::dim</a></div><div class="ttdeci">int dim</div><div class="ttdef"><b>Definition</b> utils.h:259</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4_html_a8fe55b3a2fa8cd35af568085faed785d"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a8fe55b3a2fa8cd35af568085faed785d">LoopedElemToLoc&lt; 1, OffsetT, true &gt;::next</a></div><div class="ttdeci">void next(int n, const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:274</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4_html_abf536c7162d36af7367e390789944c86"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86">LoopedElemToLoc&lt; 1, OffsetT, true &gt;::LoopedElemToLoc</a></div><div class="ttdeci">LoopedElemToLoc(int dim)</div><div class="ttdef"><b>Definition</b> utils.h:263</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4_html_af5a7c0cddeb52da88fa1140f44aec45c"><div class="ttname"><a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c">LoopedElemToLoc&lt; 1, OffsetT, true &gt;::next</a></div><div class="ttdeci">void next(const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:265</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html"><div class="ttname"><a href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a></div><div class="ttdef"><b>Definition</b> utils.h:208</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_a54c743940bf96350f3be42bba5d28205"><div class="ttname"><a href="struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205">LoopedElemToLoc::next</a></div><div class="ttdeci">void next(const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:216</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_a5653be1c990722a4a215be27efe5648b"><div class="ttname"><a href="struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b">LoopedElemToLoc::LoopedElemToLoc</a></div><div class="ttdeci">LoopedElemToLoc(int dim)</div><div class="ttdef"><b>Definition</b> utils.h:214</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_a7da7bd04e79ba86f71c535b5a6ec1a2d"><div class="ttname"><a href="struct_looped_elem_to_loc.html#a7da7bd04e79ba86f71c535b5a6ec1a2d">LoopedElemToLoc::next</a></div><div class="ttdeci">void next(int n, const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:229</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_a8fbe77b4a774a30af5734dd9c5bd1f40"><div class="ttname"><a href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">LoopedElemToLoc::inner_looper</a></div><div class="ttdeci">LoopedElemToLoc&lt; DIM - 1, OffsetT, General &gt; inner_looper</div><div class="ttdef"><b>Definition</b> utils.h:210</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_aba051a428ad0934a9c6d04d4d3ee6e0e"><div class="ttname"><a href="struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e">LoopedElemToLoc::location</a></div><div class="ttdeci">OffsetT location()</div><div class="ttdef"><b>Definition</b> utils.h:252</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_acbd070b3193d9e87fb2c2db8db571333"><div class="ttname"><a href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">LoopedElemToLoc::index</a></div><div class="ttdeci">int index</div><div class="ttdef"><b>Definition</b> utils.h:212</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_acdffe540c383a67417604b6080704791"><div class="ttname"><a href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">LoopedElemToLoc::offset</a></div><div class="ttdeci">OffsetT offset</div><div class="ttdef"><b>Definition</b> utils.h:211</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html_af8285112846769aba2c0d8615f6f1364"><div class="ttname"><a href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">LoopedElemToLoc::dim</a></div><div class="ttdeci">int dim</div><div class="ttdef"><b>Definition</b> utils.h:209</div></div>
 <div class="ttc" id="astructcomplex64__t_html"><div class="ttname"><a href="structcomplex64__t.html">complex64_t</a></div><div class="ttdef"><b>Definition</b> complex.h:20</div></div>
 <div class="ttc" id="astructcomplex64__t_html_a94037c0cf8451aaff7cb4d154a8426de"><div class="ttname"><a href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">complex64_t::imag</a></div><div class="ttdeci">float imag</div><div class="ttdef"><b>Definition</b> complex.h:22</div></div>
 <div class="ttc" id="astructcomplex64__t_html_abbd4a0092eca9f112c1c5ae1a133a27e"><div class="ttname"><a href="structcomplex64__t.html#abbd4a0092eca9f112c1c5ae1a133a27e">complex64_t::real</a></div><div class="ttdeci">float real</div><div class="ttdef"><b>Definition</b> complex.h:21</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_3_010_00_01offset__t_01_4_html_a1064cdfdcef779b5628ce5357a6fe4f0"><div class="ttname"><a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a1064cdfdcef779b5628ce5357a6fe4f0">looped_elem_to_loc&lt; 0, offset_t &gt;::next</a></div><div class="ttdeci">void next(int, const constant int *, const constant size_t *)</div><div class="ttdef"><b>Definition</b> utils.h:255</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_3_010_00_01offset__t_01_4_html_a8c7aaffda0ca500d9f9566e5e74217a2"><div class="ttname"><a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2">looped_elem_to_loc&lt; 0, offset_t &gt;::location</a></div><div class="ttdeci">offset_t location(offset_t idx, const constant int *shape, const constant size_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:257</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_3_010_00_01offset__t_01_4_html_aa1e9e1009c16befb9a730835836436e0"><div class="ttname"><a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0">looped_elem_to_loc&lt; 0, offset_t &gt;::next</a></div><div class="ttdeci">void next(const constant int *, const constant size_t *)</div><div class="ttdef"><b>Definition</b> utils.h:254</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_3_011_00_01offset__t_01_4_html_a368d2a2204cee5055386954acd5ccb90"><div class="ttname"><a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90">looped_elem_to_loc&lt; 1, offset_t &gt;::location</a></div><div class="ttdeci">offset_t location(offset_t, const constant int *, const constant size_t *, int)</div><div class="ttdef"><b>Definition</b> utils.h:247</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_3_011_00_01offset__t_01_4_html_a96cf2987c04210c9197e5237e425c4b4"><div class="ttname"><a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4">looped_elem_to_loc&lt; 1, offset_t &gt;::next</a></div><div class="ttdeci">void next(const constant int *, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:238</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_3_011_00_01offset__t_01_4_html_af2984b35f7d7300d4812e7872b3c8851"><div class="ttname"><a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#af2984b35f7d7300d4812e7872b3c8851">looped_elem_to_loc&lt; 1, offset_t &gt;::next</a></div><div class="ttdeci">void next(int n, const constant int *, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:242</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html"><div class="ttname"><a href="structlooped__elem__to__loc.html">looped_elem_to_loc</a></div><div class="ttdef"><b>Definition</b> utils.h:197</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_a05558dabba889ee0d80ed4b567d901ca"><div class="ttname"><a href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">looped_elem_to_loc::next</a></div><div class="ttdeci">void next(const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:202</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_a11ef1389c9224e9117fd6374d740e0e0"><div class="ttname"><a href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">looped_elem_to_loc::offset</a></div><div class="ttdeci">offset_t offset</div><div class="ttdef"><b>Definition</b> utils.h:199</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_a29b154409551fea0a4ef50bf320ebc0a"><div class="ttname"><a href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">looped_elem_to_loc::index</a></div><div class="ttdeci">int index</div><div class="ttdef"><b>Definition</b> utils.h:200</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_a42c76764640618d721c48ef6b4f59189"><div class="ttname"><a href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">looped_elem_to_loc::inner_looper</a></div><div class="ttdeci">looped_elem_to_loc&lt; dim - 1, offset_t &gt; inner_looper</div><div class="ttdef"><b>Definition</b> utils.h:198</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_accc6d4957a8aeb38f5062754793b74d2"><div class="ttname"><a href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">looped_elem_to_loc::location</a></div><div class="ttdeci">offset_t location(offset_t, const constant int *, const constant size_t *, int)</div><div class="ttdef"><b>Definition</b> utils.h:229</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_add610f331ef8d7d2d1917050890f82b2"><div class="ttname"><a href="structlooped__elem__to__loc.html#add610f331ef8d7d2d1917050890f82b2">looped_elem_to_loc::next</a></div><div class="ttdeci">void next(int n, const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:213</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/backend_2metal_2utils_8h.html b/docs/build/html/backend_2metal_2utils_8h.html
index 9fc5d1f11..93f04c832 100644
--- a/docs/build/html/backend_2metal_2utils_8h.html
+++ b/docs/build/html/backend_2metal_2utils_8h.html
@@ -109,12 +109,8 @@ Namespaces</h2></td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a62340bbaa8b216539688a60adcb568bf" id="r_a62340bbaa8b216539688a60adcb568bf"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
-<tr class="memitem:a62340bbaa8b216539688a60adcb568bf"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">mlx::core::set_vector_bytes</a> (<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;enc, const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)</td></tr>
-<tr class="separator:a62340bbaa8b216539688a60adcb568bf"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae309cb543dfb0239cfccc53a8ad0408e" id="r_ae309cb543dfb0239cfccc53a8ad0408e"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
-<tr class="memitem:ae309cb543dfb0239cfccc53a8ad0408e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#ae309cb543dfb0239cfccc53a8ad0408e">mlx::core::set_vector_bytes</a> (<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;enc, const std::vector&lt; T &gt; &amp;vec, int idx)</td></tr>
-<tr class="separator:ae309cb543dfb0239cfccc53a8ad0408e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aef60e3a8d9c987c9c338b193673d2164" id="r_aef60e3a8d9c987c9c338b193673d2164"><td class="memItemLeft" align="right" valign="top">std::string&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164">mlx::core::type_to_name</a> (const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;t)</td></tr>
+<tr class="separator:aef60e3a8d9c987c9c338b193673d2164"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:af1fdfdaa5644394362e6baba30701bae" id="r_af1fdfdaa5644394362e6baba30701bae"><td class="memItemLeft" align="right" valign="top">std::string&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae">mlx::core::type_to_name</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a)</td></tr>
 <tr class="separator:af1fdfdaa5644394362e6baba30701bae"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a0f0f59d3ffe2d16a684e5fc093302e15" id="r_a0f0f59d3ffe2d16a684e5fc093302e15"><td class="memItemLeft" align="right" valign="top">MTL::Size&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15">mlx::core::get_block_dims</a> (int dim0, int dim1, int dim2, int pow2=10)</td></tr>
@@ -131,6 +127,12 @@ Functions</h2></td></tr>
 <tr class="separator:a489e45b3a5cd8b46e8ea56b9132eb230"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ad4be35b310a252edd80d9cf04f094a60" id="r_ad4be35b310a252edd80d9cf04f094a60"><td class="memItemLeft" align="right" valign="top">std::string&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">mlx::core::get_primitive_string</a> (<a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> *primitive)</td></tr>
 <tr class="separator:ad4be35b310a252edd80d9cf04f094a60"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a76a2e310857f60f5ea6f1388d45b964d" id="r_a76a2e310857f60f5ea6f1388d45b964d"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a76a2e310857f60f5ea6f1388d45b964d"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">mlx::core::concatenate</a> (std::string &amp;acc, T first)</td></tr>
+<tr class="separator:a76a2e310857f60f5ea6f1388d45b964d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aaf51544472fa87fa974686eacdd2a4a6" id="r_aaf51544472fa87fa974686eacdd2a4a6"><td class="memTemplParams" colspan="2">template&lt;typename T , typename... Args&gt; </td></tr>
+<tr class="memitem:aaf51544472fa87fa974686eacdd2a4a6"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#aaf51544472fa87fa974686eacdd2a4a6">mlx::core::concatenate</a> (std::string &amp;acc, T first, Args... args)</td></tr>
+<tr class="separator:aaf51544472fa87fa974686eacdd2a4a6"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/backend_2metal_2utils_8h_source.html b/docs/build/html/backend_2metal_2utils_8h_source.html
index 0795ecb30..32b2bae63 100644
--- a/docs/build/html/backend_2metal_2utils_8h_source.html
+++ b/docs/build/html/backend_2metal_2utils_8h_source.html
@@ -101,87 +101,82 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span> </div>
 <div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx_1_1core.html">mlx::core</a> {</div>
 <div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span> </div>
-<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span><span class="keyword">using </span>metal::CommandEncoder;</div>
-<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span> </div>
-<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
-<div class="foldopen" id="foldopen00014" data-start="{" data-end="}">
-<div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">   14</a></span><span class="keyword">inline</span> <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">set_vector_bytes</a>(</div>
-<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>    <a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a>&amp; enc,</div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>    <span class="keyword">const</span> std::vector&lt;T&gt;&amp; vec,</div>
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    <span class="keywordtype">size_t</span> nelems,</div>
-<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>    <span class="keywordtype">int</span> idx) {</div>
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>  enc-&gt;setBytes(vec.data(), nelems * <span class="keyword">sizeof</span>(T), idx);</div>
-<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>}</div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164">   11</a></span>std::string <a class="code hl_function" href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164">type_to_name</a>(<span class="keyword">const</span> <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a>&amp; t);</div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae">   12</a></span>std::string <a class="code hl_function" href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164">type_to_name</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a);</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span> </div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="comment">// Compute the thread block dimensions which fit the given</span></div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span><span class="comment">// input dimensions.</span></div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="comment">// - The thread block dimensions will be powers of two</span></div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span><span class="comment">// - The thread block size will be less than 2^pow2</span></div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15">   18</a></span>MTL::Size <a class="code hl_function" href="namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15">get_block_dims</a>(<span class="keywordtype">int</span> dim0, <span class="keywordtype">int</span> dim1, <span class="keywordtype">int</span> dim2, <span class="keywordtype">int</span> pow2 = 10);</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span> </div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span><span class="comment">// Computes a 2D grid where each element is &lt; UINT_MAX</span></div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span><span class="comment">// Assumes:</span></div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="comment">// - overall size (product of non-broadcasted dimensions) is &lt; UINT_MAX^2</span></div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span><span class="comment">// - shape and strides correspond to a contiguous (no holes) but</span></div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span><span class="comment">//   possibly broadcasted array</span></div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c">   25</a></span>MTL::Size <a class="code hl_function" href="namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c">get_2d_grid_dims</a>(</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>    <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape,</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>    <span class="keyword">const</span> std::vector&lt;size_t&gt;&amp; strides);</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span><span class="comment">// Same as above but we do an implicit division with divisor.</span></div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span><span class="comment">// Basically, equivalent to factorizing</span></div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span><span class="comment">//    Prod(s \forall s in shape if strides[s] &gt; 0) / divisor.</span></div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a187b9a932c7b3d67ee42d9d12fcb1bb1">   32</a></span>MTL::Size <a class="code hl_function" href="namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c">get_2d_grid_dims</a>(</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>    <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape,</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    <span class="keyword">const</span> std::vector&lt;size_t&gt;&amp; strides,</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>    <span class="keywordtype">size_t</span> divisor);</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span> </div>
+<div class="foldopen" id="foldopen00037" data-start="{" data-end="}">
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">   37</a></span><span class="keyword">inline</span> NS::String* <a class="code hl_function" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">make_string</a>(std::ostringstream&amp; os) {</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  std::string <span class="keywordtype">string</span> = os.str();</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  <span class="keywordflow">return</span> NS::String::string(<span class="keywordtype">string</span>.c_str(), NS::UTF8StringEncoding);</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>}</div>
 </div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span> </div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span><span class="keyword">inline</span> <span class="keywordtype">void</span></div>
-<div class="foldopen" id="foldopen00024" data-start="{" data-end="}">
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ae309cb543dfb0239cfccc53a8ad0408e">   24</a></span><a class="code hl_function" href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">set_vector_bytes</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a>&amp; enc, <span class="keyword">const</span> std::vector&lt;T&gt;&amp; vec, <span class="keywordtype">int</span> idx) {</div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>  <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">set_vector_bytes</a>(enc, vec, vec.size(), idx);</div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>}</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span> </div>
+<div class="foldopen" id="foldopen00042" data-start="{" data-end="}">
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2">   42</a></span><span class="keyword">inline</span> <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2">debug_set_stream_queue_label</a>(MTL::CommandQueue* queue, <span class="keywordtype">int</span> index) {</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span><span class="preprocessor">#ifdef MLX_METAL_DEBUG</span></div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  std::ostringstream label;</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>  label &lt;&lt; <span class="stringliteral">&quot;Stream &quot;</span> &lt;&lt; index;</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  queue-&gt;setLabel(<a class="code hl_function" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">make_string</a>(label));</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span><span class="preprocessor">#endif</span></div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>}</div>
 </div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span> </div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae">   28</a></span>std::string <a class="code hl_function" href="namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae">type_to_name</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a);</div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span> </div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span><span class="comment">// Compute the thread block dimensions which fit the given</span></div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span><span class="comment">// input dimensions.</span></div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span><span class="comment">// - The thread block dimensions will be powers of two</span></div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span><span class="comment">// - The thread block size will be less than 2^pow2</span></div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15">   34</a></span>MTL::Size <a class="code hl_function" href="namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15">get_block_dims</a>(<span class="keywordtype">int</span> dim0, <span class="keywordtype">int</span> dim1, <span class="keywordtype">int</span> dim2, <span class="keywordtype">int</span> pow2 = 10);</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span> </div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span><span class="comment">// Computes a 2D grid where each element is &lt; UINT_MAX</span></div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span><span class="comment">// Assumes:</span></div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="comment">// - overall size (product of non-broadcasted dimensions) is &lt; UINT_MAX^2</span></div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span><span class="comment">// - shape and strides correspond to a contiguous (no holes) but</span></div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span><span class="comment">//   possibly broadcasted array</span></div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c">   41</a></span>MTL::Size <a class="code hl_function" href="namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c">get_2d_grid_dims</a>(</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>    <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape,</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>    <span class="keyword">const</span> std::vector&lt;size_t&gt;&amp; strides);</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span> </div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span><span class="comment">// Same as above but we do an implicit division with divisor.</span></div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span><span class="comment">// Basically, equivalent to factorizing</span></div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span><span class="comment">//    Prod(s \forall s in shape if strides[s] &gt; 0) / divisor.</span></div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a187b9a932c7b3d67ee42d9d12fcb1bb1">   48</a></span>MTL::Size <a class="code hl_function" href="namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c">get_2d_grid_dims</a>(</div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>    <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape,</div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>    <span class="keyword">const</span> std::vector&lt;size_t&gt;&amp; strides,</div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>    <span class="keywordtype">size_t</span> divisor);</div>
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span> </div>
-<div class="foldopen" id="foldopen00053" data-start="{" data-end="}">
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">   53</a></span><span class="keyword">inline</span> NS::String* <a class="code hl_function" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">make_string</a>(std::ostringstream&amp; os) {</div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>  std::string <span class="keywordtype">string</span> = os.str();</div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>  <span class="keywordflow">return</span> NS::String::string(<span class="keywordtype">string</span>.c_str(), NS::UTF8StringEncoding);</div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>}</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span> </div>
+<div class="foldopen" id="foldopen00050" data-start="{" data-end="}">
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230">   50</a></span><span class="keyword">inline</span> <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230">debug_set_primitive_buffer_label</a>(</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>    MTL::CommandBuffer* command_buffer,</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>    <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; primitive) {</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span><span class="preprocessor">#ifdef MLX_METAL_DEBUG</span></div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>  std::ostringstream label;</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>  <span class="keywordflow">if</span> (<span class="keyword">auto</span> cbuf_label = command_buffer-&gt;label(); cbuf_label) {</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>    label &lt;&lt; cbuf_label-&gt;utf8String();</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>  }</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>  primitive.<a class="code hl_function" href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">print</a>(label);</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>  command_buffer-&gt;setLabel(<a class="code hl_function" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">make_string</a>(label));</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span><span class="preprocessor">#endif</span></div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>}</div>
 </div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span> </div>
-<div class="foldopen" id="foldopen00058" data-start="{" data-end="}">
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2">   58</a></span><span class="keyword">inline</span> <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2">debug_set_stream_queue_label</a>(MTL::CommandQueue* queue, <span class="keywordtype">int</span> index) {</div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span><span class="preprocessor">#ifdef MLX_METAL_DEBUG</span></div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>  std::ostringstream label;</div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>  label &lt;&lt; <span class="stringliteral">&quot;Stream &quot;</span> &lt;&lt; index;</div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>  queue-&gt;setLabel(<a class="code hl_function" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">make_string</a>(label));</div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span><span class="preprocessor">#endif</span></div>
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>}</div>
-</div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span> </div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span> </div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">   63</a></span>std::string <a class="code hl_function" href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">get_primitive_string</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>* primitive);</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span> </div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
 <div class="foldopen" id="foldopen00066" data-start="{" data-end="}">
-<div class="line"><a id="l00066" name="l00066"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230">   66</a></span><span class="keyword">inline</span> <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230">debug_set_primitive_buffer_label</a>(</div>
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    MTL::CommandBuffer* command_buffer,</div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; primitive) {</div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span><span class="preprocessor">#ifdef MLX_METAL_DEBUG</span></div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>  std::ostringstream label;</div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  <span class="keywordflow">if</span> (<span class="keyword">auto</span> cbuf_label = command_buffer-&gt;label(); cbuf_label) {</div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>    label &lt;&lt; cbuf_label-&gt;utf8String();</div>
-<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  }</div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>  primitive.<a class="code hl_function" href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">print</a>(label);</div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>  command_buffer-&gt;setLabel(<a class="code hl_function" href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">make_string</a>(label));</div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span><span class="preprocessor">#endif</span></div>
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>}</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">   66</a></span><span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">concatenate</a>(std::string&amp; acc, T first) {</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>  acc += first;</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>}</div>
 </div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span> </div>
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">   79</a></span>std::string <a class="code hl_function" href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">get_primitive_string</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>* primitive);</div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span> </div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>} <span class="comment">// namespace mlx::core</span></div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span> </div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span>... Args&gt;</div>
+<div class="foldopen" id="foldopen00071" data-start="{" data-end="}">
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aaf51544472fa87fa974686eacdd2a4a6">   71</a></span><span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">concatenate</a>(std::string&amp; acc, T first, Args... args) {</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>  acc += first;</div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  <a class="code hl_function" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">concatenate</a>(acc, args...);</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>}</div>
+</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span> </div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="abackend_2metal_2device_8h_html"><div class="ttname"><a href="backend_2metal_2device_8h.html">device.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></div><div class="ttdef"><b>Definition</b> primitives.h:48</div></div>
@@ -189,15 +184,15 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a0f0f59d3ffe2d16a684e5fc093302e15"><div class="ttname"><a href="namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15">mlx::core::get_block_dims</a></div><div class="ttdeci">MTL::Size get_block_dims(int dim0, int dim1, int dim2, int pow2=10)</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_a489e45b3a5cd8b46e8ea56b9132eb230"><div class="ttname"><a href="namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230">mlx::core::debug_set_primitive_buffer_label</a></div><div class="ttdeci">void debug_set_primitive_buffer_label(MTL::CommandBuffer *command_buffer, Primitive &amp;primitive)</div><div class="ttdef"><b>Definition</b> utils.h:66</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_a62340bbaa8b216539688a60adcb568bf"><div class="ttname"><a href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">mlx::core::set_vector_bytes</a></div><div class="ttdeci">void set_vector_bytes(CommandEncoder &amp;enc, const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)</div><div class="ttdef"><b>Definition</b> utils.h:14</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_a79817d2432e782e596c9c49a08b93be2"><div class="ttname"><a href="namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2">mlx::core::debug_set_stream_queue_label</a></div><div class="ttdeci">void debug_set_stream_queue_label(MTL::CommandQueue *queue, int index)</div><div class="ttdef"><b>Definition</b> utils.h:58</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_a489e45b3a5cd8b46e8ea56b9132eb230"><div class="ttname"><a href="namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230">mlx::core::debug_set_primitive_buffer_label</a></div><div class="ttdeci">void debug_set_primitive_buffer_label(MTL::CommandBuffer *command_buffer, Primitive &amp;primitive)</div><div class="ttdef"><b>Definition</b> utils.h:50</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_a76a2e310857f60f5ea6f1388d45b964d"><div class="ttname"><a href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">mlx::core::concatenate</a></div><div class="ttdeci">void concatenate(std::string &amp;acc, T first)</div><div class="ttdef"><b>Definition</b> utils.h:66</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_a79817d2432e782e596c9c49a08b93be2"><div class="ttname"><a href="namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2">mlx::core::debug_set_stream_queue_label</a></div><div class="ttdeci">void debug_set_stream_queue_label(MTL::CommandQueue *queue, int index)</div><div class="ttdef"><b>Definition</b> utils.h:42</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a8dc169474a51a1f4f761d5752819bd7c"><div class="ttname"><a href="namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c">mlx::core::get_2d_grid_dims</a></div><div class="ttdeci">MTL::Size get_2d_grid_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_ad4be35b310a252edd80d9cf04f094a60"><div class="ttname"><a href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">mlx::core::get_primitive_string</a></div><div class="ttdeci">std::string get_primitive_string(Primitive *primitive)</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_aed148d95e7b5221f1312473deded0d27"><div class="ttname"><a href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">mlx::core::make_string</a></div><div class="ttdeci">NS::String * make_string(std::ostringstream &amp;os)</div><div class="ttdef"><b>Definition</b> utils.h:53</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_af1fdfdaa5644394362e6baba30701bae"><div class="ttname"><a href="namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae">mlx::core::type_to_name</a></div><div class="ttdeci">std::string type_to_name(const array &amp;a)</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_aed148d95e7b5221f1312473deded0d27"><div class="ttname"><a href="namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27">mlx::core::make_string</a></div><div class="ttdeci">NS::String * make_string(std::ostringstream &amp;os)</div><div class="ttdef"><b>Definition</b> utils.h:37</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_aef60e3a8d9c987c9c338b193673d2164"><div class="ttname"><a href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164">mlx::core::type_to_name</a></div><div class="ttdeci">std::string type_to_name(const Dtype &amp;t)</div></div>
 <div class="ttc" id="aprimitives_8h_html"><div class="ttname"><a href="primitives_8h.html">primitives.h</a></div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1metal_1_1_command_encoder_html"><div class="ttname"><a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></div><div class="ttdef"><b>Definition</b> device.h:41</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1_dtype_html"><div class="ttname"><a href="structmlx_1_1core_1_1_dtype.html">mlx::core::Dtype</a></div><div class="ttdef"><b>Definition</b> dtype.h:13</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/bf16__math_8h.html b/docs/build/html/bf16__math_8h.html
index 4d6497014..da7db6474 100644
--- a/docs/build/html/bf16__math_8h.html
+++ b/docs/build/html/bf16__math_8h.html
@@ -95,8 +95,7 @@ $(function(){ initResizable(false); });
   <div class="headertitle"><div class="title">bf16_math.h File Reference</div></div>
 </div><!--header-->
 <div class="contents">
-<div class="textblock"><code>#include &quot;<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">mlx/backend/metal/kernels/bf16.h</a>&quot;</code><br />
-</div>
+
 <p><a href="bf16__math_8h_source.html">Go to the source code of this file.</a></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="namespaces" name="namespaces"></a>
@@ -116,379 +115,357 @@ Macros</h2></td></tr>
 <tr class="separator:aecc11cb898846d01bfc9faa109fcf791"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a51688bc24fc9292aaec5f54a58eaa2d0" id="r_a51688bc24fc9292aaec5f54a58eaa2d0"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a51688bc24fc9292aaec5f54a58eaa2d0">instantiate_metal_simd_reduction_funcs</a>(itype,  otype,  ctype)</td></tr>
 <tr class="separator:a51688bc24fc9292aaec5f54a58eaa2d0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a51cfdd4502e755310f6f3456f039bea7" id="r_a51cfdd4502e755310f6f3456f039bea7"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a51cfdd4502e755310f6f3456f039bea7">bfloat16_to_uint16</a>(x)</td></tr>
-<tr class="separator:a51cfdd4502e755310f6f3456f039bea7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a030d871474c0e7d907fccffcc8c047e0" id="r_a030d871474c0e7d907fccffcc8c047e0"><td class="memItemLeft" align="right" valign="top">#define&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a030d871474c0e7d907fccffcc8c047e0">uint16_to_bfloat16</a>(x)</td></tr>
-<tr class="separator:a030d871474c0e7d907fccffcc8c047e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a87c5122c60f9a12afceb9925a5b78ffb" id="r_a87c5122c60f9a12afceb9925a5b78ffb"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a87c5122c60f9a12afceb9925a5b78ffb" id="r_a87c5122c60f9a12afceb9925a5b78ffb"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a87c5122c60f9a12afceb9925a5b78ffb"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad4537748b3c832b6569ff7ccb209fcb2" id="r_ad4537748b3c832b6569ff7ccb209fcb2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ad4537748b3c832b6569ff7ccb209fcb2">metal::acos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ad4537748b3c832b6569ff7ccb209fcb2" id="r_ad4537748b3c832b6569ff7ccb209fcb2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ad4537748b3c832b6569ff7ccb209fcb2">metal::acos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ad4537748b3c832b6569ff7ccb209fcb2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2d0efb92b7f61eff342d776bd6c5f3a0" id="r_a2d0efb92b7f61eff342d776bd6c5f3a0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2d0efb92b7f61eff342d776bd6c5f3a0">metal::acosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a2d0efb92b7f61eff342d776bd6c5f3a0" id="r_a2d0efb92b7f61eff342d776bd6c5f3a0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2d0efb92b7f61eff342d776bd6c5f3a0">metal::acosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a2d0efb92b7f61eff342d776bd6c5f3a0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a16e843194df3fd136404bf80ba5ac95c" id="r_a16e843194df3fd136404bf80ba5ac95c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a16e843194df3fd136404bf80ba5ac95c">metal::asin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a16e843194df3fd136404bf80ba5ac95c" id="r_a16e843194df3fd136404bf80ba5ac95c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a16e843194df3fd136404bf80ba5ac95c">metal::asin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a16e843194df3fd136404bf80ba5ac95c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abcc3251866930cfe880f89e7473d0e63" id="r_abcc3251866930cfe880f89e7473d0e63"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#abcc3251866930cfe880f89e7473d0e63">metal::asinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:abcc3251866930cfe880f89e7473d0e63" id="r_abcc3251866930cfe880f89e7473d0e63"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#abcc3251866930cfe880f89e7473d0e63">metal::asinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:abcc3251866930cfe880f89e7473d0e63"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a80a771553d9a0012b93620d19c48b00f" id="r_a80a771553d9a0012b93620d19c48b00f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a80a771553d9a0012b93620d19c48b00f">metal::atan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
+<tr class="memitem:a80a771553d9a0012b93620d19c48b00f" id="r_a80a771553d9a0012b93620d19c48b00f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a80a771553d9a0012b93620d19c48b00f">metal::atan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
 <tr class="separator:a80a771553d9a0012b93620d19c48b00f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1d430793eaa38ccf0d07145e3fcd1e61" id="r_a1d430793eaa38ccf0d07145e3fcd1e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1d430793eaa38ccf0d07145e3fcd1e61">metal::atan2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a1d430793eaa38ccf0d07145e3fcd1e61" id="r_a1d430793eaa38ccf0d07145e3fcd1e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1d430793eaa38ccf0d07145e3fcd1e61">metal::atan2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a1d430793eaa38ccf0d07145e3fcd1e61"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a57116427997ba71dd3863bfb15de33bf" id="r_a57116427997ba71dd3863bfb15de33bf"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a57116427997ba71dd3863bfb15de33bf">metal::atanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a57116427997ba71dd3863bfb15de33bf" id="r_a57116427997ba71dd3863bfb15de33bf"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a57116427997ba71dd3863bfb15de33bf">metal::atanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a57116427997ba71dd3863bfb15de33bf"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad63204d38bc01df6ffc64583f7886b3c" id="r_ad63204d38bc01df6ffc64583f7886b3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ad63204d38bc01df6ffc64583f7886b3c">metal::ceil</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ad63204d38bc01df6ffc64583f7886b3c" id="r_ad63204d38bc01df6ffc64583f7886b3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ad63204d38bc01df6ffc64583f7886b3c">metal::ceil</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ad63204d38bc01df6ffc64583f7886b3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2fa4778a6fe2fa43253ea724e5a608a3" id="r_a2fa4778a6fe2fa43253ea724e5a608a3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3">metal::cos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a2fa4778a6fe2fa43253ea724e5a608a3" id="r_a2fa4778a6fe2fa43253ea724e5a608a3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3">metal::cos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a2fa4778a6fe2fa43253ea724e5a608a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8a68a88cc110830d057dbd71431b93c0" id="r_a8a68a88cc110830d057dbd71431b93c0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a8a68a88cc110830d057dbd71431b93c0">metal::cosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8a68a88cc110830d057dbd71431b93c0" id="r_a8a68a88cc110830d057dbd71431b93c0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a8a68a88cc110830d057dbd71431b93c0">metal::cosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8a68a88cc110830d057dbd71431b93c0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5c2f37939ad705ddea4409d3bedb8ce1" id="r_a5c2f37939ad705ddea4409d3bedb8ce1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5c2f37939ad705ddea4409d3bedb8ce1">metal::cospi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a5c2f37939ad705ddea4409d3bedb8ce1" id="r_a5c2f37939ad705ddea4409d3bedb8ce1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5c2f37939ad705ddea4409d3bedb8ce1">metal::cospi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a5c2f37939ad705ddea4409d3bedb8ce1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2aea493fc1a874970b77ed0031e965df" id="r_a2aea493fc1a874970b77ed0031e965df"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2aea493fc1a874970b77ed0031e965df">metal::divide</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a2aea493fc1a874970b77ed0031e965df" id="r_a2aea493fc1a874970b77ed0031e965df"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2aea493fc1a874970b77ed0031e965df">metal::divide</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a2aea493fc1a874970b77ed0031e965df"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac2a0b3618d922ac014baac8189d44650" id="r_ac2a0b3618d922ac014baac8189d44650"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ac2a0b3618d922ac014baac8189d44650" id="r_ac2a0b3618d922ac014baac8189d44650"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ac2a0b3618d922ac014baac8189d44650"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4c63707d13c89364496a48906631c204" id="r_a4c63707d13c89364496a48906631c204"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a4c63707d13c89364496a48906631c204">metal::exp10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a4c63707d13c89364496a48906631c204" id="r_a4c63707d13c89364496a48906631c204"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a4c63707d13c89364496a48906631c204">metal::exp10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a4c63707d13c89364496a48906631c204"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a228201c20777848804a4d0589c1d33e7" id="r_a228201c20777848804a4d0589c1d33e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a228201c20777848804a4d0589c1d33e7">metal::exp2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a228201c20777848804a4d0589c1d33e7" id="r_a228201c20777848804a4d0589c1d33e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a228201c20777848804a4d0589c1d33e7">metal::exp2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a228201c20777848804a4d0589c1d33e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a487eba718144be1325abcf66e109bb21" id="r_a487eba718144be1325abcf66e109bb21"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a487eba718144be1325abcf66e109bb21">metal::fabs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a487eba718144be1325abcf66e109bb21" id="r_a487eba718144be1325abcf66e109bb21"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a487eba718144be1325abcf66e109bb21">metal::fabs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a487eba718144be1325abcf66e109bb21"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a85a560794be56d8116889c1ee2d78761" id="r_a85a560794be56d8116889c1ee2d78761"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a85a560794be56d8116889c1ee2d78761">metal::fdim</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a85a560794be56d8116889c1ee2d78761" id="r_a85a560794be56d8116889c1ee2d78761"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a85a560794be56d8116889c1ee2d78761">metal::fdim</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a85a560794be56d8116889c1ee2d78761"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a020790f30c28a9982c4a83deaa258277" id="r_a020790f30c28a9982c4a83deaa258277"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a020790f30c28a9982c4a83deaa258277">metal::floor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a020790f30c28a9982c4a83deaa258277" id="r_a020790f30c28a9982c4a83deaa258277"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a020790f30c28a9982c4a83deaa258277">metal::floor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a020790f30c28a9982c4a83deaa258277"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6301a78d69ff14a06194ca85a0c7d326" id="r_a6301a78d69ff14a06194ca85a0c7d326"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6301a78d69ff14a06194ca85a0c7d326">metal::fma</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a6301a78d69ff14a06194ca85a0c7d326" id="r_a6301a78d69ff14a06194ca85a0c7d326"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6301a78d69ff14a06194ca85a0c7d326">metal::fma</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a6301a78d69ff14a06194ca85a0c7d326"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0558e56fdb94b456deea6a4eb53964ed" id="r_a0558e56fdb94b456deea6a4eb53964ed"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a0558e56fdb94b456deea6a4eb53964ed">metal::fmax</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a0558e56fdb94b456deea6a4eb53964ed" id="r_a0558e56fdb94b456deea6a4eb53964ed"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a0558e56fdb94b456deea6a4eb53964ed">metal::fmax</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a0558e56fdb94b456deea6a4eb53964ed"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae0c1a7ba1a7449adc64d00b2a29e67f6" id="r_ae0c1a7ba1a7449adc64d00b2a29e67f6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae0c1a7ba1a7449adc64d00b2a29e67f6">metal::fmax3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:ae0c1a7ba1a7449adc64d00b2a29e67f6" id="r_ae0c1a7ba1a7449adc64d00b2a29e67f6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae0c1a7ba1a7449adc64d00b2a29e67f6">metal::fmax3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:ae0c1a7ba1a7449adc64d00b2a29e67f6"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa35227450d943fb88cf43162aa9d8c49" id="r_aa35227450d943fb88cf43162aa9d8c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#aa35227450d943fb88cf43162aa9d8c49">metal::fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:aa35227450d943fb88cf43162aa9d8c49" id="r_aa35227450d943fb88cf43162aa9d8c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#aa35227450d943fb88cf43162aa9d8c49">metal::fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:aa35227450d943fb88cf43162aa9d8c49"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a66ac19825ea79b8294e243ae6d0b3d3c" id="r_a66ac19825ea79b8294e243ae6d0b3d3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a66ac19825ea79b8294e243ae6d0b3d3c">metal::fmin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a66ac19825ea79b8294e243ae6d0b3d3c" id="r_a66ac19825ea79b8294e243ae6d0b3d3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a66ac19825ea79b8294e243ae6d0b3d3c">metal::fmin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a66ac19825ea79b8294e243ae6d0b3d3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae2acd25f2241f00aaf89ff48f132a879" id="r_ae2acd25f2241f00aaf89ff48f132a879"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae2acd25f2241f00aaf89ff48f132a879">metal::fmin3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:ae2acd25f2241f00aaf89ff48f132a879" id="r_ae2acd25f2241f00aaf89ff48f132a879"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae2acd25f2241f00aaf89ff48f132a879">metal::fmin3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:ae2acd25f2241f00aaf89ff48f132a879"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2ff952d4d596a7969b2a3035fc2fda58" id="r_a2ff952d4d596a7969b2a3035fc2fda58"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2ff952d4d596a7969b2a3035fc2fda58">metal::fmod</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a2ff952d4d596a7969b2a3035fc2fda58" id="r_a2ff952d4d596a7969b2a3035fc2fda58"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a2ff952d4d596a7969b2a3035fc2fda58">metal::fmod</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a2ff952d4d596a7969b2a3035fc2fda58"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6b1c15d251aeaacb1f4338a5e152ae78" id="r_a6b1c15d251aeaacb1f4338a5e152ae78"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6b1c15d251aeaacb1f4338a5e152ae78">metal::fract</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a6b1c15d251aeaacb1f4338a5e152ae78" id="r_a6b1c15d251aeaacb1f4338a5e152ae78"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6b1c15d251aeaacb1f4338a5e152ae78">metal::fract</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a6b1c15d251aeaacb1f4338a5e152ae78"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac89d4ef524d21a301da6c37dbd95ff9f" id="r_ac89d4ef524d21a301da6c37dbd95ff9f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ac89d4ef524d21a301da6c37dbd95ff9f">metal::frexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">exp</a>)</td></tr>
+<tr class="memitem:ac89d4ef524d21a301da6c37dbd95ff9f" id="r_ac89d4ef524d21a301da6c37dbd95ff9f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ac89d4ef524d21a301da6c37dbd95ff9f">metal::frexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">exp</a>)</td></tr>
 <tr class="separator:ac89d4ef524d21a301da6c37dbd95ff9f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3deed001738b6f03accd3c2195586c2b" id="r_a3deed001738b6f03accd3c2195586c2b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a3deed001738b6f03accd3c2195586c2b">metal::ldexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
+<tr class="memitem:a3deed001738b6f03accd3c2195586c2b" id="r_a3deed001738b6f03accd3c2195586c2b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a3deed001738b6f03accd3c2195586c2b">metal::ldexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
 <tr class="separator:a3deed001738b6f03accd3c2195586c2b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a423a9f4f2fc7ef5ec7eda061277b51b6" id="r_a423a9f4f2fc7ef5ec7eda061277b51b6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a423a9f4f2fc7ef5ec7eda061277b51b6" id="r_a423a9f4f2fc7ef5ec7eda061277b51b6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a423a9f4f2fc7ef5ec7eda061277b51b6"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a042b98827baa910e9d726227cec55a80" id="r_a042b98827baa910e9d726227cec55a80"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a042b98827baa910e9d726227cec55a80">metal::log10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a042b98827baa910e9d726227cec55a80" id="r_a042b98827baa910e9d726227cec55a80"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a042b98827baa910e9d726227cec55a80">metal::log10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a042b98827baa910e9d726227cec55a80"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae894dd5fc13799f120b55cab6267c89c" id="r_ae894dd5fc13799f120b55cab6267c89c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae894dd5fc13799f120b55cab6267c89c">metal::log2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ae894dd5fc13799f120b55cab6267c89c" id="r_ae894dd5fc13799f120b55cab6267c89c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae894dd5fc13799f120b55cab6267c89c">metal::log2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ae894dd5fc13799f120b55cab6267c89c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a853c80479ab2264d9c4587c7bcac767b" id="r_a853c80479ab2264d9c4587c7bcac767b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a853c80479ab2264d9c4587c7bcac767b" id="r_a853c80479ab2264d9c4587c7bcac767b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a853c80479ab2264d9c4587c7bcac767b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a00f9c0ad66d969794614f56912eed9c9" id="r_a00f9c0ad66d969794614f56912eed9c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a00f9c0ad66d969794614f56912eed9c9">metal::max3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a00f9c0ad66d969794614f56912eed9c9" id="r_a00f9c0ad66d969794614f56912eed9c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a00f9c0ad66d969794614f56912eed9c9">metal::max3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a00f9c0ad66d969794614f56912eed9c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa3ff49457ce3c93fc1c0897fd1525157" id="r_aa3ff49457ce3c93fc1c0897fd1525157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#aa3ff49457ce3c93fc1c0897fd1525157">metal::median3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:aa3ff49457ce3c93fc1c0897fd1525157" id="r_aa3ff49457ce3c93fc1c0897fd1525157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#aa3ff49457ce3c93fc1c0897fd1525157">metal::median3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:aa3ff49457ce3c93fc1c0897fd1525157"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6653b28c9473087141eddce39878d4d3" id="r_a6653b28c9473087141eddce39878d4d3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a6653b28c9473087141eddce39878d4d3" id="r_a6653b28c9473087141eddce39878d4d3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a6653b28c9473087141eddce39878d4d3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a005510c8c0f964ce2b8aad3ba76a7a3f" id="r_a005510c8c0f964ce2b8aad3ba76a7a3f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f">metal::min3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a005510c8c0f964ce2b8aad3ba76a7a3f" id="r_a005510c8c0f964ce2b8aad3ba76a7a3f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f">metal::min3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a005510c8c0f964ce2b8aad3ba76a7a3f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a9547fd7b09164931986f6db4813bd72d" id="r_a9547fd7b09164931986f6db4813bd72d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a9547fd7b09164931986f6db4813bd72d">metal::nextafter</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a9547fd7b09164931986f6db4813bd72d" id="r_a9547fd7b09164931986f6db4813bd72d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a9547fd7b09164931986f6db4813bd72d">metal::nextafter</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a9547fd7b09164931986f6db4813bd72d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:acd288d4552215bd10455584a214c57b8" id="r_acd288d4552215bd10455584a214c57b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#acd288d4552215bd10455584a214c57b8">metal::pow</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:acd288d4552215bd10455584a214c57b8" id="r_acd288d4552215bd10455584a214c57b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#acd288d4552215bd10455584a214c57b8">metal::pow</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:acd288d4552215bd10455584a214c57b8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae529e431f178bafedc18a889323c0bc2" id="r_ae529e431f178bafedc18a889323c0bc2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae529e431f178bafedc18a889323c0bc2">metal::powr</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ae529e431f178bafedc18a889323c0bc2" id="r_ae529e431f178bafedc18a889323c0bc2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae529e431f178bafedc18a889323c0bc2">metal::powr</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ae529e431f178bafedc18a889323c0bc2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a29ab6060527120eee745aec0daa06e01" id="r_a29ab6060527120eee745aec0daa06e01"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a29ab6060527120eee745aec0daa06e01">metal::rint</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a29ab6060527120eee745aec0daa06e01" id="r_a29ab6060527120eee745aec0daa06e01"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a29ab6060527120eee745aec0daa06e01">metal::rint</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a29ab6060527120eee745aec0daa06e01"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a46c667e169ff9d51a9204a045305442f" id="r_a46c667e169ff9d51a9204a045305442f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">metal::round</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a46c667e169ff9d51a9204a045305442f" id="r_a46c667e169ff9d51a9204a045305442f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">metal::round</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a46c667e169ff9d51a9204a045305442f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1cf4b605c0aa7ff5bfe5e979a16f5157" id="r_a1cf4b605c0aa7ff5bfe5e979a16f5157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1cf4b605c0aa7ff5bfe5e979a16f5157">metal::rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a1cf4b605c0aa7ff5bfe5e979a16f5157" id="r_a1cf4b605c0aa7ff5bfe5e979a16f5157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1cf4b605c0aa7ff5bfe5e979a16f5157">metal::rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a1cf4b605c0aa7ff5bfe5e979a16f5157"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a619a159ca5f2ddfe3647d3a6bb6e804c" id="r_a619a159ca5f2ddfe3647d3a6bb6e804c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c">metal::sin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a619a159ca5f2ddfe3647d3a6bb6e804c" id="r_a619a159ca5f2ddfe3647d3a6bb6e804c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c">metal::sin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a619a159ca5f2ddfe3647d3a6bb6e804c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a83ba4235ae350ab8880a9df09158620b" id="r_a83ba4235ae350ab8880a9df09158620b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a83ba4235ae350ab8880a9df09158620b">metal::sinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a83ba4235ae350ab8880a9df09158620b" id="r_a83ba4235ae350ab8880a9df09158620b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a83ba4235ae350ab8880a9df09158620b">metal::sinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a83ba4235ae350ab8880a9df09158620b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae9655f7fa2ba6c0625ca25fbb278e269" id="r_ae9655f7fa2ba6c0625ca25fbb278e269"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae9655f7fa2ba6c0625ca25fbb278e269">metal::sinpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ae9655f7fa2ba6c0625ca25fbb278e269" id="r_ae9655f7fa2ba6c0625ca25fbb278e269"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae9655f7fa2ba6c0625ca25fbb278e269">metal::sinpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ae9655f7fa2ba6c0625ca25fbb278e269"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab3f4d4852ca0e591104fbd8e5b50d31b" id="r_ab3f4d4852ca0e591104fbd8e5b50d31b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ab3f4d4852ca0e591104fbd8e5b50d31b">metal::sqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ab3f4d4852ca0e591104fbd8e5b50d31b" id="r_ab3f4d4852ca0e591104fbd8e5b50d31b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ab3f4d4852ca0e591104fbd8e5b50d31b">metal::sqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ab3f4d4852ca0e591104fbd8e5b50d31b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a862215a8ddacb086296ba02567c9b158" id="r_a862215a8ddacb086296ba02567c9b158"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a862215a8ddacb086296ba02567c9b158">metal::tan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a862215a8ddacb086296ba02567c9b158" id="r_a862215a8ddacb086296ba02567c9b158"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a862215a8ddacb086296ba02567c9b158">metal::tan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a862215a8ddacb086296ba02567c9b158"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa97fc50bd6addfc6de0aae8570fe963d" id="r_aa97fc50bd6addfc6de0aae8570fe963d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#aa97fc50bd6addfc6de0aae8570fe963d">metal::tanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa97fc50bd6addfc6de0aae8570fe963d" id="r_aa97fc50bd6addfc6de0aae8570fe963d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#aa97fc50bd6addfc6de0aae8570fe963d">metal::tanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa97fc50bd6addfc6de0aae8570fe963d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae2046d163a525fc1822a9ec8a0aeaeb3" id="r_ae2046d163a525fc1822a9ec8a0aeaeb3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae2046d163a525fc1822a9ec8a0aeaeb3">metal::tanpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ae2046d163a525fc1822a9ec8a0aeaeb3" id="r_ae2046d163a525fc1822a9ec8a0aeaeb3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae2046d163a525fc1822a9ec8a0aeaeb3">metal::tanpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ae2046d163a525fc1822a9ec8a0aeaeb3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a93cb75a11a362bfc8310ea19c554c887" id="r_a93cb75a11a362bfc8310ea19c554c887"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887">metal::trunc</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a93cb75a11a362bfc8310ea19c554c887" id="r_a93cb75a11a362bfc8310ea19c554c887"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887">metal::trunc</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a93cb75a11a362bfc8310ea19c554c887"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a90d2973f71f83180e7f02e38d11c7a8f" id="r_a90d2973f71f83180e7f02e38d11c7a8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a90d2973f71f83180e7f02e38d11c7a8f">metal::fast::abs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a90d2973f71f83180e7f02e38d11c7a8f" id="r_a90d2973f71f83180e7f02e38d11c7a8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a90d2973f71f83180e7f02e38d11c7a8f">metal::fast::abs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a90d2973f71f83180e7f02e38d11c7a8f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a805ce5c3a94b618b7349d70bbb82f0b2" id="r_a805ce5c3a94b618b7349d70bbb82f0b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a805ce5c3a94b618b7349d70bbb82f0b2">metal::fast::acos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a805ce5c3a94b618b7349d70bbb82f0b2" id="r_a805ce5c3a94b618b7349d70bbb82f0b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a805ce5c3a94b618b7349d70bbb82f0b2">metal::fast::acos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a805ce5c3a94b618b7349d70bbb82f0b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afb656fc3406649a238b6f1e0509de751" id="r_afb656fc3406649a238b6f1e0509de751"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#afb656fc3406649a238b6f1e0509de751">metal::fast::acosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:afb656fc3406649a238b6f1e0509de751" id="r_afb656fc3406649a238b6f1e0509de751"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#afb656fc3406649a238b6f1e0509de751">metal::fast::acosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:afb656fc3406649a238b6f1e0509de751"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a769455a283da99654b6e42c3acf13eb1" id="r_a769455a283da99654b6e42c3acf13eb1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a769455a283da99654b6e42c3acf13eb1">metal::fast::asin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a769455a283da99654b6e42c3acf13eb1" id="r_a769455a283da99654b6e42c3acf13eb1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a769455a283da99654b6e42c3acf13eb1">metal::fast::asin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a769455a283da99654b6e42c3acf13eb1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4367034b7b3e14310803bb2be975a556" id="r_a4367034b7b3e14310803bb2be975a556"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4367034b7b3e14310803bb2be975a556">metal::fast::asinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a4367034b7b3e14310803bb2be975a556" id="r_a4367034b7b3e14310803bb2be975a556"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4367034b7b3e14310803bb2be975a556">metal::fast::asinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a4367034b7b3e14310803bb2be975a556"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a769503b4b7f89071d0983258c5a3ac5a" id="r_a769503b4b7f89071d0983258c5a3ac5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a769503b4b7f89071d0983258c5a3ac5a">metal::fast::atan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
+<tr class="memitem:a769503b4b7f89071d0983258c5a3ac5a" id="r_a769503b4b7f89071d0983258c5a3ac5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a769503b4b7f89071d0983258c5a3ac5a">metal::fast::atan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
 <tr class="separator:a769503b4b7f89071d0983258c5a3ac5a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a00e687ea46f5affe26e6aef8fd62b89a" id="r_a00e687ea46f5affe26e6aef8fd62b89a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a00e687ea46f5affe26e6aef8fd62b89a">metal::fast::atan2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a00e687ea46f5affe26e6aef8fd62b89a" id="r_a00e687ea46f5affe26e6aef8fd62b89a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a00e687ea46f5affe26e6aef8fd62b89a">metal::fast::atan2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a00e687ea46f5affe26e6aef8fd62b89a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af24608fc605db9a14427d37c36dc1c53" id="r_af24608fc605db9a14427d37c36dc1c53"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#af24608fc605db9a14427d37c36dc1c53">metal::fast::atanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:af24608fc605db9a14427d37c36dc1c53" id="r_af24608fc605db9a14427d37c36dc1c53"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#af24608fc605db9a14427d37c36dc1c53">metal::fast::atanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:af24608fc605db9a14427d37c36dc1c53"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a97b0bbd79f1f45d9d3104d712914e6b8" id="r_a97b0bbd79f1f45d9d3104d712914e6b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a97b0bbd79f1f45d9d3104d712914e6b8">metal::fast::ceil</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a97b0bbd79f1f45d9d3104d712914e6b8" id="r_a97b0bbd79f1f45d9d3104d712914e6b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a97b0bbd79f1f45d9d3104d712914e6b8">metal::fast::ceil</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a97b0bbd79f1f45d9d3104d712914e6b8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a75b6bb32fa3870eda46a7bfc9f481f88" id="r_a75b6bb32fa3870eda46a7bfc9f481f88"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88">metal::fast::cos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a75b6bb32fa3870eda46a7bfc9f481f88" id="r_a75b6bb32fa3870eda46a7bfc9f481f88"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88">metal::fast::cos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a75b6bb32fa3870eda46a7bfc9f481f88"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a31544ad9de28012a4ddda86e3966a77e" id="r_a31544ad9de28012a4ddda86e3966a77e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a31544ad9de28012a4ddda86e3966a77e">metal::fast::cosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a31544ad9de28012a4ddda86e3966a77e" id="r_a31544ad9de28012a4ddda86e3966a77e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a31544ad9de28012a4ddda86e3966a77e">metal::fast::cosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a31544ad9de28012a4ddda86e3966a77e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a9906b41f75319b384ffb570cc94d67ce" id="r_a9906b41f75319b384ffb570cc94d67ce"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a9906b41f75319b384ffb570cc94d67ce">metal::fast::cospi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a9906b41f75319b384ffb570cc94d67ce" id="r_a9906b41f75319b384ffb570cc94d67ce"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a9906b41f75319b384ffb570cc94d67ce">metal::fast::cospi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a9906b41f75319b384ffb570cc94d67ce"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae70bc2185e4649369cf7b15f5e1d48be" id="r_ae70bc2185e4649369cf7b15f5e1d48be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ae70bc2185e4649369cf7b15f5e1d48be">metal::fast::divide</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ae70bc2185e4649369cf7b15f5e1d48be" id="r_ae70bc2185e4649369cf7b15f5e1d48be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ae70bc2185e4649369cf7b15f5e1d48be">metal::fast::divide</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ae70bc2185e4649369cf7b15f5e1d48be"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad3dbd387b63373c29e3449609f763ede" id="r_ad3dbd387b63373c29e3449609f763ede"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">metal::fast::exp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ad3dbd387b63373c29e3449609f763ede" id="r_ad3dbd387b63373c29e3449609f763ede"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">metal::fast::exp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ad3dbd387b63373c29e3449609f763ede"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a453122f982485cbb4e471b3ac282ee5e" id="r_a453122f982485cbb4e471b3ac282ee5e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a453122f982485cbb4e471b3ac282ee5e">metal::fast::exp10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a453122f982485cbb4e471b3ac282ee5e" id="r_a453122f982485cbb4e471b3ac282ee5e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a453122f982485cbb4e471b3ac282ee5e">metal::fast::exp10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a453122f982485cbb4e471b3ac282ee5e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac092b65a46720adaf22f6266671d2d71" id="r_ac092b65a46720adaf22f6266671d2d71"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ac092b65a46720adaf22f6266671d2d71">metal::fast::exp2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ac092b65a46720adaf22f6266671d2d71" id="r_ac092b65a46720adaf22f6266671d2d71"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ac092b65a46720adaf22f6266671d2d71">metal::fast::exp2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ac092b65a46720adaf22f6266671d2d71"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a129fbd68c9df1a437e8959a25187f554" id="r_a129fbd68c9df1a437e8959a25187f554"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a129fbd68c9df1a437e8959a25187f554">metal::fast::fabs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a129fbd68c9df1a437e8959a25187f554" id="r_a129fbd68c9df1a437e8959a25187f554"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a129fbd68c9df1a437e8959a25187f554">metal::fast::fabs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a129fbd68c9df1a437e8959a25187f554"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a667df76100d5ea0ce5860ddae3e5a00b" id="r_a667df76100d5ea0ce5860ddae3e5a00b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a667df76100d5ea0ce5860ddae3e5a00b">metal::fast::fdim</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a667df76100d5ea0ce5860ddae3e5a00b" id="r_a667df76100d5ea0ce5860ddae3e5a00b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a667df76100d5ea0ce5860ddae3e5a00b">metal::fast::fdim</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a667df76100d5ea0ce5860ddae3e5a00b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac012ce1701c2339914f15cce9f2c632f" id="r_ac012ce1701c2339914f15cce9f2c632f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ac012ce1701c2339914f15cce9f2c632f">metal::fast::floor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ac012ce1701c2339914f15cce9f2c632f" id="r_ac012ce1701c2339914f15cce9f2c632f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ac012ce1701c2339914f15cce9f2c632f">metal::fast::floor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ac012ce1701c2339914f15cce9f2c632f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aebcd6e951da6f7157ec219eb7a8f1ddd" id="r_aebcd6e951da6f7157ec219eb7a8f1ddd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aebcd6e951da6f7157ec219eb7a8f1ddd">metal::fast::fma</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:aebcd6e951da6f7157ec219eb7a8f1ddd" id="r_aebcd6e951da6f7157ec219eb7a8f1ddd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aebcd6e951da6f7157ec219eb7a8f1ddd">metal::fast::fma</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:aebcd6e951da6f7157ec219eb7a8f1ddd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a26e3257cf877154f8a0d434be0bdb034" id="r_a26e3257cf877154f8a0d434be0bdb034"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a26e3257cf877154f8a0d434be0bdb034">metal::fast::fmax</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a26e3257cf877154f8a0d434be0bdb034" id="r_a26e3257cf877154f8a0d434be0bdb034"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a26e3257cf877154f8a0d434be0bdb034">metal::fast::fmax</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a26e3257cf877154f8a0d434be0bdb034"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5c6a3a389f348e1f92e8392b765a32c7" id="r_a5c6a3a389f348e1f92e8392b765a32c7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a5c6a3a389f348e1f92e8392b765a32c7">metal::fast::fmax3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a5c6a3a389f348e1f92e8392b765a32c7" id="r_a5c6a3a389f348e1f92e8392b765a32c7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a5c6a3a389f348e1f92e8392b765a32c7">metal::fast::fmax3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a5c6a3a389f348e1f92e8392b765a32c7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a923869181c3f576f2d86fba5bfa85633" id="r_a923869181c3f576f2d86fba5bfa85633"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a923869181c3f576f2d86fba5bfa85633">metal::fast::fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a923869181c3f576f2d86fba5bfa85633" id="r_a923869181c3f576f2d86fba5bfa85633"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a923869181c3f576f2d86fba5bfa85633">metal::fast::fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a923869181c3f576f2d86fba5bfa85633"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7e202ec52bf12bfabdf2265b300acbfa" id="r_a7e202ec52bf12bfabdf2265b300acbfa"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a7e202ec52bf12bfabdf2265b300acbfa">metal::fast::fmin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a7e202ec52bf12bfabdf2265b300acbfa" id="r_a7e202ec52bf12bfabdf2265b300acbfa"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a7e202ec52bf12bfabdf2265b300acbfa">metal::fast::fmin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a7e202ec52bf12bfabdf2265b300acbfa"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a9531c6a4a520927523961e6eb6b94c1a" id="r_a9531c6a4a520927523961e6eb6b94c1a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a9531c6a4a520927523961e6eb6b94c1a">metal::fast::fmin3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a9531c6a4a520927523961e6eb6b94c1a" id="r_a9531c6a4a520927523961e6eb6b94c1a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a9531c6a4a520927523961e6eb6b94c1a">metal::fast::fmin3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a9531c6a4a520927523961e6eb6b94c1a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adbec09f18a89f773d7e368ef04a69526" id="r_adbec09f18a89f773d7e368ef04a69526"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#adbec09f18a89f773d7e368ef04a69526">metal::fast::fmod</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:adbec09f18a89f773d7e368ef04a69526" id="r_adbec09f18a89f773d7e368ef04a69526"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#adbec09f18a89f773d7e368ef04a69526">metal::fast::fmod</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:adbec09f18a89f773d7e368ef04a69526"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa8bb448827503e485eb649eb3edb2d4c" id="r_aa8bb448827503e485eb649eb3edb2d4c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa8bb448827503e485eb649eb3edb2d4c">metal::fast::fract</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa8bb448827503e485eb649eb3edb2d4c" id="r_aa8bb448827503e485eb649eb3edb2d4c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa8bb448827503e485eb649eb3edb2d4c">metal::fast::fract</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa8bb448827503e485eb649eb3edb2d4c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a23902df22aeaa859ef673a36381387c2" id="r_a23902df22aeaa859ef673a36381387c2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a23902df22aeaa859ef673a36381387c2">metal::fast::frexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">exp</a>)</td></tr>
+<tr class="memitem:a23902df22aeaa859ef673a36381387c2" id="r_a23902df22aeaa859ef673a36381387c2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a23902df22aeaa859ef673a36381387c2">metal::fast::frexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">exp</a>)</td></tr>
 <tr class="separator:a23902df22aeaa859ef673a36381387c2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adb045765987e76c7ad4b511fab0c867e" id="r_adb045765987e76c7ad4b511fab0c867e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#adb045765987e76c7ad4b511fab0c867e">metal::fast::ldexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
+<tr class="memitem:adb045765987e76c7ad4b511fab0c867e" id="r_adb045765987e76c7ad4b511fab0c867e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#adb045765987e76c7ad4b511fab0c867e">metal::fast::ldexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
 <tr class="separator:adb045765987e76c7ad4b511fab0c867e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aef942e7f9e5c2e58c58644ab1bdd58d1" id="r_aef942e7f9e5c2e58c58644ab1bdd58d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aef942e7f9e5c2e58c58644ab1bdd58d1">metal::fast::log</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aef942e7f9e5c2e58c58644ab1bdd58d1" id="r_aef942e7f9e5c2e58c58644ab1bdd58d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aef942e7f9e5c2e58c58644ab1bdd58d1">metal::fast::log</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aef942e7f9e5c2e58c58644ab1bdd58d1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0d1150cf2deee5100a7ea2988b3bb39e" id="r_a0d1150cf2deee5100a7ea2988b3bb39e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a0d1150cf2deee5100a7ea2988b3bb39e">metal::fast::log10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a0d1150cf2deee5100a7ea2988b3bb39e" id="r_a0d1150cf2deee5100a7ea2988b3bb39e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a0d1150cf2deee5100a7ea2988b3bb39e">metal::fast::log10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a0d1150cf2deee5100a7ea2988b3bb39e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a986ef245dd433ae62af864f5cbb07118" id="r_a986ef245dd433ae62af864f5cbb07118"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a986ef245dd433ae62af864f5cbb07118">metal::fast::log2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a986ef245dd433ae62af864f5cbb07118" id="r_a986ef245dd433ae62af864f5cbb07118"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a986ef245dd433ae62af864f5cbb07118">metal::fast::log2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a986ef245dd433ae62af864f5cbb07118"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a747e2e58092a27fb8b4dd3d16934fb52" id="r_a747e2e58092a27fb8b4dd3d16934fb52"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a747e2e58092a27fb8b4dd3d16934fb52">metal::fast::max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a747e2e58092a27fb8b4dd3d16934fb52" id="r_a747e2e58092a27fb8b4dd3d16934fb52"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a747e2e58092a27fb8b4dd3d16934fb52">metal::fast::max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a747e2e58092a27fb8b4dd3d16934fb52"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6fc2cf18ffa8149561864c86dba0f803" id="r_a6fc2cf18ffa8149561864c86dba0f803"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a6fc2cf18ffa8149561864c86dba0f803">metal::fast::max3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a6fc2cf18ffa8149561864c86dba0f803" id="r_a6fc2cf18ffa8149561864c86dba0f803"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a6fc2cf18ffa8149561864c86dba0f803">metal::fast::max3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a6fc2cf18ffa8149561864c86dba0f803"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a742b55f1e4369921ee7f60d70185bfbc" id="r_a742b55f1e4369921ee7f60d70185bfbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a742b55f1e4369921ee7f60d70185bfbc">metal::fast::median3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a742b55f1e4369921ee7f60d70185bfbc" id="r_a742b55f1e4369921ee7f60d70185bfbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a742b55f1e4369921ee7f60d70185bfbc">metal::fast::median3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a742b55f1e4369921ee7f60d70185bfbc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3e958e56a4712687c381a0b64d123e61" id="r_a3e958e56a4712687c381a0b64d123e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61">metal::fast::min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a3e958e56a4712687c381a0b64d123e61" id="r_a3e958e56a4712687c381a0b64d123e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61">metal::fast::min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a3e958e56a4712687c381a0b64d123e61"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a606a4c1b34ce05ea89ca5af81724036f" id="r_a606a4c1b34ce05ea89ca5af81724036f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f">metal::fast::min3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a606a4c1b34ce05ea89ca5af81724036f" id="r_a606a4c1b34ce05ea89ca5af81724036f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f">metal::fast::min3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a606a4c1b34ce05ea89ca5af81724036f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4583e8be04fc0bd475b97b0934604f23" id="r_a4583e8be04fc0bd475b97b0934604f23"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4583e8be04fc0bd475b97b0934604f23">metal::fast::nextafter</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a4583e8be04fc0bd475b97b0934604f23" id="r_a4583e8be04fc0bd475b97b0934604f23"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4583e8be04fc0bd475b97b0934604f23">metal::fast::nextafter</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a4583e8be04fc0bd475b97b0934604f23"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ade2367eaec894bd2e14a1351c363e003" id="r_ade2367eaec894bd2e14a1351c363e003"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ade2367eaec894bd2e14a1351c363e003">metal::fast::pow</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ade2367eaec894bd2e14a1351c363e003" id="r_ade2367eaec894bd2e14a1351c363e003"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ade2367eaec894bd2e14a1351c363e003">metal::fast::pow</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ade2367eaec894bd2e14a1351c363e003"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4293cbc94175b4dcc724fe4747eb5d5a" id="r_a4293cbc94175b4dcc724fe4747eb5d5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4293cbc94175b4dcc724fe4747eb5d5a">metal::fast::powr</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a4293cbc94175b4dcc724fe4747eb5d5a" id="r_a4293cbc94175b4dcc724fe4747eb5d5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4293cbc94175b4dcc724fe4747eb5d5a">metal::fast::powr</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a4293cbc94175b4dcc724fe4747eb5d5a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa613bc252f8d8069e175ec9e9d05a7ec" id="r_aa613bc252f8d8069e175ec9e9d05a7ec"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa613bc252f8d8069e175ec9e9d05a7ec">metal::fast::rint</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa613bc252f8d8069e175ec9e9d05a7ec" id="r_aa613bc252f8d8069e175ec9e9d05a7ec"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa613bc252f8d8069e175ec9e9d05a7ec">metal::fast::rint</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa613bc252f8d8069e175ec9e9d05a7ec"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4cb687257a004726d49e496417eaa40f" id="r_a4cb687257a004726d49e496417eaa40f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4cb687257a004726d49e496417eaa40f">metal::fast::round</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a4cb687257a004726d49e496417eaa40f" id="r_a4cb687257a004726d49e496417eaa40f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4cb687257a004726d49e496417eaa40f">metal::fast::round</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a4cb687257a004726d49e496417eaa40f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa62097c750f1e4b69d09277f19976ab1" id="r_aa62097c750f1e4b69d09277f19976ab1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa62097c750f1e4b69d09277f19976ab1">metal::fast::rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa62097c750f1e4b69d09277f19976ab1" id="r_aa62097c750f1e4b69d09277f19976ab1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa62097c750f1e4b69d09277f19976ab1">metal::fast::rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa62097c750f1e4b69d09277f19976ab1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3af771cfe7a135104f9d063147dba270" id="r_a3af771cfe7a135104f9d063147dba270"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270">metal::fast::sin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a3af771cfe7a135104f9d063147dba270" id="r_a3af771cfe7a135104f9d063147dba270"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270">metal::fast::sin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a3af771cfe7a135104f9d063147dba270"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a990d90b3440e38d1fb4ff5065c6c189b" id="r_a990d90b3440e38d1fb4ff5065c6c189b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a990d90b3440e38d1fb4ff5065c6c189b">metal::fast::sinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a990d90b3440e38d1fb4ff5065c6c189b" id="r_a990d90b3440e38d1fb4ff5065c6c189b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a990d90b3440e38d1fb4ff5065c6c189b">metal::fast::sinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a990d90b3440e38d1fb4ff5065c6c189b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab07a32fe544aa304577d29e0251e87b2" id="r_ab07a32fe544aa304577d29e0251e87b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ab07a32fe544aa304577d29e0251e87b2">metal::fast::sinpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ab07a32fe544aa304577d29e0251e87b2" id="r_ab07a32fe544aa304577d29e0251e87b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ab07a32fe544aa304577d29e0251e87b2">metal::fast::sinpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ab07a32fe544aa304577d29e0251e87b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4218a85c7d8a74cb8055b4755205627e" id="r_a4218a85c7d8a74cb8055b4755205627e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4218a85c7d8a74cb8055b4755205627e">metal::fast::sqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a4218a85c7d8a74cb8055b4755205627e" id="r_a4218a85c7d8a74cb8055b4755205627e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a4218a85c7d8a74cb8055b4755205627e">metal::fast::sqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a4218a85c7d8a74cb8055b4755205627e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae34754afa152a6170ac2ae3294174506" id="r_ae34754afa152a6170ac2ae3294174506"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ae34754afa152a6170ac2ae3294174506">metal::fast::tan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ae34754afa152a6170ac2ae3294174506" id="r_ae34754afa152a6170ac2ae3294174506"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#ae34754afa152a6170ac2ae3294174506">metal::fast::tan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ae34754afa152a6170ac2ae3294174506"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a13e6e6ae087b7c558e9a94ddbc864d43" id="r_a13e6e6ae087b7c558e9a94ddbc864d43"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a13e6e6ae087b7c558e9a94ddbc864d43">metal::fast::tanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a13e6e6ae087b7c558e9a94ddbc864d43" id="r_a13e6e6ae087b7c558e9a94ddbc864d43"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a13e6e6ae087b7c558e9a94ddbc864d43">metal::fast::tanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a13e6e6ae087b7c558e9a94ddbc864d43"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a39b2952d4adf1400016c63243798aaf8" id="r_a39b2952d4adf1400016c63243798aaf8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a39b2952d4adf1400016c63243798aaf8">metal::fast::tanpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a39b2952d4adf1400016c63243798aaf8" id="r_a39b2952d4adf1400016c63243798aaf8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#a39b2952d4adf1400016c63243798aaf8">metal::fast::tanpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a39b2952d4adf1400016c63243798aaf8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa62e1075e86c626d97038f16e9433415" id="r_aa62e1075e86c626d97038f16e9433415"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415">metal::fast::trunc</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa62e1075e86c626d97038f16e9433415" id="r_aa62e1075e86c626d97038f16e9433415"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415">metal::fast::trunc</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa62e1075e86c626d97038f16e9433415"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a99f2b2746e813b9ca7b4249afbaf2a14" id="r_a99f2b2746e813b9ca7b4249afbaf2a14"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a99f2b2746e813b9ca7b4249afbaf2a14">metal::precise::abs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a99f2b2746e813b9ca7b4249afbaf2a14" id="r_a99f2b2746e813b9ca7b4249afbaf2a14"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a99f2b2746e813b9ca7b4249afbaf2a14">metal::precise::abs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a99f2b2746e813b9ca7b4249afbaf2a14"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8a2bcc89fc0b7e74f0453f82f89a8604" id="r_a8a2bcc89fc0b7e74f0453f82f89a8604"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8a2bcc89fc0b7e74f0453f82f89a8604">metal::precise::acos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8a2bcc89fc0b7e74f0453f82f89a8604" id="r_a8a2bcc89fc0b7e74f0453f82f89a8604"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8a2bcc89fc0b7e74f0453f82f89a8604">metal::precise::acos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8a2bcc89fc0b7e74f0453f82f89a8604"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1f489fabffab969b8677b56bb1136067" id="r_a1f489fabffab969b8677b56bb1136067"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a1f489fabffab969b8677b56bb1136067">metal::precise::acosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a1f489fabffab969b8677b56bb1136067" id="r_a1f489fabffab969b8677b56bb1136067"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a1f489fabffab969b8677b56bb1136067">metal::precise::acosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a1f489fabffab969b8677b56bb1136067"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adc7b8b6e12e320cb32030f728dcbf438" id="r_adc7b8b6e12e320cb32030f728dcbf438"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#adc7b8b6e12e320cb32030f728dcbf438">metal::precise::asin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:adc7b8b6e12e320cb32030f728dcbf438" id="r_adc7b8b6e12e320cb32030f728dcbf438"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#adc7b8b6e12e320cb32030f728dcbf438">metal::precise::asin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:adc7b8b6e12e320cb32030f728dcbf438"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aaad1cdde6687c8011fbc5fda1bb13424" id="r_aaad1cdde6687c8011fbc5fda1bb13424"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aaad1cdde6687c8011fbc5fda1bb13424">metal::precise::asinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aaad1cdde6687c8011fbc5fda1bb13424" id="r_aaad1cdde6687c8011fbc5fda1bb13424"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aaad1cdde6687c8011fbc5fda1bb13424">metal::precise::asinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aaad1cdde6687c8011fbc5fda1bb13424"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aaaf4b5f4786a912089bbf0ae7619a6be" id="r_aaaf4b5f4786a912089bbf0ae7619a6be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aaaf4b5f4786a912089bbf0ae7619a6be">metal::precise::atan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
+<tr class="memitem:aaaf4b5f4786a912089bbf0ae7619a6be" id="r_aaaf4b5f4786a912089bbf0ae7619a6be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aaaf4b5f4786a912089bbf0ae7619a6be">metal::precise::atan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
 <tr class="separator:aaaf4b5f4786a912089bbf0ae7619a6be"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6f161b049cc6884f87b09b33c2d1cd7f" id="r_a6f161b049cc6884f87b09b33c2d1cd7f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a6f161b049cc6884f87b09b33c2d1cd7f">metal::precise::atan2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a6f161b049cc6884f87b09b33c2d1cd7f" id="r_a6f161b049cc6884f87b09b33c2d1cd7f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a6f161b049cc6884f87b09b33c2d1cd7f">metal::precise::atan2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a6f161b049cc6884f87b09b33c2d1cd7f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a902994837653b90c47f4285673e712c4" id="r_a902994837653b90c47f4285673e712c4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a902994837653b90c47f4285673e712c4">metal::precise::atanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a902994837653b90c47f4285673e712c4" id="r_a902994837653b90c47f4285673e712c4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a902994837653b90c47f4285673e712c4">metal::precise::atanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a902994837653b90c47f4285673e712c4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8577549a1afeea206dd9a2004af2868d" id="r_a8577549a1afeea206dd9a2004af2868d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8577549a1afeea206dd9a2004af2868d">metal::precise::ceil</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8577549a1afeea206dd9a2004af2868d" id="r_a8577549a1afeea206dd9a2004af2868d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8577549a1afeea206dd9a2004af2868d">metal::precise::ceil</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8577549a1afeea206dd9a2004af2868d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac4941f62e7d8ab9d7cabbd967aa9f220" id="r_ac4941f62e7d8ab9d7cabbd967aa9f220"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220">metal::precise::cos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ac4941f62e7d8ab9d7cabbd967aa9f220" id="r_ac4941f62e7d8ab9d7cabbd967aa9f220"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220">metal::precise::cos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ac4941f62e7d8ab9d7cabbd967aa9f220"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a72d86d508300a9b58f4ccbbe70da4fbc" id="r_a72d86d508300a9b58f4ccbbe70da4fbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc">metal::precise::cosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a72d86d508300a9b58f4ccbbe70da4fbc" id="r_a72d86d508300a9b58f4ccbbe70da4fbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc">metal::precise::cosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a72d86d508300a9b58f4ccbbe70da4fbc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2392b78bd196efdbbac65901c4ab20e7" id="r_a2392b78bd196efdbbac65901c4ab20e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a2392b78bd196efdbbac65901c4ab20e7">metal::precise::cospi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a2392b78bd196efdbbac65901c4ab20e7" id="r_a2392b78bd196efdbbac65901c4ab20e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a2392b78bd196efdbbac65901c4ab20e7">metal::precise::cospi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a2392b78bd196efdbbac65901c4ab20e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aec0982cdb96a08b61f51129150d82e9d" id="r_aec0982cdb96a08b61f51129150d82e9d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aec0982cdb96a08b61f51129150d82e9d">metal::precise::divide</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:aec0982cdb96a08b61f51129150d82e9d" id="r_aec0982cdb96a08b61f51129150d82e9d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aec0982cdb96a08b61f51129150d82e9d">metal::precise::divide</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:aec0982cdb96a08b61f51129150d82e9d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8d8d2d5700ce432b33cf47cf22528e8f" id="r_a8d8d2d5700ce432b33cf47cf22528e8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8d8d2d5700ce432b33cf47cf22528e8f">metal::precise::exp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8d8d2d5700ce432b33cf47cf22528e8f" id="r_a8d8d2d5700ce432b33cf47cf22528e8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8d8d2d5700ce432b33cf47cf22528e8f">metal::precise::exp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8d8d2d5700ce432b33cf47cf22528e8f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af9addb343c967da3a83e9e123a8521fd" id="r_af9addb343c967da3a83e9e123a8521fd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#af9addb343c967da3a83e9e123a8521fd">metal::precise::exp10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:af9addb343c967da3a83e9e123a8521fd" id="r_af9addb343c967da3a83e9e123a8521fd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#af9addb343c967da3a83e9e123a8521fd">metal::precise::exp10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:af9addb343c967da3a83e9e123a8521fd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a92a880bd2197efc0da0f8f0f7ec1e4c9" id="r_a92a880bd2197efc0da0f8f0f7ec1e4c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a92a880bd2197efc0da0f8f0f7ec1e4c9">metal::precise::exp2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a92a880bd2197efc0da0f8f0f7ec1e4c9" id="r_a92a880bd2197efc0da0f8f0f7ec1e4c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a92a880bd2197efc0da0f8f0f7ec1e4c9">metal::precise::exp2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a92a880bd2197efc0da0f8f0f7ec1e4c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae4c71d8bc8ef291036a7aaa05f8be3d1" id="r_ae4c71d8bc8ef291036a7aaa05f8be3d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ae4c71d8bc8ef291036a7aaa05f8be3d1">metal::precise::fabs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ae4c71d8bc8ef291036a7aaa05f8be3d1" id="r_ae4c71d8bc8ef291036a7aaa05f8be3d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ae4c71d8bc8ef291036a7aaa05f8be3d1">metal::precise::fabs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ae4c71d8bc8ef291036a7aaa05f8be3d1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af693e7c93de446e80dd1377f5e9e7260" id="r_af693e7c93de446e80dd1377f5e9e7260"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#af693e7c93de446e80dd1377f5e9e7260">metal::precise::fdim</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:af693e7c93de446e80dd1377f5e9e7260" id="r_af693e7c93de446e80dd1377f5e9e7260"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#af693e7c93de446e80dd1377f5e9e7260">metal::precise::fdim</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:af693e7c93de446e80dd1377f5e9e7260"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a66e02b028e3cecfe7c80773460dc7925" id="r_a66e02b028e3cecfe7c80773460dc7925"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a66e02b028e3cecfe7c80773460dc7925">metal::precise::floor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a66e02b028e3cecfe7c80773460dc7925" id="r_a66e02b028e3cecfe7c80773460dc7925"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a66e02b028e3cecfe7c80773460dc7925">metal::precise::floor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a66e02b028e3cecfe7c80773460dc7925"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a49391a64d6b66fe3a212516b316a2144" id="r_a49391a64d6b66fe3a212516b316a2144"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a49391a64d6b66fe3a212516b316a2144">metal::precise::fma</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a49391a64d6b66fe3a212516b316a2144" id="r_a49391a64d6b66fe3a212516b316a2144"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a49391a64d6b66fe3a212516b316a2144">metal::precise::fma</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a49391a64d6b66fe3a212516b316a2144"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac7d49f921c2883caf9eec66efc4de1cd" id="r_ac7d49f921c2883caf9eec66efc4de1cd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac7d49f921c2883caf9eec66efc4de1cd">metal::precise::fmax</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ac7d49f921c2883caf9eec66efc4de1cd" id="r_ac7d49f921c2883caf9eec66efc4de1cd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac7d49f921c2883caf9eec66efc4de1cd">metal::precise::fmax</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ac7d49f921c2883caf9eec66efc4de1cd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adf750e51bd83d569994d0967029e3bdc" id="r_adf750e51bd83d569994d0967029e3bdc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#adf750e51bd83d569994d0967029e3bdc">metal::precise::fmax3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:adf750e51bd83d569994d0967029e3bdc" id="r_adf750e51bd83d569994d0967029e3bdc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#adf750e51bd83d569994d0967029e3bdc">metal::precise::fmax3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:adf750e51bd83d569994d0967029e3bdc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a48d1d0be889de4043b775bb6b030a989" id="r_a48d1d0be889de4043b775bb6b030a989"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a48d1d0be889de4043b775bb6b030a989">metal::precise::fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a48d1d0be889de4043b775bb6b030a989" id="r_a48d1d0be889de4043b775bb6b030a989"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a48d1d0be889de4043b775bb6b030a989">metal::precise::fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a48d1d0be889de4043b775bb6b030a989"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a18df8eb481dfa56c92ad31b5bab8e069" id="r_a18df8eb481dfa56c92ad31b5bab8e069"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a18df8eb481dfa56c92ad31b5bab8e069">metal::precise::fmin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a18df8eb481dfa56c92ad31b5bab8e069" id="r_a18df8eb481dfa56c92ad31b5bab8e069"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a18df8eb481dfa56c92ad31b5bab8e069">metal::precise::fmin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a18df8eb481dfa56c92ad31b5bab8e069"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5bb710e6742996d32225a8f54a0f116c" id="r_a5bb710e6742996d32225a8f54a0f116c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a5bb710e6742996d32225a8f54a0f116c">metal::precise::fmin3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a5bb710e6742996d32225a8f54a0f116c" id="r_a5bb710e6742996d32225a8f54a0f116c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a5bb710e6742996d32225a8f54a0f116c">metal::precise::fmin3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a5bb710e6742996d32225a8f54a0f116c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa99937178a1fc8158054e328eeeae648" id="r_aa99937178a1fc8158054e328eeeae648"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aa99937178a1fc8158054e328eeeae648">metal::precise::fmod</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:aa99937178a1fc8158054e328eeeae648" id="r_aa99937178a1fc8158054e328eeeae648"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aa99937178a1fc8158054e328eeeae648">metal::precise::fmod</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:aa99937178a1fc8158054e328eeeae648"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0f21c19332a90df1a8ff507a813b5757" id="r_a0f21c19332a90df1a8ff507a813b5757"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a0f21c19332a90df1a8ff507a813b5757">metal::precise::fract</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a0f21c19332a90df1a8ff507a813b5757" id="r_a0f21c19332a90df1a8ff507a813b5757"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a0f21c19332a90df1a8ff507a813b5757">metal::precise::fract</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a0f21c19332a90df1a8ff507a813b5757"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0fbb1624c308b97380f894f92fd858b4" id="r_a0fbb1624c308b97380f894f92fd858b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a0fbb1624c308b97380f894f92fd858b4">metal::precise::frexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="namespacemetal_1_1precise.html#a8d8d2d5700ce432b33cf47cf22528e8f">exp</a>)</td></tr>
+<tr class="memitem:a0fbb1624c308b97380f894f92fd858b4" id="r_a0fbb1624c308b97380f894f92fd858b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a0fbb1624c308b97380f894f92fd858b4">metal::precise::frexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="namespacemetal_1_1precise.html#a8d8d2d5700ce432b33cf47cf22528e8f">exp</a>)</td></tr>
 <tr class="separator:a0fbb1624c308b97380f894f92fd858b4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa0462827a08a9f475fdaeb104c98b6ab" id="r_aa0462827a08a9f475fdaeb104c98b6ab"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aa0462827a08a9f475fdaeb104c98b6ab">metal::precise::ldexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
+<tr class="memitem:aa0462827a08a9f475fdaeb104c98b6ab" id="r_aa0462827a08a9f475fdaeb104c98b6ab"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#aa0462827a08a9f475fdaeb104c98b6ab">metal::precise::ldexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
 <tr class="separator:aa0462827a08a9f475fdaeb104c98b6ab"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a341c2b8c27d1bed860f85f8b355023d4" id="r_a341c2b8c27d1bed860f85f8b355023d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a341c2b8c27d1bed860f85f8b355023d4">metal::precise::log</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a341c2b8c27d1bed860f85f8b355023d4" id="r_a341c2b8c27d1bed860f85f8b355023d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a341c2b8c27d1bed860f85f8b355023d4">metal::precise::log</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a341c2b8c27d1bed860f85f8b355023d4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a44239067e8e9248b1574353f98e94d72" id="r_a44239067e8e9248b1574353f98e94d72"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a44239067e8e9248b1574353f98e94d72">metal::precise::log10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a44239067e8e9248b1574353f98e94d72" id="r_a44239067e8e9248b1574353f98e94d72"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a44239067e8e9248b1574353f98e94d72">metal::precise::log10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a44239067e8e9248b1574353f98e94d72"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a632dbbdcc1a465cf4739a14306147573" id="r_a632dbbdcc1a465cf4739a14306147573"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a632dbbdcc1a465cf4739a14306147573">metal::precise::log2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a632dbbdcc1a465cf4739a14306147573" id="r_a632dbbdcc1a465cf4739a14306147573"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a632dbbdcc1a465cf4739a14306147573">metal::precise::log2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a632dbbdcc1a465cf4739a14306147573"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6a954a4e4e3753303d1dc734855a185f" id="r_a6a954a4e4e3753303d1dc734855a185f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a6a954a4e4e3753303d1dc734855a185f">metal::precise::max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a6a954a4e4e3753303d1dc734855a185f" id="r_a6a954a4e4e3753303d1dc734855a185f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a6a954a4e4e3753303d1dc734855a185f">metal::precise::max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a6a954a4e4e3753303d1dc734855a185f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac490e8614ebd2c9343af1ae6c0d4e82c" id="r_ac490e8614ebd2c9343af1ae6c0d4e82c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac490e8614ebd2c9343af1ae6c0d4e82c">metal::precise::max3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:ac490e8614ebd2c9343af1ae6c0d4e82c" id="r_ac490e8614ebd2c9343af1ae6c0d4e82c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac490e8614ebd2c9343af1ae6c0d4e82c">metal::precise::max3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:ac490e8614ebd2c9343af1ae6c0d4e82c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a14555ff99c4388493fec48e070144ae2" id="r_a14555ff99c4388493fec48e070144ae2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a14555ff99c4388493fec48e070144ae2">metal::precise::median3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a14555ff99c4388493fec48e070144ae2" id="r_a14555ff99c4388493fec48e070144ae2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a14555ff99c4388493fec48e070144ae2">metal::precise::median3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a14555ff99c4388493fec48e070144ae2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afed0da2f7df3505b5dffa2389c3cb36e" id="r_afed0da2f7df3505b5dffa2389c3cb36e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e">metal::precise::min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:afed0da2f7df3505b5dffa2389c3cb36e" id="r_afed0da2f7df3505b5dffa2389c3cb36e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e">metal::precise::min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:afed0da2f7df3505b5dffa2389c3cb36e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4d37ce31c3549ca4772a4ee29798e231" id="r_a4d37ce31c3549ca4772a4ee29798e231"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231">metal::precise::min3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a4d37ce31c3549ca4772a4ee29798e231" id="r_a4d37ce31c3549ca4772a4ee29798e231"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231">metal::precise::min3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a4d37ce31c3549ca4772a4ee29798e231"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad012ceeb55b77f1533749b351331e026" id="r_ad012ceeb55b77f1533749b351331e026"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ad012ceeb55b77f1533749b351331e026">metal::precise::nextafter</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ad012ceeb55b77f1533749b351331e026" id="r_ad012ceeb55b77f1533749b351331e026"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ad012ceeb55b77f1533749b351331e026">metal::precise::nextafter</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ad012ceeb55b77f1533749b351331e026"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4cce64f1f20c1c6dfd29115bdb7c8d42" id="r_a4cce64f1f20c1c6dfd29115bdb7c8d42"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a4cce64f1f20c1c6dfd29115bdb7c8d42">metal::precise::pow</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a4cce64f1f20c1c6dfd29115bdb7c8d42" id="r_a4cce64f1f20c1c6dfd29115bdb7c8d42"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a4cce64f1f20c1c6dfd29115bdb7c8d42">metal::precise::pow</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a4cce64f1f20c1c6dfd29115bdb7c8d42"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac9dbab0bd99b2b94e364aba5353bdcd7" id="r_ac9dbab0bd99b2b94e364aba5353bdcd7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac9dbab0bd99b2b94e364aba5353bdcd7">metal::precise::powr</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ac9dbab0bd99b2b94e364aba5353bdcd7" id="r_ac9dbab0bd99b2b94e364aba5353bdcd7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ac9dbab0bd99b2b94e364aba5353bdcd7">metal::precise::powr</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ac9dbab0bd99b2b94e364aba5353bdcd7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab17bd408098270ad92f37bcd1039c254" id="r_ab17bd408098270ad92f37bcd1039c254"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ab17bd408098270ad92f37bcd1039c254">metal::precise::rint</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ab17bd408098270ad92f37bcd1039c254" id="r_ab17bd408098270ad92f37bcd1039c254"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#ab17bd408098270ad92f37bcd1039c254">metal::precise::rint</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ab17bd408098270ad92f37bcd1039c254"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5295ab08055d12534cc3775da855ac12" id="r_a5295ab08055d12534cc3775da855ac12"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a5295ab08055d12534cc3775da855ac12">metal::precise::round</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a5295ab08055d12534cc3775da855ac12" id="r_a5295ab08055d12534cc3775da855ac12"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a5295ab08055d12534cc3775da855ac12">metal::precise::round</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a5295ab08055d12534cc3775da855ac12"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afb397b477745f12a44423934fa2b05ac" id="r_afb397b477745f12a44423934fa2b05ac"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac">metal::precise::rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:afb397b477745f12a44423934fa2b05ac" id="r_afb397b477745f12a44423934fa2b05ac"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac">metal::precise::rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:afb397b477745f12a44423934fa2b05ac"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a71acf77ffd29c56f56afae0195c98a1c" id="r_a71acf77ffd29c56f56afae0195c98a1c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c">metal::precise::sin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a71acf77ffd29c56f56afae0195c98a1c" id="r_a71acf77ffd29c56f56afae0195c98a1c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c">metal::precise::sin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a71acf77ffd29c56f56afae0195c98a1c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abc8f4f59dd6e7204ab5d84f0af96331c" id="r_abc8f4f59dd6e7204ab5d84f0af96331c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c">metal::precise::sinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:abc8f4f59dd6e7204ab5d84f0af96331c" id="r_abc8f4f59dd6e7204ab5d84f0af96331c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c">metal::precise::sinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:abc8f4f59dd6e7204ab5d84f0af96331c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a78b17dab93519d9c82c2575dafec49c9" id="r_a78b17dab93519d9c82c2575dafec49c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a78b17dab93519d9c82c2575dafec49c9">metal::precise::sinpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a78b17dab93519d9c82c2575dafec49c9" id="r_a78b17dab93519d9c82c2575dafec49c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a78b17dab93519d9c82c2575dafec49c9">metal::precise::sinpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a78b17dab93519d9c82c2575dafec49c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:acb213467361cd2cab93a8d5ea1aa5bfd" id="r_acb213467361cd2cab93a8d5ea1aa5bfd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd">metal::precise::sqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:acb213467361cd2cab93a8d5ea1aa5bfd" id="r_acb213467361cd2cab93a8d5ea1aa5bfd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd">metal::precise::sqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:acb213467361cd2cab93a8d5ea1aa5bfd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8261ed22f03122ef15b89512358acb1f" id="r_a8261ed22f03122ef15b89512358acb1f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8261ed22f03122ef15b89512358acb1f">metal::precise::tan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8261ed22f03122ef15b89512358acb1f" id="r_a8261ed22f03122ef15b89512358acb1f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8261ed22f03122ef15b89512358acb1f">metal::precise::tan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8261ed22f03122ef15b89512358acb1f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a741c27a10cc968dd1e63473d9fcd8f99" id="r_a741c27a10cc968dd1e63473d9fcd8f99"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a741c27a10cc968dd1e63473d9fcd8f99">metal::precise::tanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a741c27a10cc968dd1e63473d9fcd8f99" id="r_a741c27a10cc968dd1e63473d9fcd8f99"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a741c27a10cc968dd1e63473d9fcd8f99">metal::precise::tanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a741c27a10cc968dd1e63473d9fcd8f99"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8fae8c20deff43a8e855bba6f3ba20a5" id="r_a8fae8c20deff43a8e855bba6f3ba20a5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8fae8c20deff43a8e855bba6f3ba20a5">metal::precise::tanpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8fae8c20deff43a8e855bba6f3ba20a5" id="r_a8fae8c20deff43a8e855bba6f3ba20a5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a8fae8c20deff43a8e855bba6f3ba20a5">metal::precise::tanpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8fae8c20deff43a8e855bba6f3ba20a5"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a334183e7a2dd49b983d072d1e8ee2b27" id="r_a334183e7a2dd49b983d072d1e8ee2b27"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27">metal::precise::trunc</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a334183e7a2dd49b983d072d1e8ee2b27" id="r_a334183e7a2dd49b983d072d1e8ee2b27"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27">metal::precise::trunc</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a334183e7a2dd49b983d072d1e8ee2b27"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a498f1e85107eb5f01ba4435977f8efe0" id="r_a498f1e85107eb5f01ba4435977f8efe0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0">metal::simd_broadcast</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort broadcast_lane_id)</td></tr>
+<tr class="memitem:a498f1e85107eb5f01ba4435977f8efe0" id="r_a498f1e85107eb5f01ba4435977f8efe0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0">metal::simd_broadcast</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort broadcast_lane_id)</td></tr>
 <tr class="separator:a498f1e85107eb5f01ba4435977f8efe0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a259ed115bc3c58f88eb35830916b26d4" id="r_a259ed115bc3c58f88eb35830916b26d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort simd_lane_id)</td></tr>
+<tr class="memitem:a259ed115bc3c58f88eb35830916b26d4" id="r_a259ed115bc3c58f88eb35830916b26d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4">metal::simd_shuffle</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort simd_lane_id)</td></tr>
 <tr class="separator:a259ed115bc3c58f88eb35830916b26d4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae29a06f0eac636ad7af21dea5b04938b" id="r_ae29a06f0eac636ad7af21dea5b04938b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae29a06f0eac636ad7af21dea5b04938b">metal::simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
+<tr class="memitem:ae29a06f0eac636ad7af21dea5b04938b" id="r_ae29a06f0eac636ad7af21dea5b04938b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae29a06f0eac636ad7af21dea5b04938b">metal::simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
 <tr class="separator:ae29a06f0eac636ad7af21dea5b04938b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0ee6239fa29a5f9ee0201e0dc5ddc8e0" id="r_a0ee6239fa29a5f9ee0201e0dc5ddc8e0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a0ee6239fa29a5f9ee0201e0dc5ddc8e0">metal::simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
+<tr class="memitem:a0ee6239fa29a5f9ee0201e0dc5ddc8e0" id="r_a0ee6239fa29a5f9ee0201e0dc5ddc8e0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a0ee6239fa29a5f9ee0201e0dc5ddc8e0">metal::simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
 <tr class="separator:a0ee6239fa29a5f9ee0201e0dc5ddc8e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1ca14116bf50639b214d8414b5bbaaa6" id="r_a1ca14116bf50639b214d8414b5bbaaa6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
+<tr class="memitem:a1ca14116bf50639b214d8414b5bbaaa6" id="r_a1ca14116bf50639b214d8414b5bbaaa6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6">metal::simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
 <tr class="separator:a1ca14116bf50639b214d8414b5bbaaa6"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5138d5cdc18139e135707916a243cd8e" id="r_a5138d5cdc18139e135707916a243cd8e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5138d5cdc18139e135707916a243cd8e">metal::simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
+<tr class="memitem:a5138d5cdc18139e135707916a243cd8e" id="r_a5138d5cdc18139e135707916a243cd8e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5138d5cdc18139e135707916a243cd8e">metal::simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
 <tr class="separator:a5138d5cdc18139e135707916a243cd8e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af6e2dd7ae087aba6abac4f0350b7611c" id="r_af6e2dd7ae087aba6abac4f0350b7611c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="memitem:af6e2dd7ae087aba6abac4f0350b7611c" id="r_af6e2dd7ae087aba6abac4f0350b7611c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
 <tr class="separator:af6e2dd7ae087aba6abac4f0350b7611c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4bb203647a421032db47e73cd649841b" id="r_a4bb203647a421032db47e73cd649841b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a4bb203647a421032db47e73cd649841b">metal::simd_shuffle_rotate_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="memitem:a4bb203647a421032db47e73cd649841b" id="r_a4bb203647a421032db47e73cd649841b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a4bb203647a421032db47e73cd649841b">metal::simd_shuffle_rotate_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
 <tr class="separator:a4bb203647a421032db47e73cd649841b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a729b22077d6c944491a6027c18ea80c9" id="r_a729b22077d6c944491a6027c18ea80c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a729b22077d6c944491a6027c18ea80c9">metal::simd_shuffle_rotate_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="memitem:a729b22077d6c944491a6027c18ea80c9" id="r_a729b22077d6c944491a6027c18ea80c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a729b22077d6c944491a6027c18ea80c9">metal::simd_shuffle_rotate_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
 <tr class="separator:a729b22077d6c944491a6027c18ea80c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afe81c5fbde3f4890458b081909242c55" id="r_afe81c5fbde3f4890458b081909242c55"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="memitem:afe81c5fbde3f4890458b081909242c55" id="r_afe81c5fbde3f4890458b081909242c55"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#afe81c5fbde3f4890458b081909242c55">metal::simd_shuffle_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
 <tr class="separator:afe81c5fbde3f4890458b081909242c55"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5017efc9605e069cfb507137cd1a1852" id="r_a5017efc9605e069cfb507137cd1a1852"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5017efc9605e069cfb507137cd1a1852">metal::simd_shuffle_xor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort mask)</td></tr>
+<tr class="memitem:a5017efc9605e069cfb507137cd1a1852" id="r_a5017efc9605e069cfb507137cd1a1852"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5017efc9605e069cfb507137cd1a1852">metal::simd_shuffle_xor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort mask)</td></tr>
 <tr class="separator:a5017efc9605e069cfb507137cd1a1852"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a048cad0aca52cb737ebf103e76bd1c49" id="r_a048cad0aca52cb737ebf103e76bd1c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">metal::simd_max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:a048cad0aca52cb737ebf103e76bd1c49" id="r_a048cad0aca52cb737ebf103e76bd1c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">metal::simd_max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:a048cad0aca52cb737ebf103e76bd1c49"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae9e2a23e00724ba2d7868bc4112b386b" id="r_ae9e2a23e00724ba2d7868bc4112b386b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b">metal::simd_min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:ae9e2a23e00724ba2d7868bc4112b386b" id="r_ae9e2a23e00724ba2d7868bc4112b386b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b">metal::simd_min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:ae9e2a23e00724ba2d7868bc4112b386b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5ca40242390b632f737e29636829b2e4" id="r_a5ca40242390b632f737e29636829b2e4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5ca40242390b632f737e29636829b2e4">metal::simd_prefix_exclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:a5ca40242390b632f737e29636829b2e4" id="r_a5ca40242390b632f737e29636829b2e4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a5ca40242390b632f737e29636829b2e4">metal::simd_prefix_exclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:a5ca40242390b632f737e29636829b2e4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abfbb70c7471f28bf7ff36a612ad014b2" id="r_abfbb70c7471f28bf7ff36a612ad014b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#abfbb70c7471f28bf7ff36a612ad014b2">metal::simd_prefix_exclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:abfbb70c7471f28bf7ff36a612ad014b2" id="r_abfbb70c7471f28bf7ff36a612ad014b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#abfbb70c7471f28bf7ff36a612ad014b2">metal::simd_prefix_exclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:abfbb70c7471f28bf7ff36a612ad014b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6ca6a7e1996228fa536e969e9e45c446" id="r_a6ca6a7e1996228fa536e969e9e45c446"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6ca6a7e1996228fa536e969e9e45c446">metal::simd_prefix_inclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:a6ca6a7e1996228fa536e969e9e45c446" id="r_a6ca6a7e1996228fa536e969e9e45c446"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a6ca6a7e1996228fa536e969e9e45c446">metal::simd_prefix_inclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:a6ca6a7e1996228fa536e969e9e45c446"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a567acb18199ac0107712eb8cb8aeb8e9" id="r_a567acb18199ac0107712eb8cb8aeb8e9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a567acb18199ac0107712eb8cb8aeb8e9">metal::simd_prefix_inclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:a567acb18199ac0107712eb8cb8aeb8e9" id="r_a567acb18199ac0107712eb8cb8aeb8e9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a567acb18199ac0107712eb8cb8aeb8e9">metal::simd_prefix_inclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:a567acb18199ac0107712eb8cb8aeb8e9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac6e883a04e2265a9790d7db76059e1b4" id="r_ac6e883a04e2265a9790d7db76059e1b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ac6e883a04e2265a9790d7db76059e1b4">metal::simd_product</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:ac6e883a04e2265a9790d7db76059e1b4" id="r_ac6e883a04e2265a9790d7db76059e1b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#ac6e883a04e2265a9790d7db76059e1b4">metal::simd_product</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:ac6e883a04e2265a9790d7db76059e1b4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a85181e37a00cb4a4217f1bb25389bce5" id="r_a85181e37a00cb4a4217f1bb25389bce5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">metal::simd_sum</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:a85181e37a00cb4a4217f1bb25389bce5" id="r_a85181e37a00cb4a4217f1bb25389bce5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">metal::simd_sum</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:a85181e37a00cb4a4217f1bb25389bce5"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1308decbf2d5c33d34d6be523ea1c30f" id="r_a1308decbf2d5c33d34d6be523ea1c30f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1308decbf2d5c33d34d6be523ea1c30f">metal::simd_xor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="memitem:a1308decbf2d5c33d34d6be523ea1c30f" id="r_a1308decbf2d5c33d34d6be523ea1c30f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemetal.html#a1308decbf2d5c33d34d6be523ea1c30f">metal::simd_xor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
 <tr class="separator:a1308decbf2d5c33d34d6be523ea1c30f"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Macro Definition Documentation</h2>
-<a id="a51cfdd4502e755310f6f3456f039bea7" name="a51cfdd4502e755310f6f3456f039bea7"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a51cfdd4502e755310f6f3456f039bea7">&#9670;&#160;</a></span>bfloat16_to_uint16</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">#define bfloat16_to_uint16</td>
-          <td>(</td>
-          <td class="paramtype"></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
-          <td></td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-<b>Value:</b><div class="fragment"><div class="line">x.bits_</div>
-</div><!-- fragment -->
-</div>
-</div>
 <a id="a9e9f0fdd6e304522bc88acd22c576842" name="a9e9f0fdd6e304522bc88acd22c576842"></a>
 <h2 class="memtitle"><span class="permalink"><a href="#a9e9f0fdd6e304522bc88acd22c576842">&#9670;&#160;</a></span>instantiate_metal_math_funcs</h2>
 
@@ -580,26 +557,6 @@ Functions</h2></td></tr>
       </table>
 </div><div class="memdoc">
 
-</div>
-</div>
-<a id="a030d871474c0e7d907fccffcc8c047e0" name="a030d871474c0e7d907fccffcc8c047e0"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a030d871474c0e7d907fccffcc8c047e0">&#9670;&#160;</a></span>uint16_to_bfloat16</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">#define uint16_to_bfloat16</td>
-          <td>(</td>
-          <td class="paramtype"></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
-          <td></td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-<b>Value:</b><div class="fragment"><div class="line"><a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>(x, <a class="code hl_function" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a>())</div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_a91ccb774773b65f8d4c1aea3f1c6e1ca"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16::bits_to_bfloat</a></div><div class="ttdeci">static constexpr METAL_FUNC bits_to_bfloat_struct bits_to_bfloat()</div><div class="ttdef"><b>Definition</b> bf16.h:64</div></div>
-</div><!-- fragment -->
 </div>
 </div>
 </div><!-- contents -->
diff --git a/docs/build/html/bf16__math_8h_source.html b/docs/build/html/bf16__math_8h_source.html
index c7b8985d2..e060315d9 100644
--- a/docs/build/html/bf16__math_8h_source.html
+++ b/docs/build/html/bf16__math_8h_source.html
@@ -95,408 +95,395 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
 <div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
 <div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
-<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2bf16_8h.html">mlx/backend/metal/kernels/bf16.h</a>&quot;</span></div>
-<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
-<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="comment">// Metal math for bfloat16</span></div>
-<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span> </div>
-<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span><span class="comment">/*</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="comment">// Metal math for bfloat16</span></div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span> </div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="comment">/*</span></div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="comment"></span> </div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span><span class="comment">Following the Metal Shading Language Specification (Metal 3.1)</span></div>
 <div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><span class="comment"></span> </div>
-<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="comment">Following the Metal Shading Language Specification (Metal 3.1)</span></div>
-<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="comment"></span> </div>
-<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span><span class="comment">&quot;bfloat is an extended itypeing point type that only allows implicit conversion</span></div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="comment"> to a type of greater itypeing point rank. While bfloat can be implicitly</span></div>
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span><span class="comment"> converted to itype, it cannot be implicitly converted to half, and neither</span></div>
-<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span><span class="comment"> itype nor half can be implicitly converted to bfloat.&quot;</span></div>
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span><span class="comment"></span> </div>
-<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span><span class="comment">Further, as far as I can tell, the stdlib math/simd functions are not defined</span></div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span><span class="comment">for bfloat and calling with an argument of type bfloat will result in that</span></div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="comment">argument getting implicitly converted to itype which then returns an output</span></div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span><span class="comment">that is (likely) a itype which cannot be implicitly converted into a bfloat</span></div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span><span class="comment"></span> </div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span><span class="comment">This leads to situations where</span></div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span><span class="comment">bfloat a = 5.0bf;</span></div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span><span class="comment">bfloat b = metal::abs(a); // this will throw an error since abs return itype</span></div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span><span class="comment">bfloat c = static_cast&lt;bfloat&gt;(metal::abs(a)); // this is fine</span></div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span><span class="comment"></span> </div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span><span class="comment">For the moment, I will be adding overloaded instantiations of the math</span></div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span><span class="comment">functions to accordingly automatically handle the casting</span></div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span><span class="comment"></span> </div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span><span class="comment">*/</span></div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span> </div>
-<div class="foldopen" id="foldopen00035" data-start="" data-end="">
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">   35</a></span><span class="preprocessor">#define instantiate_metal_math_funcs(itype, otype, ctype, mfast)               \</span></div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span><span class="preprocessor">  METAL_FUNC otype abs(itype x) {                                              \</span></div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fabs(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span><span class="preprocessor">  METAL_FUNC otype acos(itype x) {                                             \</span></div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_acos(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span><span class="preprocessor">  METAL_FUNC otype acosh(itype x) {                                            \</span></div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_acosh(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span><span class="preprocessor">  METAL_FUNC otype asin(itype x) {                                             \</span></div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_asin(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span><span class="preprocessor">  METAL_FUNC otype asinh(itype x) {                                            \</span></div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_asinh(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span><span class="preprocessor">  METAL_FUNC otype atan(itype y_over_x) {                                      \</span></div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span><span class="preprocessor">        __metal_atan(static_cast&lt;ctype&gt;(y_over_x), mfast));                    \</span></div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span><span class="preprocessor">  METAL_FUNC otype atan2(itype y, itype x) {                                   \</span></div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span><span class="preprocessor">        __metal_atan2(static_cast&lt;ctype&gt;(y), static_cast&lt;ctype&gt;(x), mfast));   \</span></div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span><span class="preprocessor">  METAL_FUNC otype atanh(itype x) {                                            \</span></div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_atanh(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span><span class="preprocessor">  METAL_FUNC otype ceil(itype x) {                                             \</span></div>
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_ceil(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span><span class="preprocessor">  METAL_FUNC otype cos(itype x) {                                              \</span></div>
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_cos(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span><span class="preprocessor">  METAL_FUNC otype cosh(itype x) {                                             \</span></div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_cosh(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span><span class="preprocessor">  METAL_FUNC otype cospi(itype x) {                                            \</span></div>
-<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_cospi(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span><span class="preprocessor">  METAL_FUNC otype divide(itype x, itype y) {                                  \</span></div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span><span class="preprocessor">        __metal_divide(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));  \</span></div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span><span class="preprocessor">  METAL_FUNC otype exp(itype x) {                                              \</span></div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_exp(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span><span class="preprocessor">  METAL_FUNC otype exp10(itype x) {                                            \</span></div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_exp10(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span><span class="preprocessor">  METAL_FUNC otype exp2(itype x) {                                             \</span></div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_exp2(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span><span class="preprocessor">  METAL_FUNC otype fabs(itype x) {                                             \</span></div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fabs(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span><span class="preprocessor">  METAL_FUNC otype fdim(itype x, itype y) {                                    \</span></div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span><span class="preprocessor">    ctype t = static_cast&lt;ctype&gt;(x - y);                                       \</span></div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span><span class="preprocessor">    return static_cast&lt;otype&gt;(select(t, ctype(0), t &lt; ctype(0) || x == y));    \</span></div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span><span class="preprocessor">  METAL_FUNC otype floor(itype x) {                                            \</span></div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_floor(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span><span class="preprocessor">  METAL_FUNC otype fma(itype x, itype y, itype z) {                            \</span></div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fma(                                     \</span></div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), static_cast&lt;ctype&gt;(z))); \</span></div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span><span class="preprocessor">  METAL_FUNC otype fmax(itype x, itype y) {                                    \</span></div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span><span class="preprocessor">        __metal_fmax(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span><span class="preprocessor">  METAL_FUNC otype fmax3(itype x, itype y, itype z) {                          \</span></div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmax3(                                   \</span></div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span><span class="preprocessor">        mfast));                                                               \</span></div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span><span class="preprocessor">  METAL_FUNC otype fmedian3(itype x, itype y, itype z) {                       \</span></div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmedian3(                                \</span></div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span><span class="preprocessor">        mfast));                                                               \</span></div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span><span class="preprocessor">  METAL_FUNC otype fmin(itype x, itype y) {                                    \</span></div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span><span class="preprocessor">        __metal_fmin(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span><span class="preprocessor">  METAL_FUNC otype fmin3(itype x, itype y, itype z) {                          \</span></div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmin3(                                   \</span></div>
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span><span class="preprocessor">        mfast));                                                               \</span></div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span><span class="preprocessor">  METAL_FUNC otype fmod(itype x, itype y) {                                    \</span></div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span><span class="preprocessor">        __metal_fmod(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span><span class="preprocessor">  METAL_FUNC otype fract(itype x) {                                            \</span></div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fract(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span><span class="preprocessor">  METAL_FUNC otype frexp(itype x, thread int&amp; exp) {                           \</span></div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_frexp(static_cast&lt;ctype&gt;(x), &amp;exp));     \</span></div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span><span class="preprocessor">  METAL_FUNC otype ldexp(itype x, int k) {                                     \</span></div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_ldexp(static_cast&lt;ctype&gt;(x), k, mfast)); \</span></div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span><span class="preprocessor">  METAL_FUNC otype log(itype x) {                                              \</span></div>
-<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_log(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span><span class="preprocessor">  METAL_FUNC otype log10(itype x) {                                            \</span></div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_log10(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span><span class="preprocessor">  METAL_FUNC otype log2(itype x) {                                             \</span></div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_log2(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span><span class="preprocessor">  METAL_FUNC otype max(itype x, itype y) {                                     \</span></div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span><span class="preprocessor">        __metal_fmax(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span><span class="preprocessor">  METAL_FUNC otype max3(itype x, itype y, itype z) {                           \</span></div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmax3(                                   \</span></div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span><span class="preprocessor">        mfast));                                                               \</span></div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span><span class="preprocessor">  METAL_FUNC otype median3(itype x, itype y, itype z) {                        \</span></div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmedian3(                                \</span></div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span><span class="preprocessor">        mfast));                                                               \</span></div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span><span class="preprocessor">  METAL_FUNC otype min(itype x, itype y) {                                     \</span></div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span><span class="preprocessor">        __metal_fmin(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
-<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span><span class="preprocessor">  METAL_FUNC otype min3(itype x, itype y, itype z) {                           \</span></div>
-<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmin3(                                   \</span></div>
-<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
-<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span><span class="preprocessor">        mfast));                                                               \</span></div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span><span class="preprocessor">  METAL_FUNC otype nextafter(itype x, itype y) {                               \</span></div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span><span class="preprocessor">        __metal_nextafter(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y)));      \</span></div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span><span class="preprocessor">  METAL_FUNC otype pow(itype x, itype y) {                                     \</span></div>
-<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span><span class="preprocessor">        __metal_pow(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));     \</span></div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span><span class="preprocessor">  METAL_FUNC otype powr(itype x, itype y) {                                    \</span></div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span><span class="preprocessor">        __metal_powr(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
-<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span><span class="preprocessor">  METAL_FUNC otype rint(itype x) {                                             \</span></div>
-<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_rint(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span><span class="preprocessor">  METAL_FUNC otype round(itype x) {                                            \</span></div>
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_round(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span><span class="preprocessor">  METAL_FUNC otype rsqrt(itype x) {                                            \</span></div>
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_rsqrt(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span><span class="preprocessor">  METAL_FUNC otype sin(itype x) {                                              \</span></div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sin(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span><span class="preprocessor">  METAL_FUNC otype sinh(itype x) {                                             \</span></div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sinh(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span><span class="preprocessor">  METAL_FUNC otype sinpi(itype x) {                                            \</span></div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sinpi(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span><span class="preprocessor">  METAL_FUNC otype sqrt(itype x) {                                             \</span></div>
-<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sqrt(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span><span class="preprocessor">  METAL_FUNC otype tan(itype x) {                                              \</span></div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_tan(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span><span class="preprocessor">  METAL_FUNC otype tanh(itype x) {                                             \</span></div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_tanh(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span><span class="preprocessor">  METAL_FUNC otype tanpi(itype x) {                                            \</span></div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_tanpi(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span><span class="preprocessor">  METAL_FUNC otype trunc(itype x) {                                            \</span></div>
-<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_trunc(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span><span class="preprocessor">  }</span></div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="comment">&quot;bfloat is an extended floating point type that only allows implicit conversion</span></div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="comment"> to a type of greater floating point rank. While bfloat can be implicitly</span></div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span><span class="comment"> converted to float, it cannot be implicitly converted to half, and neither</span></div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="comment"> float nor half can be implicitly converted to bfloat.&quot;</span></div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span><span class="comment"></span> </div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span><span class="comment">Further, as far as I can tell, the stdlib math/simd functions are not defined</span></div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span><span class="comment">for bfloat and calling with an argument of type bfloat will result in that</span></div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span><span class="comment">argument getting implicitly converted to float which then returns an output</span></div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span><span class="comment">that is (likely) a float which cannot be implicitly converted into a bfloat</span></div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="comment"></span> </div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span><span class="comment">This leads to situations where</span></div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span><span class="comment">bfloat a = 5.0bf;</span></div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span><span class="comment">bfloat b = metal::abs(a); // this will throw an error since abs returns float</span></div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span><span class="comment">bfloat c = static_cast&lt;bfloat&gt;(metal::abs(a)); // this is fine</span></div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span><span class="comment"></span> </div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span><span class="comment">For the moment, I will be adding overloaded instantiations of the math</span></div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span><span class="comment">functions to accordingly automatically handle the casting</span></div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span><span class="comment"></span> </div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span><span class="comment">*/</span></div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span> </div>
+<div class="foldopen" id="foldopen00033" data-start="" data-end="">
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">   33</a></span><span class="preprocessor">#define instantiate_metal_math_funcs(itype, otype, ctype, mfast)               \</span></div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span><span class="preprocessor">  METAL_FUNC otype abs(itype x) {                                              \</span></div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fabs(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="preprocessor">  METAL_FUNC otype acos(itype x) {                                             \</span></div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_acos(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span><span class="preprocessor">  METAL_FUNC otype acosh(itype x) {                                            \</span></div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_acosh(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span><span class="preprocessor">  METAL_FUNC otype asin(itype x) {                                             \</span></div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_asin(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span><span class="preprocessor">  METAL_FUNC otype asinh(itype x) {                                            \</span></div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_asinh(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span><span class="preprocessor">  METAL_FUNC otype atan(itype y_over_x) {                                      \</span></div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span><span class="preprocessor">        __metal_atan(static_cast&lt;ctype&gt;(y_over_x), mfast));                    \</span></div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span><span class="preprocessor">  METAL_FUNC otype atan2(itype y, itype x) {                                   \</span></div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span><span class="preprocessor">        __metal_atan2(static_cast&lt;ctype&gt;(y), static_cast&lt;ctype&gt;(x), mfast));   \</span></div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span><span class="preprocessor">  METAL_FUNC otype atanh(itype x) {                                            \</span></div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_atanh(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span><span class="preprocessor">  METAL_FUNC otype ceil(itype x) {                                             \</span></div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_ceil(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span><span class="preprocessor">  METAL_FUNC otype cos(itype x) {                                              \</span></div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_cos(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span><span class="preprocessor">  METAL_FUNC otype cosh(itype x) {                                             \</span></div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_cosh(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span><span class="preprocessor">  METAL_FUNC otype cospi(itype x) {                                            \</span></div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_cospi(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span><span class="preprocessor">  METAL_FUNC otype divide(itype x, itype y) {                                  \</span></div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span><span class="preprocessor">        __metal_divide(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));  \</span></div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span><span class="preprocessor">  METAL_FUNC otype exp(itype x) {                                              \</span></div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_exp(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span><span class="preprocessor">  METAL_FUNC otype exp10(itype x) {                                            \</span></div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_exp10(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span><span class="preprocessor">  METAL_FUNC otype exp2(itype x) {                                             \</span></div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_exp2(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span><span class="preprocessor">  METAL_FUNC otype fabs(itype x) {                                             \</span></div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fabs(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span><span class="preprocessor">  METAL_FUNC otype fdim(itype x, itype y) {                                    \</span></div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span><span class="preprocessor">    ctype t = static_cast&lt;ctype&gt;(x - y);                                       \</span></div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span><span class="preprocessor">    return static_cast&lt;otype&gt;(select(t, ctype(0), t &lt; ctype(0) || x == y));    \</span></div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span><span class="preprocessor">  METAL_FUNC otype floor(itype x) {                                            \</span></div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_floor(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span><span class="preprocessor">  METAL_FUNC otype fma(itype x, itype y, itype z) {                            \</span></div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fma(                                     \</span></div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), static_cast&lt;ctype&gt;(z))); \</span></div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span><span class="preprocessor">  METAL_FUNC otype fmax(itype x, itype y) {                                    \</span></div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span><span class="preprocessor">        __metal_fmax(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span><span class="preprocessor">  METAL_FUNC otype fmax3(itype x, itype y, itype z) {                          \</span></div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmax3(                                   \</span></div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span><span class="preprocessor">        mfast));                                                               \</span></div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span><span class="preprocessor">  METAL_FUNC otype fmedian3(itype x, itype y, itype z) {                       \</span></div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmedian3(                                \</span></div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span><span class="preprocessor">        mfast));                                                               \</span></div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span><span class="preprocessor">  METAL_FUNC otype fmin(itype x, itype y) {                                    \</span></div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span><span class="preprocessor">        __metal_fmin(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span><span class="preprocessor">  METAL_FUNC otype fmin3(itype x, itype y, itype z) {                          \</span></div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmin3(                                   \</span></div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span><span class="preprocessor">        mfast));                                                               \</span></div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span><span class="preprocessor">  METAL_FUNC otype fmod(itype x, itype y) {                                    \</span></div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span><span class="preprocessor">        __metal_fmod(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span><span class="preprocessor">  METAL_FUNC otype fract(itype x) {                                            \</span></div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fract(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span><span class="preprocessor">  METAL_FUNC otype frexp(itype x, thread int&amp; exp) {                           \</span></div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_frexp(static_cast&lt;ctype&gt;(x), &amp;exp));     \</span></div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span><span class="preprocessor">  METAL_FUNC otype ldexp(itype x, int k) {                                     \</span></div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_ldexp(static_cast&lt;ctype&gt;(x), k, mfast)); \</span></div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span><span class="preprocessor">  METAL_FUNC otype log(itype x) {                                              \</span></div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_log(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span><span class="preprocessor">  METAL_FUNC otype log10(itype x) {                                            \</span></div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_log10(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span><span class="preprocessor">  METAL_FUNC otype log2(itype x) {                                             \</span></div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_log2(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span><span class="preprocessor">  METAL_FUNC otype max(itype x, itype y) {                                     \</span></div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span><span class="preprocessor">        __metal_fmax(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span><span class="preprocessor">  METAL_FUNC otype max3(itype x, itype y, itype z) {                           \</span></div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmax3(                                   \</span></div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span><span class="preprocessor">        mfast));                                                               \</span></div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span><span class="preprocessor">  METAL_FUNC otype median3(itype x, itype y, itype z) {                        \</span></div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmedian3(                                \</span></div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span><span class="preprocessor">        mfast));                                                               \</span></div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span><span class="preprocessor">  METAL_FUNC otype min(itype x, itype y) {                                     \</span></div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span><span class="preprocessor">        __metal_fmin(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span><span class="preprocessor">  METAL_FUNC otype min3(itype x, itype y, itype z) {                           \</span></div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_fmin3(                                   \</span></div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span><span class="preprocessor">        static_cast&lt;ctype&gt;(x),                                                 \</span></div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span><span class="preprocessor">        static_cast&lt;ctype&gt;(y),                                                 \</span></div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span><span class="preprocessor">        static_cast&lt;ctype&gt;(z),                                                 \</span></div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span><span class="preprocessor">        mfast));                                                               \</span></div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span><span class="preprocessor">  METAL_FUNC otype nextafter(itype x, itype y) {                               \</span></div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span><span class="preprocessor">        __metal_nextafter(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y)));      \</span></div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span><span class="preprocessor">  METAL_FUNC otype pow(itype x, itype y) {                                     \</span></div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span><span class="preprocessor">        __metal_pow(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));     \</span></div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span><span class="preprocessor">  METAL_FUNC otype powr(itype x, itype y) {                                    \</span></div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span><span class="preprocessor">        __metal_powr(static_cast&lt;ctype&gt;(x), static_cast&lt;ctype&gt;(y), mfast));    \</span></div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span><span class="preprocessor">  METAL_FUNC otype rint(itype x) {                                             \</span></div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_rint(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span><span class="preprocessor">  METAL_FUNC otype round(itype x) {                                            \</span></div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_round(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span><span class="preprocessor">  METAL_FUNC otype rsqrt(itype x) {                                            \</span></div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_rsqrt(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span><span class="preprocessor">  METAL_FUNC otype sin(itype x) {                                              \</span></div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sin(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span><span class="preprocessor">  METAL_FUNC otype sinh(itype x) {                                             \</span></div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sinh(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span><span class="preprocessor">  METAL_FUNC otype sinpi(itype x) {                                            \</span></div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sinpi(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span><span class="preprocessor">  METAL_FUNC otype sqrt(itype x) {                                             \</span></div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_sqrt(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span><span class="preprocessor">  METAL_FUNC otype tan(itype x) {                                              \</span></div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_tan(static_cast&lt;ctype&gt;(x), mfast));      \</span></div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span><span class="preprocessor">  METAL_FUNC otype tanh(itype x) {                                             \</span></div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_tanh(static_cast&lt;ctype&gt;(x), mfast));     \</span></div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span><span class="preprocessor">  METAL_FUNC otype tanpi(itype x) {                                            \</span></div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_tanpi(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span><span class="preprocessor">  METAL_FUNC otype trunc(itype x) {                                            \</span></div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_trunc(static_cast&lt;ctype&gt;(x), mfast));    \</span></div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span><span class="preprocessor">  }</span></div>
 </div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span> </div>
+<div class="foldopen" id="foldopen00226" data-start="{" data-end="}">
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno"><a class="line" href="namespacemetal.html">  226</a></span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a> {</div>
 <div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span> </div>
-<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a> {</div>
-<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span> </div>
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span><a class="code hl_define" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a>(</div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>    <span class="keywordtype">float</span>,</div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno"><a class="line" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">  234</a></span>    __METAL_MAYBE_FAST_MATH__);</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span><a class="code hl_define" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a>(</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>    <span class="keywordtype">float</span>,</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno"><a class="line" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">  232</a></span>    __METAL_MAYBE_FAST_MATH__);</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span> </div>
+<div class="foldopen" id="foldopen00234" data-start="{" data-end="}">
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno"><a class="line" href="namespacemetal_1_1fast.html">  234</a></span><span class="keyword">namespace </span>fast {</div>
 <div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span> </div>
-<div class="foldopen" id="foldopen00236" data-start="{" data-end="}">
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno"><a class="line" href="namespacemetal_1_1fast.html">  236</a></span><span class="keyword">namespace </span>fast {</div>
-<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span> </div>
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span><a class="code hl_define" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a>(</div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>    <span class="keywordtype">float</span>,</div>
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno"><a class="line" href="namespacemetal_1_1fast.html#a90d2973f71f83180e7f02e38d11c7a8f">  242</a></span>    __METAL_FAST_MATH__);</div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span> </div>
-<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>} <span class="comment">// namespace fast</span></div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span><a class="code hl_define" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a>(</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    <span class="keywordtype">float</span>,</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno"><a class="line" href="namespacemetal_1_1fast.html#a90d2973f71f83180e7f02e38d11c7a8f">  240</a></span>    __METAL_FAST_MATH__);</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span> </div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>} <span class="comment">// namespace fast</span></div>
 </div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span> </div>
+<div class="foldopen" id="foldopen00244" data-start="{" data-end="}">
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno"><a class="line" href="namespacemetal_1_1precise.html">  244</a></span><span class="keyword">namespace </span>precise {</div>
 <div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span> </div>
-<div class="foldopen" id="foldopen00246" data-start="{" data-end="}">
-<div class="line"><a id="l00246" name="l00246"></a><span class="lineno"><a class="line" href="namespacemetal_1_1precise.html">  246</a></span><span class="keyword">namespace </span>precise {</div>
-<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span> </div>
-<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span><a class="code hl_define" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a>(</div>
-<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>    <span class="keywordtype">float</span>,</div>
-<div class="line"><a id="l00252" name="l00252"></a><span class="lineno"><a class="line" href="namespacemetal_1_1precise.html#a99f2b2746e813b9ca7b4249afbaf2a14">  252</a></span>    __METAL_PRECISE_MATH__);</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span><a class="code hl_define" href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a>(</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    <span class="keywordtype">float</span>,</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno"><a class="line" href="namespacemetal_1_1precise.html#a99f2b2746e813b9ca7b4249afbaf2a14">  250</a></span>    __METAL_PRECISE_MATH__);</div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span> </div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>} <span class="comment">// namespace precise</span></div>
+</div>
 <div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span> </div>
-<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>} <span class="comment">// namespace precise</span></div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>} <span class="comment">// namespace metal</span></div>
 </div>
 <div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span> </div>
-<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>} <span class="comment">// namespace metal</span></div>
-<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span> </div>
-<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span><span class="comment">// Metal simd for bfloat16</span></div>
-<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span> </div>
-<div class="line"><a id="l00262" name="l00262"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#aecc11cb898846d01bfc9faa109fcf791">  262</a></span><span class="preprocessor">#define instantiate_metal_simd_comm_funcs(                                   \</span></div>
-<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span><span class="preprocessor">    itype, otype, ctype, itype_to_ctype, ctype_to_otype)                     \</span></div>
-<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span><span class="preprocessor">  METAL_FUNC otype simd_broadcast(itype data, ushort broadcast_lane_id) {    \</span></div>
-<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
-<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span><span class="preprocessor">        __metal_simd_broadcast(itype_to_ctype(data), broadcast_lane_id));    \</span></div>
-<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle(itype data, ushort simd_lane_id) {           \</span></div>
-<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
-<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span><span class="preprocessor">        __metal_simd_shuffle(itype_to_ctype(data), simd_lane_id));           \</span></div>
-<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_down(                               \</span></div>
-<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span><span class="preprocessor">      itype data, itype filling_data, ushort delta, ushort modulo) {         \</span></div>
-<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_down(                \</span></div>
-<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span><span class="preprocessor">        itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \</span></div>
-<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_down(                               \</span></div>
-<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span><span class="preprocessor">      itype data, itype filling_data, ushort delta) {                        \</span></div>
-<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_down(                \</span></div>
-<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span><span class="preprocessor">        itype_to_ctype(data),                                                \</span></div>
-<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span><span class="preprocessor">        itype_to_ctype(filling_data),                                        \</span></div>
-<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span><span class="preprocessor">        delta,                                                               \</span></div>
-<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span><span class="preprocessor">        __metal_get_simdgroup_size(ushort())));                              \</span></div>
-<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_up(                                 \</span></div>
-<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span><span class="preprocessor">      itype data, itype filling_data, ushort delta, ushort modulo) {         \</span></div>
-<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_up(                  \</span></div>
-<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span><span class="preprocessor">        itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \</span></div>
-<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_up(                                 \</span></div>
-<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span><span class="preprocessor">      itype data, itype filling_data, ushort delta) {                        \</span></div>
-<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_up(                  \</span></div>
-<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span><span class="preprocessor">        itype_to_ctype(data),                                                \</span></div>
-<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span><span class="preprocessor">        itype_to_ctype(filling_data),                                        \</span></div>
-<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span><span class="preprocessor">        delta,                                                               \</span></div>
-<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span><span class="preprocessor">        __metal_get_simdgroup_size(ushort())));                              \</span></div>
-<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_down(itype data, ushort delta) {             \</span></div>
-<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
-<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span><span class="preprocessor">        __metal_simd_shuffle_down(itype_to_ctype(data), delta));             \</span></div>
-<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_rotate_down(itype data, ushort delta) {      \</span></div>
-<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
-<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span><span class="preprocessor">        __metal_simd_shuffle_rotate_down(itype_to_ctype(data), delta));      \</span></div>
-<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_rotate_up(itype data, ushort delta) {        \</span></div>
-<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
-<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span><span class="preprocessor">        __metal_simd_shuffle_rotate_up(itype_to_ctype(data), delta));        \</span></div>
-<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_up(itype data, ushort delta) {               \</span></div>
-<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
-<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span><span class="preprocessor">        __metal_simd_shuffle_up(itype_to_ctype(data), delta));               \</span></div>
-<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span><span class="preprocessor">  }                                                                          \</span></div>
-<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span><span class="preprocessor">                                                                             \</span></div>
-<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_xor(itype data, ushort mask) {               \</span></div>
-<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
-<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span><span class="preprocessor">        __metal_simd_shuffle_xor(itype_to_ctype(data), mask));               \</span></div>
-<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span><span class="preprocessor">  }</span></div>
-<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span> </div>
-<div class="foldopen" id="foldopen00330" data-start="" data-end="">
-<div class="line"><a id="l00330" name="l00330"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#a51688bc24fc9292aaec5f54a58eaa2d0">  330</a></span><span class="preprocessor">#define instantiate_metal_simd_reduction_funcs(itype, otype, ctype)            \</span></div>
-<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span><span class="preprocessor">  METAL_FUNC otype simd_max(itype data) {                                      \</span></div>
-<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_max(static_cast&lt;ctype&gt;(data)));     \</span></div>
-<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span><span class="preprocessor">  METAL_FUNC otype simd_min(itype data) {                                      \</span></div>
-<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_min(static_cast&lt;ctype&gt;(data)));     \</span></div>
-<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_exclusive_product(itype data) {                 \</span></div>
-<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span><span class="preprocessor">        __metal_simd_prefix_exclusive_product(static_cast&lt;ctype&gt;(data)));      \</span></div>
-<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_exclusive_sum(itype data) {                     \</span></div>
-<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span><span class="preprocessor">        __metal_simd_prefix_exclusive_sum(static_cast&lt;ctype&gt;(data)));          \</span></div>
-<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_inclusive_product(itype data) {                 \</span></div>
-<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span><span class="preprocessor">        __metal_simd_prefix_inclusive_product(static_cast&lt;ctype&gt;(data)));      \</span></div>
-<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_inclusive_sum(itype data) {                     \</span></div>
-<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
-<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span><span class="preprocessor">        __metal_simd_prefix_inclusive_sum(static_cast&lt;ctype&gt;(data)));          \</span></div>
-<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span><span class="preprocessor">  METAL_FUNC otype simd_product(itype data) {                                  \</span></div>
-<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_product(static_cast&lt;ctype&gt;(data))); \</span></div>
-<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span><span class="preprocessor">  METAL_FUNC otype simd_sum(itype data) {                                      \</span></div>
-<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_sum(static_cast&lt;ctype&gt;(data)));     \</span></div>
-<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span><span class="preprocessor">  }                                                                            \</span></div>
-<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span><span class="preprocessor">                                                                               \</span></div>
-<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span><span class="preprocessor">  METAL_FUNC otype simd_xor(itype data) {                                      \</span></div>
-<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_xor(static_cast&lt;ctype&gt;(data)));     \</span></div>
-<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span><span class="preprocessor">  }</span></div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span><span class="comment">// Metal simd for bfloat16</span></div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span> </div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#aecc11cb898846d01bfc9faa109fcf791">  260</a></span><span class="preprocessor">#define instantiate_metal_simd_comm_funcs(                                   \</span></div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span><span class="preprocessor">    itype, otype, ctype, itype_to_ctype, ctype_to_otype)                     \</span></div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span><span class="preprocessor">  METAL_FUNC otype simd_broadcast(itype data, ushort broadcast_lane_id) {    \</span></div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span><span class="preprocessor">        __metal_simd_broadcast(itype_to_ctype(data), broadcast_lane_id));    \</span></div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle(itype data, ushort simd_lane_id) {           \</span></div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span><span class="preprocessor">        __metal_simd_shuffle(itype_to_ctype(data), simd_lane_id));           \</span></div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_down(                               \</span></div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span><span class="preprocessor">      itype data, itype filling_data, ushort delta, ushort modulo) {         \</span></div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_down(                \</span></div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span><span class="preprocessor">        itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \</span></div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_down(                               \</span></div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span><span class="preprocessor">      itype data, itype filling_data, ushort delta) {                        \</span></div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_down(                \</span></div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span><span class="preprocessor">        itype_to_ctype(data),                                                \</span></div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span><span class="preprocessor">        itype_to_ctype(filling_data),                                        \</span></div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span><span class="preprocessor">        delta,                                                               \</span></div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span><span class="preprocessor">        __metal_get_simdgroup_size(ushort())));                              \</span></div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_up(                                 \</span></div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span><span class="preprocessor">      itype data, itype filling_data, ushort delta, ushort modulo) {         \</span></div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_up(                  \</span></div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span><span class="preprocessor">        itype_to_ctype(data), itype_to_ctype(filling_data), delta, modulo)); \</span></div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_and_fill_up(                                 \</span></div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span><span class="preprocessor">      itype data, itype filling_data, ushort delta) {                        \</span></div>
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span><span class="preprocessor">    return ctype_to_otype(__metal_simd_shuffle_and_fill_up(                  \</span></div>
+<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span><span class="preprocessor">        itype_to_ctype(data),                                                \</span></div>
+<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span><span class="preprocessor">        itype_to_ctype(filling_data),                                        \</span></div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span><span class="preprocessor">        delta,                                                               \</span></div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span><span class="preprocessor">        __metal_get_simdgroup_size(ushort())));                              \</span></div>
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_down(itype data, ushort delta) {             \</span></div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span><span class="preprocessor">        __metal_simd_shuffle_down(itype_to_ctype(data), delta));             \</span></div>
+<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_rotate_down(itype data, ushort delta) {      \</span></div>
+<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span><span class="preprocessor">        __metal_simd_shuffle_rotate_down(itype_to_ctype(data), delta));      \</span></div>
+<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_rotate_up(itype data, ushort delta) {        \</span></div>
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
+<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span><span class="preprocessor">        __metal_simd_shuffle_rotate_up(itype_to_ctype(data), delta));        \</span></div>
+<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_up(itype data, ushort delta) {               \</span></div>
+<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
+<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span><span class="preprocessor">        __metal_simd_shuffle_up(itype_to_ctype(data), delta));               \</span></div>
+<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span><span class="preprocessor">  }                                                                          \</span></div>
+<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span><span class="preprocessor">                                                                             \</span></div>
+<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span><span class="preprocessor">  METAL_FUNC otype simd_shuffle_xor(itype data, ushort mask) {               \</span></div>
+<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span><span class="preprocessor">    return ctype_to_otype(                                                   \</span></div>
+<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span><span class="preprocessor">        __metal_simd_shuffle_xor(itype_to_ctype(data), mask));               \</span></div>
+<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span><span class="preprocessor">  }</span></div>
+<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span> </div>
+<div class="foldopen" id="foldopen00328" data-start="" data-end="">
+<div class="line"><a id="l00328" name="l00328"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#a51688bc24fc9292aaec5f54a58eaa2d0">  328</a></span><span class="preprocessor">#define instantiate_metal_simd_reduction_funcs(itype, otype, ctype)            \</span></div>
+<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span><span class="preprocessor">  METAL_FUNC otype simd_max(itype data) {                                      \</span></div>
+<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_max(static_cast&lt;ctype&gt;(data)));     \</span></div>
+<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span><span class="preprocessor">  METAL_FUNC otype simd_min(itype data) {                                      \</span></div>
+<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_min(static_cast&lt;ctype&gt;(data)));     \</span></div>
+<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_exclusive_product(itype data) {                 \</span></div>
+<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span><span class="preprocessor">        __metal_simd_prefix_exclusive_product(static_cast&lt;ctype&gt;(data)));      \</span></div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_exclusive_sum(itype data) {                     \</span></div>
+<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span><span class="preprocessor">        __metal_simd_prefix_exclusive_sum(static_cast&lt;ctype&gt;(data)));          \</span></div>
+<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_inclusive_product(itype data) {                 \</span></div>
+<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span><span class="preprocessor">        __metal_simd_prefix_inclusive_product(static_cast&lt;ctype&gt;(data)));      \</span></div>
+<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span><span class="preprocessor">  METAL_FUNC otype simd_prefix_inclusive_sum(itype data) {                     \</span></div>
+<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span><span class="preprocessor">    return static_cast&lt;otype&gt;(                                                 \</span></div>
+<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span><span class="preprocessor">        __metal_simd_prefix_inclusive_sum(static_cast&lt;ctype&gt;(data)));          \</span></div>
+<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span><span class="preprocessor">  METAL_FUNC otype simd_product(itype data) {                                  \</span></div>
+<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_product(static_cast&lt;ctype&gt;(data))); \</span></div>
+<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span><span class="preprocessor">  METAL_FUNC otype simd_sum(itype data) {                                      \</span></div>
+<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_sum(static_cast&lt;ctype&gt;(data)));     \</span></div>
+<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span><span class="preprocessor">  }                                                                            \</span></div>
+<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span><span class="preprocessor">                                                                               \</span></div>
+<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span><span class="preprocessor">  METAL_FUNC otype simd_xor(itype data) {                                      \</span></div>
+<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span><span class="preprocessor">    return static_cast&lt;otype&gt;(__metal_simd_xor(static_cast&lt;ctype&gt;(data)));     \</span></div>
+<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span><span class="preprocessor">  }</span></div>
 </div>
+<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span> </div>
+<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a> {</div>
 <div class="line"><a id="l00371" name="l00371"></a><span class="lineno">  371</span> </div>
-<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span><span class="preprocessor">#if (MLX_METAL_VERSION &gt;= 310) || (__METAL_VERSION__ &gt;= 310)</span></div>
-<div class="line"><a id="l00373" name="l00373"></a><span class="lineno">  373</span> </div>
-<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span><span class="preprocessor">#define bfloat16_to_uint16(x) as_type&lt;uint16_t&gt;(x)</span></div>
-<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span><span class="preprocessor">#define uint16_to_bfloat16(x) as_type&lt;bfloat16_t&gt;(x)</span></div>
-<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span> </div>
-<div class="line"><a id="l00377" name="l00377"></a><span class="lineno">  377</span><span class="preprocessor">#else</span></div>
-<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span> </div>
-<div class="line"><a id="l00379" name="l00379"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7">  379</a></span><span class="preprocessor">#define bfloat16_to_uint16(x) x.bits_</span></div>
-<div class="line"><a id="l00380" name="l00380"></a><span class="lineno"><a class="line" href="bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0">  380</a></span><span class="preprocessor">#define uint16_to_bfloat16(x) _MLX_BFloat16(x, _MLX_BFloat16::bits_to_bfloat())</span></div>
-<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span> </div>
-<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span><span class="preprocessor">#endif</span></div>
-<div class="line"><a id="l00383" name="l00383"></a><span class="lineno">  383</span> </div>
-<div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a> {</div>
-<div class="line"><a id="l00385" name="l00385"></a><span class="lineno">  385</span> </div>
-<div class="line"><a id="l00386" name="l00386"></a><span class="lineno">  386</span><a class="code hl_define" href="bf16__math_8h.html#aecc11cb898846d01bfc9faa109fcf791">instantiate_metal_simd_comm_funcs</a>(</div>
-<div class="line"><a id="l00387" name="l00387"></a><span class="lineno">  387</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00388" name="l00388"></a><span class="lineno">  388</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
-<div class="line"><a id="l00389" name="l00389"></a><span class="lineno">  389</span>    uint16_t,</div>
-<div class="line"><a id="l00390" name="l00390"></a><span class="lineno">  390</span>    <a class="code hl_define" href="bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7">bfloat16_to_uint16</a>,</div>
-<div class="line"><a id="l00391" name="l00391"></a><span class="lineno"><a class="line" href="namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0">  391</a></span>    <a class="code hl_define" href="bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0">uint16_to_bfloat16</a>);</div>
-<div class="line"><a id="l00392" name="l00392"></a><span class="lineno"><a class="line" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">  392</a></span><a class="code hl_define" href="bf16__math_8h.html#a51688bc24fc9292aaec5f54a58eaa2d0">instantiate_metal_simd_reduction_funcs</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>, <span class="keywordtype">float</span>);</div>
-<div class="line"><a id="l00393" name="l00393"></a><span class="lineno">  393</span> </div>
-<div class="line"><a id="l00394" name="l00394"></a><span class="lineno">  394</span>} <span class="comment">// namespace metal</span></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html">bf16.h</a></div></div>
-<div class="ttc" id="abf16__math_8h_html_a030d871474c0e7d907fccffcc8c047e0"><div class="ttname"><a href="bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0">uint16_to_bfloat16</a></div><div class="ttdeci">#define uint16_to_bfloat16(x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:380</div></div>
-<div class="ttc" id="abf16__math_8h_html_a51688bc24fc9292aaec5f54a58eaa2d0"><div class="ttname"><a href="bf16__math_8h.html#a51688bc24fc9292aaec5f54a58eaa2d0">instantiate_metal_simd_reduction_funcs</a></div><div class="ttdeci">#define instantiate_metal_simd_reduction_funcs(itype, otype, ctype)</div><div class="ttdef"><b>Definition</b> bf16_math.h:330</div></div>
-<div class="ttc" id="abf16__math_8h_html_a51cfdd4502e755310f6f3456f039bea7"><div class="ttname"><a href="bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7">bfloat16_to_uint16</a></div><div class="ttdeci">#define bfloat16_to_uint16(x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:379</div></div>
-<div class="ttc" id="abf16__math_8h_html_a9e9f0fdd6e304522bc88acd22c576842"><div class="ttname"><a href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a></div><div class="ttdeci">#define instantiate_metal_math_funcs(itype, otype, ctype, mfast)</div><div class="ttdef"><b>Definition</b> bf16_math.h:35</div></div>
-<div class="ttc" id="abf16__math_8h_html_aecc11cb898846d01bfc9faa109fcf791"><div class="ttname"><a href="bf16__math_8h.html#aecc11cb898846d01bfc9faa109fcf791">instantiate_metal_simd_comm_funcs</a></div><div class="ttdeci">#define instantiate_metal_simd_comm_funcs( itype, otype, ctype, itype_to_ctype, ctype_to_otype)</div><div class="ttdef"><b>Definition</b> bf16_math.h:262</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span><a class="code hl_define" href="bf16__math_8h.html#aecc11cb898846d01bfc9faa109fcf791">instantiate_metal_simd_comm_funcs</a>(</div>
+<div class="line"><a id="l00373" name="l00373"></a><span class="lineno">  373</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span>    <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>,</div>
+<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span>    uint16_t,</div>
+<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span>    <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a>,</div>
+<div class="line"><a id="l00377" name="l00377"></a><span class="lineno"><a class="line" href="namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0">  377</a></span>    <a class="code hl_function" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a>);</div>
+<div class="line"><a id="l00378" name="l00378"></a><span class="lineno"><a class="line" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">  378</a></span><a class="code hl_define" href="bf16__math_8h.html#a51688bc24fc9292aaec5f54a58eaa2d0">instantiate_metal_simd_reduction_funcs</a>(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>, <span class="keywordtype">float</span>);</div>
+<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span> </div>
+<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>} <span class="comment">// namespace metal</span></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a1420e191fa60d707dce327d0938e3088"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bfloat16_to_uint16</a></div><div class="ttdeci">uint16_t bfloat16_to_uint16(const bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:308</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a8d066e48cf3e2a0583c71816fa40f7f4"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">uint16_to_bfloat16</a></div><div class="ttdeci">bfloat16_t uint16_to_bfloat16(const uint16_t x)</div><div class="ttdef"><b>Definition</b> bf16.h:312</div></div>
+<div class="ttc" id="abf16__math_8h_html_a51688bc24fc9292aaec5f54a58eaa2d0"><div class="ttname"><a href="bf16__math_8h.html#a51688bc24fc9292aaec5f54a58eaa2d0">instantiate_metal_simd_reduction_funcs</a></div><div class="ttdeci">#define instantiate_metal_simd_reduction_funcs(itype, otype, ctype)</div><div class="ttdef"><b>Definition</b> bf16_math.h:328</div></div>
+<div class="ttc" id="abf16__math_8h_html_a9e9f0fdd6e304522bc88acd22c576842"><div class="ttname"><a href="bf16__math_8h.html#a9e9f0fdd6e304522bc88acd22c576842">instantiate_metal_math_funcs</a></div><div class="ttdeci">#define instantiate_metal_math_funcs(itype, otype, ctype, mfast)</div><div class="ttdef"><b>Definition</b> bf16_math.h:33</div></div>
+<div class="ttc" id="abf16__math_8h_html_aecc11cb898846d01bfc9faa109fcf791"><div class="ttname"><a href="bf16__math_8h.html#aecc11cb898846d01bfc9faa109fcf791">instantiate_metal_simd_comm_funcs</a></div><div class="ttdeci">#define instantiate_metal_simd_comm_funcs( itype, otype, ctype, itype_to_ctype, ctype_to_otype)</div><div class="ttdef"><b>Definition</b> bf16_math.h:260</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/binary__ops_8h_source.html b/docs/build/html/binary__ops_8h_source.html
index 7731805a3..5216c8201 100644
--- a/docs/build/html/binary__ops_8h_source.html
+++ b/docs/build/html/binary__ops_8h_source.html
@@ -519,18 +519,18 @@ $(function(){ initResizable(false); });
 </div>
 <div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>};</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a27c03f2f90ab56db2e4d59559a3d2e9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a></div><div class="ttdeci">float log1p(float x)</div><div class="ttdef"><b>Definition</b> utils.h:277</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a6f161b049cc6884f87b09b33c2d1cd7f"><div class="ttname"><a href="namespacemetal_1_1precise.html#a6f161b049cc6884f87b09b33c2d1cd7f">metal::precise::atan2</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atan2(bfloat16_t y, bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_html_a1d430793eaa38ccf0d07145e3fcd1e61"><div class="ttname"><a href="namespacemetal.html#a1d430793eaa38ccf0d07145e3fcd1e61">metal::atan2</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atan2(bfloat16_t y, bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a2fa4778a6fe2fa43253ea724e5a608a3"><div class="ttname"><a href="namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3">metal::cos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a423a9f4f2fc7ef5ec7eda061277b51b6"><div class="ttname"><a href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a619a159ca5f2ddfe3647d3a6bb6e804c"><div class="ttname"><a href="namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c">metal::sin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a83320ba983d90dd1fa5847b6940dc0bb"><div class="ttname"><a href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">metal::isnan</a></div><div class="ttdeci">METAL_FUNC bool isnan(_MLX_BFloat16 x)</div><div class="ttdef"><b>Definition</b> bf16.h:307</div></div>
-<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_ac2a0b3618d922ac014baac8189d44650"><div class="ttname"><a href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_acd288d4552215bd10455584a214c57b8"><div class="ttname"><a href="namespacemetal.html#acd288d4552215bd10455584a214c57b8">metal::pow</a></div><div class="ttdeci">METAL_FUNC bfloat16_t pow(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a27c03f2f90ab56db2e4d59559a3d2e9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a></div><div class="ttdeci">float log1p(float x)</div><div class="ttdef"><b>Definition</b> utils.h:318</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a6f161b049cc6884f87b09b33c2d1cd7f"><div class="ttname"><a href="namespacemetal_1_1precise.html#a6f161b049cc6884f87b09b33c2d1cd7f">metal::precise::atan2</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atan2(bfloat16_t y, bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_html_a1d430793eaa38ccf0d07145e3fcd1e61"><div class="ttname"><a href="namespacemetal.html#a1d430793eaa38ccf0d07145e3fcd1e61">metal::atan2</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atan2(bfloat16_t y, bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a2fa4778a6fe2fa43253ea724e5a608a3"><div class="ttname"><a href="namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3">metal::cos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a423a9f4f2fc7ef5ec7eda061277b51b6"><div class="ttname"><a href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a619a159ca5f2ddfe3647d3a6bb6e804c"><div class="ttname"><a href="namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c">metal::sin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a83320ba983d90dd1fa5847b6940dc0bb"><div class="ttname"><a href="namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb">metal::isnan</a></div><div class="ttdeci">METAL_FUNC bool isnan(_MLX_BFloat16 x)</div><div class="ttdef"><b>Definition</b> bf16.h:301</div></div>
+<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_ac2a0b3618d922ac014baac8189d44650"><div class="ttname"><a href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_acd288d4552215bd10455584a214c57b8"><div class="ttname"><a href="namespacemetal.html#acd288d4552215bd10455584a214c57b8">metal::pow</a></div><div class="ttdeci">METAL_FUNC bfloat16_t pow(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 <div class="ttc" id="astruct_add_html"><div class="ttname"><a href="struct_add.html">Add</a></div><div class="ttdef"><b>Definition</b> binary_ops.h:8</div></div>
 <div class="ttc" id="astruct_add_html_ac5c66b63d63a222d3ae0ab8cc7c90eb5"><div class="ttname"><a href="struct_add.html#ac5c66b63d63a222d3ae0ab8cc7c90eb5">Add::operator()</a></div><div class="ttdeci">T operator()(T x, T y)</div><div class="ttdef"><b>Definition</b> binary_ops.h:10</div></div>
 <div class="ttc" id="astruct_arc_tan2_html"><div class="ttname"><a href="struct_arc_tan2.html">ArcTan2</a></div><div class="ttdef"><b>Definition</b> binary_ops.h:284</div></div>
diff --git a/docs/build/html/classes.html b/docs/build/html/classes.html
index 31672a75b..77d6a0d14 100644
--- a/docs/build/html/classes.html
+++ b/docs/build/html/classes.html
@@ -91,19 +91,19 @@ $(function(){ initResizable(false); });
 <div class="classindex">
 <dl class="classindex even">
 <dt class="alphachar"><a id="letter_A" name="letter_A">A</a></dt>
-<dd><a class="el" href="struct_abs.html">Abs</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_abs.html">Abs</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_abs.html">Abs</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">AccumHelper</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct_add.html">Add</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_add.html">Add</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_add.html">Add</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1add__vec.html">add_vec</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1add__vec_3_01cmplx_3_01_t_01_4_01_4.html">add_vec&lt; cmplx&lt; T &gt; &gt;</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_add_m_m.html">AddMM</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">AffineQuantize</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html">aligned_allocator</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html">AllGather</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" 
href="classmlx_1_1core_1_1allocator_1_1_allocator.html">Allocator</a> (<a class="el" href="namespacemlx_1_1core_1_1allocator.html">mlx::core::allocator</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html">AllReduce</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" href="struct_and.html">And</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arange.html">Arange</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_arc_cos.html">ArcCos</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_cos.html">ArcCos</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_cos.html">ArcCos</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_cosh.html">ArcCosh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_cosh.html">ArcCosh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_cosh.html">ArcCosh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_sin.html">ArcSin</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_sin.html">ArcSin</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_sin.html">ArcSin</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_sinh.html">ArcSinh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_sinh.html">ArcSinh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_sinh.html">ArcSinh</a> (<a class="el" 
href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_tan.html">ArcTan</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_tan.html">ArcTan</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_tan.html">ArcTan</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_tan2.html">ArcTan2</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_tan2.html">ArcTan2</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_tan2.html">ArcTan2</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_tanh.html">ArcTanh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_tanh.html">ArcTanh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_tanh.html">ArcTanh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_arg_partition.html">ArgPartition</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_arg_reduce.html">ArgReduce</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_arg_sort.html">ArgSort</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1arr.html">arr</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1arr__info.html">arr_info</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1array.html">array</a> (<a class="el" 
href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html">array::ArrayIterator</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_as_strided.html">AsStrided</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_as_type.html">AsType</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
+<dd><a class="el" href="struct_abs.html">Abs</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_abs.html">Abs</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_abs.html">Abs</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">AccumHelper</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct_add.html">Add</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_add.html">Add</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_add.html">Add</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1add__vec.html">add_vec</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1add__vec_3_01cmplx_3_01_t_01_4_01_4.html">add_vec&lt; cmplx&lt; T &gt; &gt;</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_add_m_m.html">AddMM</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">AffineQuantize</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html">aligned_allocator</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html">AllGather</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" 
href="classmlx_1_1core_1_1allocator_1_1_allocator.html">Allocator</a> (<a class="el" href="namespacemlx_1_1core_1_1allocator.html">mlx::core::allocator</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html">AllReduce</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" href="struct_and.html">And</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arange.html">Arange</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_arc_cos.html">ArcCos</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_cos.html">ArcCos</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_cos.html">ArcCos</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_cosh.html">ArcCosh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_cosh.html">ArcCosh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_cosh.html">ArcCosh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_sin.html">ArcSin</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_sin.html">ArcSin</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_sin.html">ArcSin</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_sinh.html">ArcSinh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_sinh.html">ArcSinh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_sinh.html">ArcSinh</a> (<a class="el" 
href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_tan.html">ArcTan</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_tan.html">ArcTan</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_tan.html">ArcTan</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_tan2.html">ArcTan2</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_tan2.html">ArcTan2</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_tan2.html">ArcTan2</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_arc_tanh.html">ArcTanh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_arc_tanh.html">ArcTanh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_arc_tanh.html">ArcTanh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_arg_partition.html">ArgPartition</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_arg_reduce.html">ArgReduce</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_arg_sort.html">ArgSort</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1arr.html">arr</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1arr__info.html">arr_info</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1array.html">array</a> (<a class="el" 
href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html">array::ArrayIterator</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_as_strided.html">AsStrided</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_as_type.html">AsType</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd></dl>
 <dl class="classindex odd">
 <dt class="alphachar"><a id="letter_B" name="letter_B">B</a></dt>
-<dd><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">BaseMMAFrag&lt; T, 8, 8 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">_MLX_BFloat16::bits_to_bfloat_struct</a></dd><dd><a class="el" href="struct_bitwise_and.html">BitwiseAnd</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_and.html">BitwiseAnd</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html">BitwiseBinary</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_bitwise_or.html">BitwiseOr</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_or.html">BitwiseOr</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_bitwise_xor.html">BitwiseXor</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_xor.html">BitwiseXor</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html">BlockMaskedMM</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_block_merge_sort.html">BlockMergeSort</a></dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html">BlockSwizzle</a> (<a class="el" 
href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="unionbool4__or__uint.html">bool4_or_uint</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_broadcast.html">Broadcast</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1allocator_1_1_buffer.html">Buffer</a> (<a class="el" href="namespacemlx_1_1core_1_1allocator.html">mlx::core::allocator</a>)</dd></dl>
+<dd><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">BaseMMAFrag&lt; T, 8, 8 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html">_MLX_BFloat16::bits_to_bfloat_struct</a></dd><dd><a class="el" href="struct_bitwise_and.html">BitwiseAnd</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_and.html">BitwiseAnd</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html">BitwiseBinary</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_bitwise_or.html">BitwiseOr</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_or.html">BitwiseOr</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_bitwise_xor.html">BitwiseXor</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_xor.html">BitwiseXor</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html">BlockMaskedMM</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_block_merge_sort.html">BlockMergeSort</a></dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a> (<a class="el" 
href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html">BlockSwizzle</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="unionbool4__or__uint.html">bool4_or_uint</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_broadcast.html">Broadcast</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1allocator_1_1_buffer.html">Buffer</a> (<a class="el" href="namespacemlx_1_1core_1_1allocator.html">mlx::core::allocator</a>)</dd></dl>
 <dl class="classindex even">
 <dt class="alphachar"><a id="letter_C" name="letter_C">C</a></dt>
-<dd><a class="el" href="struct_ceil.html">Ceil</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_ceil.html">Ceil</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_ceil.html">Ceil</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1cfftp.html">cfftp</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper.html">ChannelHelper</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html">ChannelHelper&lt; 1 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html">ChannelHelper&lt; 2 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html">ChannelHelper&lt; 3 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html">ChannelHelper&lt; 4 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_cholesky.html">Cholesky</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html">cmplx</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1cndarr.html">cndarr</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> (<a class="el" 
href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1allocator_1_1_common_allocator.html">CommonAllocator</a> (<a class="el" href="namespacemlx_1_1core_1_1allocator.html">mlx::core::allocator</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_compiled.html">Compiled</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1complex128__t.html">complex128_t</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structcomplex64__t.html">complex64_t</a></dd><dd><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_concatenate.html">Concatenate</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html">concurrent_queue</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html">CommandEncoder::ConcurrentContext</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="struct_conjugate.html">Conjugate</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_conjugate.html">Conjugate</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_conjugate.html">Conjugate</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html">ContiguousIterator</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_base_info.html">Conv2DGeneralBaseInfo</a> (<a 
class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html">Conv2DGeneralJumpParams</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html">Conv2DInputBlockLoaderGeneral</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html">Conv2DInputBlockLoaderLargeFilter</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html">Conv2DInputBlockLoaderSmallChannels</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html">Conv2DInputBlockLoaderSmallFilter</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html">Conv2DWeightBlockLoader</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html">Conv2DWeightBlockLoaderGeneral</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html">Conv2DWeightBlockLoaderSmallChannels</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_convolution.html">Convolution</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_copy.html">Copy</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_cos.html">Cos</a></dd><dd><a class="el" 
href="classmlx_1_1core_1_1_cos.html">Cos</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_cos.html">Cos</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_cosh.html">Cosh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_cosh.html">Cosh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_cosh.html">Cosh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_cum_max.html">CumMax</a></dd><dd><a class="el" href="struct_cum_min.html">CumMin</a></dd><dd><a class="el" href="struct_cum_prod.html">CumProd</a></dd><dd><a class="el" href="struct_cum_prod_3_01bool_01_4.html">CumProd&lt; bool &gt;</a></dd><dd><a class="el" href="struct_cum_sum.html">CumSum</a></dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html">Custom</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">CustomKernel</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">CustomKernelShapeInfo</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_custom_transforms.html">CustomTransforms</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
+<dd><a class="el" href="struct_ceil.html">Ceil</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_ceil.html">Ceil</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_ceil.html">Ceil</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1cfftp.html">cfftp</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper.html">ChannelHelper</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html">ChannelHelper&lt; 1 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html">ChannelHelper&lt; 2 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html">ChannelHelper&lt; 3 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html">ChannelHelper&lt; 4 &gt;</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_cholesky.html">Cholesky</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html">cmplx</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1cndarr.html">cndarr</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> (<a class="el" 
href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1allocator_1_1_common_allocator.html">CommonAllocator</a> (<a class="el" href="namespacemlx_1_1core_1_1allocator.html">mlx::core::allocator</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_compiled.html">Compiled</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1complex128__t.html">complex128_t</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structcomplex64__t.html">complex64_t</a></dd><dd><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_concatenate.html">Concatenate</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html">concurrent_queue</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html">CommandEncoder::ConcurrentContext</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="struct_conjugate.html">Conjugate</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_conjugate.html">Conjugate</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_conjugate.html">Conjugate</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_contiguous.html">Contiguous</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html">ContiguousIterator</a> (<a class="el" 
href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_base_info.html">Conv2DGeneralBaseInfo</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html">Conv2DGeneralJumpParams</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html">Conv2DInputBlockLoaderGeneral</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html">Conv2DInputBlockLoaderLargeFilter</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html">Conv2DInputBlockLoaderSmallChannels</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html">Conv2DInputBlockLoaderSmallFilter</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html">Conv2DWeightBlockLoader</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html">Conv2DWeightBlockLoaderGeneral</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html">Conv2DWeightBlockLoaderSmallChannels</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_convolution.html">Convolution</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_copy.html">Copy</a> (<a 
class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_cos.html">Cos</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_cos.html">Cos</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_cos.html">Cos</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_cosh.html">Cosh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_cosh.html">Cosh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_cosh.html">Cosh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_c_shape.html">CShape</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct_cum_max.html">CumMax</a></dd><dd><a class="el" href="struct_cum_min.html">CumMin</a></dd><dd><a class="el" href="struct_cum_prod.html">CumProd</a></dd><dd><a class="el" href="struct_cum_prod_3_01bool_01_4.html">CumProd&lt; bool &gt;</a></dd><dd><a class="el" href="struct_cum_sum.html">CumSum</a></dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html">Custom</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">CustomKernel</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">CustomKernelShapeInfo</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_custom_transforms.html">CustomTransforms</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
 <dl class="classindex odd">
 <dt class="alphachar"><a id="letter_D" name="letter_D">D</a></dt>
-<dd><a class="el" href="structmlx_1_1core_1_1array_1_1_data.html">array::Data</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_default_contiguous_reduce.html">DefaultContiguousReduce</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_default_strided_reduce.html">DefaultStridedReduce</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_depends.html">Depends</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_device.html">Device</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">DeviceStream</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_dist_primitive.html">DistPrimitive</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" href="struct_divide.html">Divide</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_divide.html">Divide</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_divide.html">Divide</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_div_mod.html">DivMod</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_div_mod.html">DivMod</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
+<dd><a class="el" href="structmlx_1_1core_1_1array_1_1_data.html">array::Data</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_default_contiguous_reduce.html">DefaultContiguousReduce</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_default_strided_reduce.html">DefaultStridedReduce</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_depends.html">Depends</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_device.html">Device</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">Device</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html">DeviceStream</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_dist_primitive.html">DistPrimitive</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" href="struct_divide.html">Divide</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_divide.html">Divide</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_divide.html">Divide</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_div_mod.html">DivMod</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_div_mod.html">DivMod</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_div_op.html">DivOp</a></dd><dd><a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> (<a class="el" 
href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
 <dl class="classindex even">
 <dt class="alphachar"><a id="letter_E" name="letter_E">E</a></dt>
-<dd><a class="el" href="classmlx_1_1core_1_1_eigh.html">Eigh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_equal.html">Equal</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_equal.html">Equal</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_equal.html">Equal</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_erf.html">Erf</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf.html">Erf</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_erf.html">Erf</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_erf_inv.html">ErfInv</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf_inv.html">ErfInv</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_erf_inv.html">ErfInv</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_event.html">Event</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_c2_c.html">ExecC2C</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_dcst.html">ExecDcst</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_hartley.html">ExecHartley</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_r2_r.html">ExecR2R</a> (<a class="el" 
href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="struct_exp.html">Exp</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_exp.html">Exp</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_exp.html">Exp</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_expm1.html">Expm1</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_expm1.html">Expm1</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_expm1.html">Expm1</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
+<dd><a class="el" href="classmlx_1_1core_1_1_eigh.html">Eigh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_equal.html">Equal</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_equal.html">Equal</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_equal.html">Equal</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_erf.html">Erf</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf.html">Erf</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_erf.html">Erf</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_erf_inv.html">ErfInv</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf_inv.html">ErfInv</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_erf_inv.html">ErfInv</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_event.html">Event</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_c2_c.html">ExecC2C</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_dcst.html">ExecDcst</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_hartley.html">ExecHartley</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1_exec_r2_r.html">ExecR2R</a> (<a class="el" 
href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="struct_exp.html">Exp</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_exp.html">Exp</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_exp.html">Exp</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_expm1.html">Expm1</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_expm1.html">Expm1</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_expm1.html">Expm1</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_exp_sub_op.html">ExpSubOp</a></dd></dl>
 <dl class="classindex odd">
 <dt class="alphachar"><a id="letter_F" name="letter_F">F</a></dt>
 <dd><a class="el" href="structmlx_1_1core_1_1metal_1_1_fence.html">Fence</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_f_f_t.html">FFT</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1fftblue.html">fftblue</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1io_1_1_file_writer.html">FileWriter</a> (<a class="el" href="namespacemlx_1_1core_1_1io.html">mlx::core::io</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html">array::Flags</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_floor.html">Floor</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_floor.html">Floor</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_floor.html">Floor</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_floor_divide.html">FloorDivide</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_full.html">Full</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
@@ -121,10 +121,10 @@ $(function(){ initResizable(false); });
 <dd><a class="el" href="struct_kernel_merge_sort.html">KernelMergeSort</a></dd><dd><a class="el" href="struct_kernel_multi_block_merge_sort.html">KernelMultiBlockMergeSort</a></dd><dd><a class="el" href="classmlx_1_1core_1_1random_1_1_key_sequence.html">KeySequence</a> (<a class="el" href="namespacemlx_1_1core_1_1random.html">mlx::core::random</a>)</dd></dl>
 <dl class="classindex even">
 <dt class="alphachar"><a id="letter_L" name="letter_L">L</a></dt>
-<dd><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html">latch</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html">LayerNorm</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html">LayerNormVJP</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="struct_left_shift.html">LeftShift</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_left_shift.html">LeftShift</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_less.html">Less</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_less.html">Less</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_less.html">Less</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_less_equal.html">LessEqual</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_less_equal.html">LessEqual</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_less_equal.html">LessEqual</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_less_than.html">LessThan</a></dd><dd><a class="el" href="struct_limits.html">Limits</a></dd><dd><a class="el" href="struct_limits_3_01bfloat16__t_01_4.html">Limits&lt; bfloat16_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01bool_01_4.html">Limits&lt; bool &gt;</a></dd><dd><a class="el" href="struct_limits_3_01complex64__t_01_4.html">Limits&lt; complex64_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01float_01_4.html">Limits&lt; float 
&gt;</a></dd><dd><a class="el" href="struct_limits_3_01half_01_4.html">Limits&lt; half &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int16__t_01_4.html">Limits&lt; int16_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int32__t_01_4.html">Limits&lt; int32_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int64__t_01_4.html">Limits&lt; int64_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int8__t_01_4.html">Limits&lt; int8_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint16__t_01_4.html">Limits&lt; uint16_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint32__t_01_4.html">Limits&lt; uint32_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint64__t_01_4.html">Limits&lt; uint64_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint8__t_01_4.html">Limits&lt; uint8_t &gt;</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_load.html">Load</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_log.html">Log</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log.html">Log</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_log.html">Log</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_log10.html">Log10</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log10.html">Log10</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_log1p.html">Log1p</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log1p.html">Log1p</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_log1p.html">Log1p</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_log2.html">Log2</a></dd><dd><a class="el" 
href="structmlx_1_1core_1_1detail_1_1_log2.html">Log2</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_log_add_exp.html">LogAddExp</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log_add_exp.html">LogAddExp</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_log_add_exp.html">LogAddExp</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_logical_and.html">LogicalAnd</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_and.html">LogicalAnd</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_logical_and.html">LogicalAnd</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_logical_not.html">LogicalNot</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_not.html">LogicalNot</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_logical_not.html">LogicalNot</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_logical_or.html">LogicalOr</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_or.html">LogicalOr</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_logical_or.html">LogicalOr</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a></dd><dd><a class="el" 
href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 0, offset_t &gt;</a></dd><dd><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 1, offset_t &gt;</a></dd></dl>
+<dd><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html">latch</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html">LayerNorm</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html">LayerNormVJP</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">Layout2D</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct_left_shift.html">LeftShift</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_left_shift.html">LeftShift</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_less.html">Less</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_less.html">Less</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_less.html">Less</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_less_equal.html">LessEqual</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_less_equal.html">LessEqual</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_less_equal.html">LessEqual</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_less_than.html">LessThan</a></dd><dd><a class="el" href="struct_limits.html">Limits</a></dd><dd><a class="el" href="struct_limits_3_01bfloat16__t_01_4.html">Limits&lt; bfloat16_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01bool_01_4.html">Limits&lt; bool &gt;</a></dd><dd><a class="el" 
href="struct_limits_3_01complex64__t_01_4.html">Limits&lt; complex64_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01float_01_4.html">Limits&lt; float &gt;</a></dd><dd><a class="el" href="struct_limits_3_01half_01_4.html">Limits&lt; half &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int16__t_01_4.html">Limits&lt; int16_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int32__t_01_4.html">Limits&lt; int32_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int64__t_01_4.html">Limits&lt; int64_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01int8__t_01_4.html">Limits&lt; int8_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint16__t_01_4.html">Limits&lt; uint16_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint32__t_01_4.html">Limits&lt; uint32_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint64__t_01_4.html">Limits&lt; uint64_t &gt;</a></dd><dd><a class="el" href="struct_limits_3_01uint8__t_01_4.html">Limits&lt; uint8_t &gt;</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_load.html">Load</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_log.html">Log</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log.html">Log</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_log.html">Log</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_log10.html">Log10</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log10.html">Log10</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_log1p.html">Log1p</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log1p.html">Log1p</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_log1p.html">Log1p</a> 
(<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_log2.html">Log2</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log2.html">Log2</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_log_add_exp.html">LogAddExp</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_log_add_exp.html">LogAddExp</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_log_add_exp.html">LogAddExp</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_logical_and.html">LogicalAnd</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_and.html">LogicalAnd</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_logical_and.html">LogicalAnd</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_logical_not.html">LogicalNot</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_not.html">LogicalNot</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_logical_not.html">LogicalNot</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_logical_or.html">LogicalOr</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_or.html">LogicalOr</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_logical_or.html">LogicalOr</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" 
href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a></dd><dd><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></dd><dd><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></dd></dl>
 <dl class="classindex odd">
 <dt class="alphachar"><a id="letter_M" name="letter_M">M</a></dt>
-<dd><a class="el" href="structmetal_1_1make__void.html">make_void</a> (<a class="el" href="namespacemetal.html">metal</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_matmul.html">Matmul</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_max.html">Max</a></dd><dd><a class="el" href="struct_maximum.html">Maximum</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_maximum.html">Maximum</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_maximum.html">Maximum</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html">MetalAllocator</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="struct_min.html">Min</a></dd><dd><a class="el" href="struct_minimum.html">Minimum</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_minimum.html">Minimum</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_minimum.html">Minimum</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx__atomic.html">mlx_atomic</a></dd><dd><a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a></dd><dd><a class="el" href="struct_m_l_x_conv_params.html">MLXConvParams</a></dd><dd><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></dd><dd><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></dd><dd><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a 
class="el" href="classpocketfft_1_1detail_1_1multi__iter.html">multi_iter</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_multiply.html">Multiply</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_multiply.html">Multiply</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_multiply.html">Multiply</a></dd></dl>
+<dd><a class="el" href="structmetal_1_1make__void.html">make_void</a> (<a class="el" href="namespacemetal.html">metal</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_matmul.html">Matmul</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_max.html">Max</a></dd><dd><a class="el" href="struct_maximum.html">Maximum</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_maximum.html">Maximum</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_maximum.html">Maximum</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_max_op.html">MaxOp</a></dd><dd><a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html">MetalAllocator</a> (<a class="el" href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="struct_min.html">Min</a></dd><dd><a class="el" href="struct_minimum.html">Minimum</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_minimum.html">Minimum</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_minimum.html">Minimum</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx__atomic.html">mlx_atomic</a></dd><dd><a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a></dd><dd><a class="el" href="struct_m_l_x_conv_params.html">MLXConvParams</a></dd><dd><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct_mul_op.html">MulOp</a></dd><dd><a class="el" href="classpocketfft_1_1detail_1_1multi__iter.html">multi_iter</a> (<a class="el" 
href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_multiply.html">Multiply</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_multiply.html">Multiply</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_multiply.html">Multiply</a></dd></dl>
 <dl class="classindex even">
 <dt class="alphachar"><a id="letter_N" name="letter_N">N</a></dt>
 <dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_na_n_equal.html">NaNEqual</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_na_n_equal.html">NaNEqual</a></dd><dd><a class="el" href="classpocketfft_1_1detail_1_1ndarr.html">ndarr</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_negative.html">Negative</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_negative.html">Negative</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_negative.html">Negative</a></dd><dd><a class="el" href="structmlx_1_1core_1_1_node_namer.html">NodeNamer</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_none.html">None</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_not_equal.html">NotEqual</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_not_equal.html">NotEqual</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_not_equal.html">NotEqual</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_number_of_elements.html">NumberOfElements</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
@@ -142,10 +142,10 @@ $(function(){ initResizable(false); });
 <dd><a class="el" href="classmlx_1_1core_1_1_random_bits.html">RandomBits</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1io_1_1_reader.html">Reader</a> (<a class="el" href="namespacemlx_1_1core_1_1io.html">mlx::core::io</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">BlockLoader::ReadVector</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct_read_writer.html">ReadWriter</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_real.html">Real</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_real.html">Real</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_real.html">Real</a></dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_recv.html">Recv</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_reduce.html">Reduce</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_reduction_plan.html">ReductionPlan</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_remainder.html">Remainder</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_remainder.html">Remainder</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_remainder.html">Remainder</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_reshape.html">Reshape</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1metal_1_1_residency_set.html">ResidencySet</a> (<a class="el" 
href="namespacemlx_1_1core_1_1metal.html">mlx::core::metal</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_retain_graph.html">RetainGraph</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1rev__iter.html">rev_iter</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1rfftp.html">rfftp</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_right_shift.html">RightShift</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_right_shift.html">RightShift</a></dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html">RMSNorm</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html">RMSNormVJP</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html">RoPE</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_round.html">Round</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_round.html">Round</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_round.html">Round</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_rsqrt.html">Rsqrt</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="struct_rsqrt.html">Rsqrt</a></dd></dl>
 <dl class="classindex odd">
 <dt class="alphachar"><a id="letter_S" name="letter_S">S</a></dt>
-<dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html">ScaledDotProductAttention</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="struct_scale_op.html">ScaleOp</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_scan.html">Scan</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_scatter.html">Scatter</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html">Scheduler</a> (<a class="el" href="namespacemlx_1_1core_1_1scheduler.html">mlx::core::scheduler</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_select.html">Select</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_select.html">Select</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_select.html">Select</a></dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html">Send</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sigmoid.html">Sigmoid</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sigmoid.html">Sigmoid</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sigmoid.html">Sigmoid</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sign.html">Sign</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sign.html">Sign</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sign.html">Sign</a></dd><dd><a class="el" 
href="classpocketfft_1_1detail_1_1simple__iter.html">simple_iter</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sin.html">Sin</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sin.html">Sin</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sin.html">Sin</a></dd><dd><a class="el" href="classpocketfft_1_1detail_1_1sincos__2pibyn.html">sincos_2pibyn</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sinh.html">Sinh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sinh.html">Sinh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sinh.html">Sinh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_slice.html">Slice</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_slice_update.html">SliceUpdate</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_softmax.html">Softmax</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sort.html">Sort</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_split.html">Split</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sqrt.html">Sqrt</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sqrt.html">Sqrt</a> (<a class="el" 
href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sqrt.html">Sqrt</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_square.html">Square</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_square.html">Square</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_square.html">Square</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_stop_gradient.html">StopGradient</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_stream_context.html">StreamContext</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html">StreamThread</a> (<a class="el" href="namespacemlx_1_1core_1_1scheduler.html">mlx::core::scheduler</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_subtract.html">Subtract</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_subtract.html">Subtract</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_subtract.html">Subtract</a></dd><dd><a class="el" href="struct_sum.html">Sum</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_s_v_d.html">SVD</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
+<dd><a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html">ScaledDotProductAttention</a> (<a class="el" href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a>)</dd><dd><a class="el" href="struct_scale_op.html">ScaleOp</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_scan.html">Scan</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_scatter.html">Scatter</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html">Scheduler</a> (<a class="el" href="namespacemlx_1_1core_1_1scheduler.html">mlx::core::scheduler</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_select.html">Select</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_select.html">Select</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_select.html">Select</a></dd><dd><a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html">Send</a> (<a class="el" href="namespacemlx_1_1core_1_1distributed.html">mlx::core::distributed</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">Shape2D</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sigmoid.html">Sigmoid</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sigmoid.html">Sigmoid</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sigmoid.html">Sigmoid</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sign.html">Sign</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sign.html">Sign</a> (<a class="el" 
href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sign.html">Sign</a></dd><dd><a class="el" href="classpocketfft_1_1detail_1_1simple__iter.html">simple_iter</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sin.html">Sin</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sin.html">Sin</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sin.html">Sin</a></dd><dd><a class="el" href="classpocketfft_1_1detail_1_1sincos__2pibyn.html">sincos_2pibyn</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sinh.html">Sinh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sinh.html">Sinh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sinh.html">Sinh</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_slice.html">Slice</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_slice_update.html">SliceUpdate</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_softmax.html">Softmax</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_sort.html">Sort</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_split.html">Split</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_sqrt.html">Sqrt</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a 
class="el" href="classmlx_1_1core_1_1_sqrt.html">Sqrt</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_sqrt.html">Sqrt</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_square.html">Square</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_square.html">Square</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_square.html">Square</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_stop_gradient.html">StopGradient</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_stream_context.html">StreamContext</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html">StreamThread</a> (<a class="el" href="namespacemlx_1_1core_1_1scheduler.html">mlx::core::scheduler</a>)</dd><dd><a class="el" href="struct_sub_op.html">SubOp</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_subtract.html">Subtract</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_subtract.html">Subtract</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_subtract.html">Subtract</a></dd><dd><a class="el" href="struct_sum.html">Sum</a></dd><dd><a class="el" href="struct_sum_op.html">SumOp</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_s_v_d.html">SVD</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
 <dl class="classindex even">
 <dt class="alphachar"><a id="letter_T" name="letter_T">T</a></dt>
-<dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst23.html">T_dcst23</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst4.html">T_dcst4</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dct1.html">T_dct1</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dst1.html">T_dst1</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_tan.html">Tan</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_tan.html">Tan</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_tan.html">Tan</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_tanh.html">Tanh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_tanh.html">Tanh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_tanh.html">Tanh</a></dd><dd><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1thread__pool.html">thread_pool</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="class_thread_pool.html">ThreadPool</a></dd><dd><a class="el" href="struct_thread_sort.html">ThreadSort</a></dd><dd><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">TransformAdd</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">TransformAxpby</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a 
class="el" href="structmlx_1_1steel_1_1_transform_none.html">TransformNone</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_transpose.html">Transpose</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_type_to_dtype.html">TypeToDtype</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
+<dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst23.html">T_dcst23</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst4.html">T_dcst4</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dct1.html">T_dct1</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="classpocketfft_1_1detail_1_1_t__dst1.html">T_dst1</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_tan.html">Tan</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_tan.html">Tan</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_tan.html">Tan</a></dd><dd><a class="el" href="structmlx_1_1core_1_1detail_1_1_tanh.html">Tanh</a> (<a class="el" href="namespacemlx_1_1core_1_1detail.html">mlx::core::detail</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_tanh.html">Tanh</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="struct_tanh.html">Tanh</a></dd><dd><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1thread__pool.html">thread_pool</a> (<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html">pocketfft::detail::threading</a>)</dd><dd><a class="el" href="class_thread_pool.html">ThreadPool</a></dd><dd><a class="el" href="struct_thread_sort.html">ThreadSort</a></dd><dd><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">TransformAdd</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">TransformAxpby</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a 
class="el" href="structmlx_1_1steel_1_1_transform_none.html">TransformNone</a> (<a class="el" href="namespacemlx_1_1steel.html">mlx::steel</a>)</dd><dd><a class="el" href="struct_transform_scale.html">TransformScale</a></dd><dd><a class="el" href="classmlx_1_1core_1_1_transpose.html">Transpose</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structmlx_1_1core_1_1_type_to_dtype.html">TypeToDtype</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd></dl>
 <dl class="classindex odd">
 <dt class="alphachar"><a id="letter_U" name="letter_U">U</a></dt>
 <dd><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="classmlx_1_1core_1_1_uniform.html">Uniform</a> (<a class="el" href="namespacemlx_1_1core.html">mlx::core</a>)</dd><dd><a class="el" href="structpocketfft_1_1detail_1_1util.html">util</a> (<a class="el" href="namespacepocketfft_1_1detail.html">pocketfft::detail</a>)</dd></dl>
diff --git a/docs/build/html/classmlx_1_1core_1_1_contiguous-members.html b/docs/build/html/classmlx_1_1core_1_1_contiguous-members.html
new file mode 100644
index 000000000..49a90d86e
--- /dev/null
+++ b/docs/build/html/classmlx_1_1core_1_1_contiguous-members.html
@@ -0,0 +1,129 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1core.html">core</a></li><li class="navelem"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">Contiguous</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">mlx::core::Contiguous Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0">Contiguous</a>(Stream stream, bool allow_col_major)</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">explicit</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a8ae61e3289c4134232a69295268f8261">device</a>()</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336">eval_cpu</a>(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#aa0ed6e32c36200a3ff9bc592c9b300db">mlx::core::UnaryPrimitive::eval_cpu</a>(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f">eval_gpu</a>(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a971fe9ad47f6569118879ce1d0f41447">mlx::core::UnaryPrimitive::eval_gpu</a>(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372">is_equivalent</a>(const Primitive &amp;other) const override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991">jvp</a>(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;tangents, const std::vector&lt; int &gt; &amp;argnums) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a0a859309a4f192f2679e07f2e4ff4d22">operator=</a>(const UnaryPrimitive &amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#ab90b2ea80f1d914be03cf44def5db5a5">operator=</a>(UnaryPrimitive &amp;&amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a6b1be7ea92f3a7bb19875c70259dad6b">mlx::core::Primitive::operator=</a>(const Primitive &amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a50bbddd43e1ba0cf5f127cd7aa756a9e">mlx::core::Primitive::operator=</a>(Primitive &amp;&amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c">output_shapes</a>(const std::vector&lt; array &gt; &amp;inputs) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#afc69f22ee1f6e8a9ecc2c3a8f43b8fdb">Primitive</a>(Stream stream)</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">explicit</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a3349f745fae50ca7627f79a731a19e32">Primitive</a>(const Primitive &amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a342da891b9882bdee9a0e0c1ac826eda">Primitive</a>(Primitive &amp;&amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23">print</a>(std::ostream &amp;os) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>()</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a189f6d4ed369f82a4b724a29eb056d4e">UnaryPrimitive</a>(Stream stream)</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">explicit</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a9935cffc4f246d3d883bc3d26c5163f2">UnaryPrimitive</a>(const UnaryPrimitive &amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a780281fb04e2daf1be630c124bd605e3">UnaryPrimitive</a>(UnaryPrimitive &amp;&amp;other)=delete</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#abf488f02057fd5852f38b2e8a600ad2a">vjp</a>(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotangents, const std::vector&lt; int &gt; &amp;argnums, const std::vector&lt; array &gt; &amp;outputs) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec">vmap</a>(const std::vector&lt; array &gt; &amp;inputs, const std::vector&lt; int &gt; &amp;axes) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a29f70eb2d3b7e6c5fe52779c03f03777">~Primitive</a>()=default</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#ac0677ab99a5ca660ed6ab7902ea364de">~UnaryPrimitive</a>()=default</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/classmlx_1_1core_1_1_contiguous.html b/docs/build/html/classmlx_1_1core_1_1_contiguous.html
new file mode 100644
index 000000000..98a3d0d20
--- /dev/null
+++ b/docs/build/html/classmlx_1_1core_1_1_contiguous.html
@@ -0,0 +1,481 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx::core::Contiguous Class Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1core.html">core</a></li><li class="navelem"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">Contiguous</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-methods">Public Member Functions</a> &#124;
+<a href="classmlx_1_1core_1_1_contiguous-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">mlx::core::Contiguous Class Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="primitives_8h_source.html">primitives.h</a>&gt;</code></p>
+<div class="dynheader">
+Inheritance diagram for mlx::core::Contiguous:</div>
+<div class="dyncontent">
+ <div class="center">
+  <img src="classmlx_1_1core_1_1_contiguous.png" usemap="#mlx::core::Contiguous_map" alt=""/>
+  <map id="mlx::core::Contiguous_map" name="mlx::core::Contiguous_map">
+<area href="classmlx_1_1core_1_1_unary_primitive.html" alt="mlx::core::UnaryPrimitive" shape="rect" coords="0,56,150,80"/>
+<area href="classmlx_1_1core_1_1_primitive.html" alt="mlx::core::Primitive" shape="rect" coords="0,0,150,24"/>
+  </map>
+</div></div>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
+Public Member Functions</h2></td></tr>
+<tr class="memitem:a3e83f414c02ae0b92a50b6f8e402e1c0" id="r_a3e83f414c02ae0b92a50b6f8e402e1c0"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3e83f414c02ae0b92a50b6f8e402e1c0">Contiguous</a> (<a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> <a class="el" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>, bool allow_col_major)</td></tr>
+<tr class="separator:a3e83f414c02ae0b92a50b6f8e402e1c0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a742de24e6c0310cd85a606dec0cd8336" id="r_a742de24e6c0310cd85a606dec0cd8336"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a742de24e6c0310cd85a606dec0cd8336">eval_cpu</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out) override</td></tr>
+<tr class="separator:a742de24e6c0310cd85a606dec0cd8336"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a519cd16fd0c55b371ea7625fbb37c70f" id="r_a519cd16fd0c55b371ea7625fbb37c70f"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a519cd16fd0c55b371ea7625fbb37c70f">eval_gpu</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out) override</td></tr>
+<tr class="separator:a519cd16fd0c55b371ea7625fbb37c70f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a563221e90b15aa90bfae23d29c10e4ec" id="r_a563221e90b15aa90bfae23d29c10e4ec"><td class="memItemLeft" align="right" valign="top">virtual std::pair&lt; std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;, std::vector&lt; int &gt; &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a563221e90b15aa90bfae23d29c10e4ec">vmap</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, const std::vector&lt; int &gt; &amp;axes) override</td></tr>
+<tr class="memdesc:a563221e90b15aa90bfae23d29c10e4ec"><td class="mdescLeft">&#160;</td><td class="mdescRight">The primitive must know how to vectorize itself across the given axes.  <br /></td></tr>
+<tr class="separator:a563221e90b15aa90bfae23d29c10e4ec"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1f9fcae7235e0ae9217825b78cb0f991" id="r_a1f9fcae7235e0ae9217825b78cb0f991"><td class="memItemLeft" align="right" valign="top">std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1f9fcae7235e0ae9217825b78cb0f991">jvp</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;primals, const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;tangents, const std::vector&lt; int &gt; &amp;argnums) override</td></tr>
+<tr class="memdesc:a1f9fcae7235e0ae9217825b78cb0f991"><td class="mdescLeft">&#160;</td><td class="mdescRight">The Jacobian-vector product.  <br /></td></tr>
+<tr class="separator:a1f9fcae7235e0ae9217825b78cb0f991"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abf488f02057fd5852f38b2e8a600ad2a" id="r_abf488f02057fd5852f38b2e8a600ad2a"><td class="memItemLeft" align="right" valign="top">std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abf488f02057fd5852f38b2e8a600ad2a">vjp</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;primals, const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;cotangents, const std::vector&lt; int &gt; &amp;argnums, const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs) override</td></tr>
+<tr class="memdesc:abf488f02057fd5852f38b2e8a600ad2a"><td class="mdescLeft">&#160;</td><td class="mdescRight">The vector-Jacobian product.  <br /></td></tr>
+<tr class="separator:abf488f02057fd5852f38b2e8a600ad2a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aca8a4ba9a58cc10f063e6b082fa2fc23" id="r_aca8a4ba9a58cc10f063e6b082fa2fc23"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aca8a4ba9a58cc10f063e6b082fa2fc23">print</a> (std::ostream &amp;os) override</td></tr>
+<tr class="memdesc:aca8a4ba9a58cc10f063e6b082fa2fc23"><td class="mdescLeft">&#160;</td><td class="mdescRight">Print the primitive.  <br /></td></tr>
+<tr class="separator:aca8a4ba9a58cc10f063e6b082fa2fc23"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1a53623d7c591ba6567ac1533fbc2b7c" id="r_a1a53623d7c591ba6567ac1533fbc2b7c"><td class="memItemLeft" align="right" valign="top">std::vector&lt; std::vector&lt; int &gt; &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1a53623d7c591ba6567ac1533fbc2b7c">output_shapes</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs) override</td></tr>
+<tr class="memdesc:a1a53623d7c591ba6567ac1533fbc2b7c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Get the output shapes of the primitive.  <br /></td></tr>
+<tr class="separator:a1a53623d7c591ba6567ac1533fbc2b7c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa5d273a461fc6e64f3c9a67c24cb3372" id="r_aa5d273a461fc6e64f3c9a67c24cb3372"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa5d273a461fc6e64f3c9a67c24cb3372">is_equivalent</a> (const <a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;other) const override</td></tr>
+<tr class="memdesc:aa5d273a461fc6e64f3c9a67c24cb3372"><td class="mdescLeft">&#160;</td><td class="mdescRight">Equivalence check defaults to false unless overridden by the primitive.  <br /></td></tr>
+<tr class="separator:aa5d273a461fc6e64f3c9a67c24cb3372"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="inherit_header pub_methods_classmlx_1_1core_1_1_unary_primitive"><td colspan="2" onclick="javascript:dynsection.toggleInherit('pub_methods_classmlx_1_1core_1_1_unary_primitive')"><img src="closed.png" alt="-"/>&#160;Public Member Functions inherited from <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></td></tr>
+<tr class="memitem:a189f6d4ed369f82a4b724a29eb056d4e inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_a189f6d4ed369f82a4b724a29eb056d4e"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a189f6d4ed369f82a4b724a29eb056d4e">UnaryPrimitive</a> (<a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> <a class="el" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>)</td></tr>
+<tr class="memdesc:a189f6d4ed369f82a4b724a29eb056d4e inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="mdescLeft">&#160;</td><td class="mdescRight">An abstract base class for a primitive with a single output.  <br /></td></tr>
+<tr class="separator:a189f6d4ed369f82a4b724a29eb056d4e inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa0ed6e32c36200a3ff9bc592c9b300db inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_aa0ed6e32c36200a3ff9bc592c9b300db"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#aa0ed6e32c36200a3ff9bc592c9b300db">eval_cpu</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs) override</td></tr>
+<tr class="memdesc:aa0ed6e32c36200a3ff9bc592c9b300db inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="mdescLeft">&#160;</td><td class="mdescRight">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the output arrays.  <br /></td></tr>
+<tr class="separator:aa0ed6e32c36200a3ff9bc592c9b300db inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a971fe9ad47f6569118879ce1d0f41447 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_a971fe9ad47f6569118879ce1d0f41447"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a971fe9ad47f6569118879ce1d0f41447">eval_gpu</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs) override</td></tr>
+<tr class="separator:a971fe9ad47f6569118879ce1d0f41447 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac0677ab99a5ca660ed6ab7902ea364de inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_ac0677ab99a5ca660ed6ab7902ea364de"><td class="memItemLeft" align="right" valign="top">virtual&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#ac0677ab99a5ca660ed6ab7902ea364de">~UnaryPrimitive</a> ()=default</td></tr>
+<tr class="separator:ac0677ab99a5ca660ed6ab7902ea364de inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9935cffc4f246d3d883bc3d26c5163f2 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_a9935cffc4f246d3d883bc3d26c5163f2"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a9935cffc4f246d3d883bc3d26c5163f2">UnaryPrimitive</a> (const <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> &amp;other)=delete</td></tr>
+<tr class="separator:a9935cffc4f246d3d883bc3d26c5163f2 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a780281fb04e2daf1be630c124bd605e3 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_a780281fb04e2daf1be630c124bd605e3"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a780281fb04e2daf1be630c124bd605e3">UnaryPrimitive</a> (<a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> &amp;&amp;other)=delete</td></tr>
+<tr class="separator:a780281fb04e2daf1be630c124bd605e3 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0a859309a4f192f2679e07f2e4ff4d22 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_a0a859309a4f192f2679e07f2e4ff4d22"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a0a859309a4f192f2679e07f2e4ff4d22">operator=</a> (const <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> &amp;other)=delete</td></tr>
+<tr class="separator:a0a859309a4f192f2679e07f2e4ff4d22 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ab90b2ea80f1d914be03cf44def5db5a5 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive" id="r_ab90b2ea80f1d914be03cf44def5db5a5"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#ab90b2ea80f1d914be03cf44def5db5a5">operator=</a> (<a class="el" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> &amp;&amp;other)=delete</td></tr>
+<tr class="separator:ab90b2ea80f1d914be03cf44def5db5a5 inherit pub_methods_classmlx_1_1core_1_1_unary_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="inherit_header pub_methods_classmlx_1_1core_1_1_primitive"><td colspan="2" onclick="javascript:dynsection.toggleInherit('pub_methods_classmlx_1_1core_1_1_primitive')"><img src="closed.png" alt="-"/>&#160;Public Member Functions inherited from <a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td></tr>
+<tr class="memitem:afc69f22ee1f6e8a9ecc2c3a8f43b8fdb inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_afc69f22ee1f6e8a9ecc2c3a8f43b8fdb"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#afc69f22ee1f6e8a9ecc2c3a8f43b8fdb">Primitive</a> (<a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> <a class="el" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>)</td></tr>
+<tr class="separator:afc69f22ee1f6e8a9ecc2c3a8f43b8fdb inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8ae61e3289c4134232a69295268f8261 inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_a8ae61e3289c4134232a69295268f8261"><td class="memItemLeft" align="right" valign="top">const <a class="el" href="structmlx_1_1core_1_1_device.html">Device</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a8ae61e3289c4134232a69295268f8261">device</a> ()</td></tr>
+<tr class="memdesc:a8ae61e3289c4134232a69295268f8261 inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="mdescLeft">&#160;</td><td class="mdescRight">The device the primitive will run on.  <br /></td></tr>
+<tr class="separator:a8ae61e3289c4134232a69295268f8261 inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a46e6257397a662528f9f831842ac456a inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_a46e6257397a662528f9f831842ac456a"><td class="memItemLeft" align="right" valign="top">const <a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a> ()</td></tr>
+<tr class="memdesc:a46e6257397a662528f9f831842ac456a inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="mdescLeft">&#160;</td><td class="mdescRight">The stream the primitive will run on.  <br /></td></tr>
+<tr class="separator:a46e6257397a662528f9f831842ac456a inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a29f70eb2d3b7e6c5fe52779c03f03777 inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_a29f70eb2d3b7e6c5fe52779c03f03777"><td class="memItemLeft" align="right" valign="top">virtual&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a29f70eb2d3b7e6c5fe52779c03f03777">~Primitive</a> ()=default</td></tr>
+<tr class="separator:a29f70eb2d3b7e6c5fe52779c03f03777 inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3349f745fae50ca7627f79a731a19e32 inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_a3349f745fae50ca7627f79a731a19e32"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a3349f745fae50ca7627f79a731a19e32">Primitive</a> (const <a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;other)=delete</td></tr>
+<tr class="separator:a3349f745fae50ca7627f79a731a19e32 inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a342da891b9882bdee9a0e0c1ac826eda inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_a342da891b9882bdee9a0e0c1ac826eda"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a342da891b9882bdee9a0e0c1ac826eda">Primitive</a> (<a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;&amp;other)=delete</td></tr>
+<tr class="separator:a342da891b9882bdee9a0e0c1ac826eda inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6b1be7ea92f3a7bb19875c70259dad6b inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_a6b1be7ea92f3a7bb19875c70259dad6b"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a6b1be7ea92f3a7bb19875c70259dad6b">operator=</a> (const <a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;other)=delete</td></tr>
+<tr class="separator:a6b1be7ea92f3a7bb19875c70259dad6b inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a50bbddd43e1ba0cf5f127cd7aa756a9e inherit pub_methods_classmlx_1_1core_1_1_primitive" id="r_a50bbddd43e1ba0cf5f127cd7aa756a9e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a50bbddd43e1ba0cf5f127cd7aa756a9e">operator=</a> (<a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;&amp;other)=delete</td></tr>
+<tr class="separator:a50bbddd43e1ba0cf5f127cd7aa756a9e inherit pub_methods_classmlx_1_1core_1_1_primitive"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
+<a id="a3e83f414c02ae0b92a50b6f8e402e1c0" name="a3e83f414c02ae0b92a50b6f8e402e1c0"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a3e83f414c02ae0b92a50b6f8e402e1c0">&#9670;&#160;</a></span>Contiguous()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">mlx::core::Contiguous::Contiguous </td>
+          <td>(</td>
+          <td class="paramtype"><a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a></td>          <td class="paramname"><span class="paramname"><em>stream</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">bool</td>          <td class="paramname"><span class="paramname"><em>allow_col_major</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">explicit</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="a742de24e6c0310cd85a606dec0cd8336" name="a742de24e6c0310cd85a606dec0cd8336"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a742de24e6c0310cd85a606dec0cd8336">&#9670;&#160;</a></span>eval_cpu()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::Contiguous::eval_cpu </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>inputs</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Implements <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132">mlx::core::UnaryPrimitive</a>.</p>
+
+</div>
+</div>
+<a id="a519cd16fd0c55b371ea7625fbb37c70f" name="a519cd16fd0c55b371ea7625fbb37c70f"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a519cd16fd0c55b371ea7625fbb37c70f">&#9670;&#160;</a></span>eval_gpu()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::Contiguous::eval_gpu </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>inputs</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Implements <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb">mlx::core::UnaryPrimitive</a>.</p>
+
+</div>
+</div>
+<a id="aa5d273a461fc6e64f3c9a67c24cb3372" name="aa5d273a461fc6e64f3c9a67c24cb3372"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa5d273a461fc6e64f3c9a67c24cb3372">&#9670;&#160;</a></span>is_equivalent()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">bool mlx::core::Contiguous::is_equivalent </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>other</em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Equivalence check defaults to false unless overridden by the primitive. </p>
+
+<p>Reimplemented from <a class="el" href="classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd">mlx::core::Primitive</a>.</p>
+
+</div>
+</div>
+<a id="a1f9fcae7235e0ae9217825b78cb0f991" name="a1f9fcae7235e0ae9217825b78cb0f991"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1f9fcae7235e0ae9217825b78cb0f991">&#9670;&#160;</a></span>jvp()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; mlx::core::Contiguous::jvp </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>primals</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>tangents</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::vector&lt; int &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>argnums</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>The Jacobian-vector product. </p>
+
+<p>Reimplemented from <a class="el" href="classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2">mlx::core::Primitive</a>.</p>
+
+</div>
+</div>
+<a id="a1a53623d7c591ba6567ac1533fbc2b7c" name="a1a53623d7c591ba6567ac1533fbc2b7c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1a53623d7c591ba6567ac1533fbc2b7c">&#9670;&#160;</a></span>output_shapes()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">std::vector&lt; std::vector&lt; int &gt; &gt; mlx::core::Contiguous::output_shapes </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>inputs</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Get the output shapes of the primitive. </p>
+<p>This is not required to be implemented by derived classes, in which case it will throw. </p>
+
+<p>Reimplemented from <a class="el" href="classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853">mlx::core::Primitive</a>.</p>
+
+</div>
+</div>
+<a id="aca8a4ba9a58cc10f063e6b082fa2fc23" name="aca8a4ba9a58cc10f063e6b082fa2fc23"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aca8a4ba9a58cc10f063e6b082fa2fc23">&#9670;&#160;</a></span>print()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::Contiguous::print </td>
+          <td>(</td>
+          <td class="paramtype">std::ostream &amp;</td>          <td class="paramname"><span class="paramname"><em>os</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Print the primitive. </p>
+
+<p>Implements <a class="el" href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">mlx::core::Primitive</a>.</p>
+
+</div>
+</div>
+<a id="abf488f02057fd5852f38b2e8a600ad2a" name="abf488f02057fd5852f38b2e8a600ad2a"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abf488f02057fd5852f38b2e8a600ad2a">&#9670;&#160;</a></span>vjp()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; mlx::core::Contiguous::vjp </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>primals</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>cotangents</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::vector&lt; int &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>argnums</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>outputs</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>The vector-Jacobian product. </p>
+
+<p>Reimplemented from <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42">mlx::core::Primitive</a>.</p>
+
+</div>
+</div>
+<a id="a563221e90b15aa90bfae23d29c10e4ec" name="a563221e90b15aa90bfae23d29c10e4ec"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a563221e90b15aa90bfae23d29c10e4ec">&#9670;&#160;</a></span>vmap()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">virtual std::pair&lt; std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;, std::vector&lt; int &gt; &gt; mlx::core::Contiguous::vmap </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>inputs</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::vector&lt; int &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>axes</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+<p>The primitive must know how to vectorize itself across the given axes. </p>
+<p>The output is a pair containing the output arrays representing the vectorized computation and the axes which correspond to the vectorized dimensions of each output. </p>
+
+<p>Reimplemented from <a class="el" href="classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103">mlx::core::Primitive</a>.</p>
+
+</div>
+</div>
+<hr/>The documentation for this class was generated from the following file:<ul>
+<li>mlx/<a class="el" href="primitives_8h_source.html">primitives.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/classmlx_1_1core_1_1_contiguous.png b/docs/build/html/classmlx_1_1core_1_1_contiguous.png
new file mode 100644
index 000000000..13ba3febe
Binary files /dev/null and b/docs/build/html/classmlx_1_1core_1_1_contiguous.png differ
diff --git a/docs/build/html/classmlx_1_1core_1_1_primitive.html b/docs/build/html/classmlx_1_1core_1_1_primitive.html
index b59585a8d..0a127787f 100644
--- a/docs/build/html/classmlx_1_1core_1_1_primitive.html
+++ b/docs/build/html/classmlx_1_1core_1_1_primitive.html
@@ -380,7 +380,7 @@ Public Member Functions</h2></td></tr>
 
 <p>Equivalence check defaults to false unless overridden by the primitive. </p>
 
-<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e3a8fb38af">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">mlx::core::Exp</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99">mlx::core::LogicalNot</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3">mlx::core::Reshape</a>, <a class="el" 
href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4">mlx::core::Tan</a>, <a class="el" 
href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64">mlx::core::View</a>.</p>
+<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e3a8fb38af">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">mlx::core::ErfInv</a>, <a class="el" 
href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be">mlx::core::LogicalAnd</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">mlx::core::Remainder</a>, <a class="el" 
href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b">mlx::core::Subtract</a>, <a 
class="el" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64">mlx::core::View</a>.</p>
 
 </div>
 </div>
@@ -418,7 +418,7 @@ Public Member Functions</h2></td></tr>
 
 <p>The Jacobian-vector product. </p>
 
-<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205">mlx::core::Compiled</a>, <a class="el" 
href="classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af">mlx::core::Floor</a>, <a class="el" 
href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038">mlx::core::Minimum</a>, <a class="el" 
href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de">mlx::core::Sin</a>, <a class="el" 
href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a">mlx::core::Tanh</a>, and <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1">mlx::core::Transpose</a>.</p>
+<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205">mlx::core::Compiled</a>, <a class="el" 
href="classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6">mlx::core::FFT</a>, <a class="el" 
href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39">mlx::core::Maximum</a>, <a class="el" 
href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b">mlx::core::Sign</a>, <a class="el" 
href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a">mlx::core::Tanh</a>, and <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1">mlx::core::Transpose</a>.</p>
 
 </div>
 </div>
@@ -498,7 +498,7 @@ Public Member Functions</h2></td></tr>
 <p>Get the output shapes of the primitive. </p>
 <p>This is not required to be implemented by derived classes, in which case it will throw. </p>
 
-<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f">mlx::core::Conjugate</a>, <a class="el" 
href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f">mlx::core::LessEqual</a>, <a class="el" 
href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">mlx::core::Remainder</a>, <a class="el" 
href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37">mlx::core::Tan</a>, and <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325">mlx::core::Tanh</a>.</p>
+<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f">mlx::core::Conjugate</a>, <a class="el" 
href="classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65">mlx::core::Reduce</a>, <a class="el" 
href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37">mlx::core::Tan</a>, and <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325">mlx::core::Tanh</a>.</p>
 
 </div>
 </div>
@@ -527,7 +527,7 @@ Public Member Functions</h2></td></tr>
 
 <p>Print the primitive. </p>
 
-<p>Implemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84">mlx::core::Eigh</a>, <a class="el" 
href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950">mlx::core::LessEqual</a>, <a 
class="el" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60">mlx::core::Power</a>, <a class="el" 
href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b">mlx::core::SliceUpdate</a>, <a class="el" 
href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c">mlx::core::View</a>.</p>
+<p>Implemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">mlx::core::DivMod</a>, <a class="el" 
href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0">mlx::core::Partition</a>, <a class="el" 
href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504">mlx::core::Slice</a>, <a class="el" 
href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c">mlx::core::View</a>.</p>
 
 </div>
 </div>
@@ -597,7 +597,7 @@ Public Member Functions</h2></td></tr>
 
 <p>The vector-Jacobian product. </p>
 
-<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18">mlx::core::Broadcast</a>, <a class="el" 
href="classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8">mlx::core::Exp</a>, <a class="el" 
href="classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028">mlx::core::LessEqual</a>, <a class="el" 
href="classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe">mlx::core::Real</a>, <a class="el" 
href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263">mlx::core::Square</a>, <a class="el" 
href="classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95">mlx::core::Tanh</a>, and <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80">mlx::core::Transpose</a>.</p>
+<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18">mlx::core::Broadcast</a>, <a class="el" 
href="classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#abf488f02057fd5852f38b2e8a600ad2a">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189">mlx::core::ErfInv</a>, <a class="el" 
href="classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3">mlx::core::Sqrt</a>, <a class="el" 
href="classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95">mlx::core::Tanh</a>, and <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80">mlx::core::Transpose</a>.</p>
 
 </div>
 </div>
@@ -631,7 +631,7 @@ Public Member Functions</h2></td></tr>
 <p>The primitive must know how to vectorize itself across the given axes. </p>
 <p>The output is a pair containing the output arrays representing the vectorized computation and the axes which corresponds to the vectorized dimensions of each output. </p>
 
-<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4">mlx::core::Ceil</a>, <a class="el" 
href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">mlx::core::ErfInv</a>, <a 
class="el" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49">mlx::core::Log</a>, <a class="el" 
href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6">mlx::core::Real</a>, <a class="el" 
href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5">mlx::core::Square</a>, <a class="el" 
href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121">mlx::core::View</a>.</p>
+<p>Reimplemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4">mlx::core::Ceil</a>, <a class="el" 
href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">mlx::core::Erf</a>, <a 
class="el" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71">mlx::core::Log1p</a>, <a class="el" 
href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415">mlx::core::RandomBits</a>, <a class="el" 
href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e">mlx::core::Sqrt</a>, <a class="el" 
href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121">mlx::core::View</a>.</p>
 
 </div>
 </div>
diff --git a/docs/build/html/classmlx_1_1core_1_1_unary_primitive.html b/docs/build/html/classmlx_1_1core_1_1_unary_primitive.html
index 8335de9db..610953b50 100644
--- a/docs/build/html/classmlx_1_1core_1_1_unary_primitive.html
+++ b/docs/build/html/classmlx_1_1core_1_1_unary_primitive.html
@@ -126,73 +126,74 @@ Inheritance diagram for mlx::core::UnaryPrimitive:</div>
 <area href="classmlx_1_1core_1_1_cholesky.html" alt="mlx::core::Cholesky" shape="rect" coords="186,1232,362,1256"/>
 <area href="classmlx_1_1core_1_1_concatenate.html" alt="mlx::core::Concatenate" shape="rect" coords="186,1288,362,1312"/>
 <area href="classmlx_1_1core_1_1_conjugate.html" alt="mlx::core::Conjugate" shape="rect" coords="186,1344,362,1368"/>
-<area href="classmlx_1_1core_1_1_convolution.html" alt="mlx::core::Convolution" shape="rect" coords="186,1400,362,1424"/>
-<area href="classmlx_1_1core_1_1_copy.html" alt="mlx::core::Copy" shape="rect" coords="186,1456,362,1480"/>
-<area href="classmlx_1_1core_1_1_cos.html" alt="mlx::core::Cos" shape="rect" coords="186,1512,362,1536"/>
-<area href="classmlx_1_1core_1_1_cosh.html" alt="mlx::core::Cosh" shape="rect" coords="186,1568,362,1592"/>
-<area href="classmlx_1_1core_1_1_divide.html" alt="mlx::core::Divide" shape="rect" coords="186,1624,362,1648"/>
-<area href="classmlx_1_1core_1_1_equal.html" alt="mlx::core::Equal" shape="rect" coords="186,1680,362,1704"/>
-<area href="classmlx_1_1core_1_1_erf.html" alt="mlx::core::Erf" shape="rect" coords="186,1736,362,1760"/>
-<area href="classmlx_1_1core_1_1_erf_inv.html" alt="mlx::core::ErfInv" shape="rect" coords="186,1792,362,1816"/>
-<area href="classmlx_1_1core_1_1_exp.html" alt="mlx::core::Exp" shape="rect" coords="186,1848,362,1872"/>
-<area href="classmlx_1_1core_1_1_expm1.html" alt="mlx::core::Expm1" shape="rect" coords="186,1904,362,1928"/>
-<area href="classmlx_1_1core_1_1_f_f_t.html" alt="mlx::core::FFT" shape="rect" coords="186,1960,362,1984"/>
-<area href="classmlx_1_1core_1_1_floor.html" alt="mlx::core::Floor" shape="rect" coords="186,2016,362,2040"/>
-<area href="classmlx_1_1core_1_1_full.html" alt="mlx::core::Full" shape="rect" coords="186,2072,362,2096"/>
-<area href="classmlx_1_1core_1_1_gather.html" alt="mlx::core::Gather" shape="rect" coords="186,2128,362,2152"/>
-<area href="classmlx_1_1core_1_1_gather_m_m.html" alt="mlx::core::GatherMM" shape="rect" coords="186,2184,362,2208"/>
-<area href="classmlx_1_1core_1_1_gather_q_m_m.html" alt="mlx::core::GatherQMM" shape="rect" coords="186,2240,362,2264"/>
-<area href="classmlx_1_1core_1_1_greater.html" alt="mlx::core::Greater" shape="rect" coords="186,2296,362,2320"/>
-<area href="classmlx_1_1core_1_1_greater_equal.html" alt="mlx::core::GreaterEqual" shape="rect" coords="186,2352,362,2376"/>
-<area href="classmlx_1_1core_1_1_hadamard.html" alt="mlx::core::Hadamard" shape="rect" coords="186,2408,362,2432"/>
-<area href="classmlx_1_1core_1_1_imag.html" alt="mlx::core::Imag" shape="rect" coords="186,2464,362,2488"/>
-<area href="classmlx_1_1core_1_1_inverse.html" alt="mlx::core::Inverse" shape="rect" coords="186,2520,362,2544"/>
-<area href="classmlx_1_1core_1_1_less.html" alt="mlx::core::Less" shape="rect" coords="186,2576,362,2600"/>
-<area href="classmlx_1_1core_1_1_less_equal.html" alt="mlx::core::LessEqual" shape="rect" coords="186,2632,362,2656"/>
-<area href="classmlx_1_1core_1_1_load.html" alt="mlx::core::Load" shape="rect" coords="186,2688,362,2712"/>
-<area href="classmlx_1_1core_1_1_log.html" alt="mlx::core::Log" shape="rect" coords="186,2744,362,2768"/>
-<area href="classmlx_1_1core_1_1_log1p.html" alt="mlx::core::Log1p" shape="rect" coords="186,2800,362,2824"/>
-<area href="classmlx_1_1core_1_1_log_add_exp.html" alt="mlx::core::LogAddExp" shape="rect" coords="186,2856,362,2880"/>
-<area href="classmlx_1_1core_1_1_logical_and.html" alt="mlx::core::LogicalAnd" shape="rect" coords="186,2912,362,2936"/>
-<area href="classmlx_1_1core_1_1_logical_not.html" alt="mlx::core::LogicalNot" shape="rect" coords="186,2968,362,2992"/>
-<area href="classmlx_1_1core_1_1_logical_or.html" alt="mlx::core::LogicalOr" shape="rect" coords="186,3024,362,3048"/>
-<area href="classmlx_1_1core_1_1_matmul.html" alt="mlx::core::Matmul" shape="rect" coords="186,3080,362,3104"/>
-<area href="classmlx_1_1core_1_1_maximum.html" alt="mlx::core::Maximum" shape="rect" coords="186,3136,362,3160"/>
-<area href="classmlx_1_1core_1_1_minimum.html" alt="mlx::core::Minimum" shape="rect" coords="186,3192,362,3216"/>
-<area href="classmlx_1_1core_1_1_multiply.html" alt="mlx::core::Multiply" shape="rect" coords="186,3248,362,3272"/>
-<area href="classmlx_1_1core_1_1_negative.html" alt="mlx::core::Negative" shape="rect" coords="186,3304,362,3328"/>
-<area href="classmlx_1_1core_1_1_not_equal.html" alt="mlx::core::NotEqual" shape="rect" coords="186,3360,362,3384"/>
-<area href="classmlx_1_1core_1_1_number_of_elements.html" alt="mlx::core::NumberOfElements" shape="rect" coords="186,3416,362,3440"/>
-<area href="classmlx_1_1core_1_1_pad.html" alt="mlx::core::Pad" shape="rect" coords="186,3472,362,3496"/>
-<area href="classmlx_1_1core_1_1_partition.html" alt="mlx::core::Partition" shape="rect" coords="186,3528,362,3552"/>
-<area href="classmlx_1_1core_1_1_power.html" alt="mlx::core::Power" shape="rect" coords="186,3584,362,3608"/>
-<area href="classmlx_1_1core_1_1_quantized_matmul.html" alt="mlx::core::QuantizedMatmul" shape="rect" coords="186,3640,362,3664"/>
-<area href="classmlx_1_1core_1_1_random_bits.html" alt="mlx::core::RandomBits" shape="rect" coords="186,3696,362,3720"/>
-<area href="classmlx_1_1core_1_1_real.html" alt="mlx::core::Real" shape="rect" coords="186,3752,362,3776"/>
-<area href="classmlx_1_1core_1_1_reduce.html" alt="mlx::core::Reduce" shape="rect" coords="186,3808,362,3832"/>
-<area href="classmlx_1_1core_1_1_remainder.html" alt="mlx::core::Remainder" shape="rect" coords="186,3864,362,3888"/>
-<area href="classmlx_1_1core_1_1_reshape.html" alt="mlx::core::Reshape" shape="rect" coords="186,3920,362,3944"/>
-<area href="classmlx_1_1core_1_1_round.html" alt="mlx::core::Round" shape="rect" coords="186,3976,362,4000"/>
-<area href="classmlx_1_1core_1_1_scan.html" alt="mlx::core::Scan" shape="rect" coords="186,4032,362,4056"/>
-<area href="classmlx_1_1core_1_1_scatter.html" alt="mlx::core::Scatter" shape="rect" coords="186,4088,362,4112"/>
-<area href="classmlx_1_1core_1_1_select.html" alt="mlx::core::Select" shape="rect" coords="186,4144,362,4168"/>
-<area href="classmlx_1_1core_1_1_sigmoid.html" alt="mlx::core::Sigmoid" shape="rect" coords="186,4200,362,4224"/>
-<area href="classmlx_1_1core_1_1_sign.html" alt="mlx::core::Sign" shape="rect" coords="186,4256,362,4280"/>
-<area href="classmlx_1_1core_1_1_sin.html" alt="mlx::core::Sin" shape="rect" coords="186,4312,362,4336"/>
-<area href="classmlx_1_1core_1_1_sinh.html" alt="mlx::core::Sinh" shape="rect" coords="186,4368,362,4392"/>
-<area href="classmlx_1_1core_1_1_slice.html" alt="mlx::core::Slice" shape="rect" coords="186,4424,362,4448"/>
-<area href="classmlx_1_1core_1_1_slice_update.html" alt="mlx::core::SliceUpdate" shape="rect" coords="186,4480,362,4504"/>
-<area href="classmlx_1_1core_1_1_softmax.html" alt="mlx::core::Softmax" shape="rect" coords="186,4536,362,4560"/>
-<area href="classmlx_1_1core_1_1_sort.html" alt="mlx::core::Sort" shape="rect" coords="186,4592,362,4616"/>
-<area href="classmlx_1_1core_1_1_sqrt.html" alt="mlx::core::Sqrt" shape="rect" coords="186,4648,362,4672"/>
-<area href="classmlx_1_1core_1_1_square.html" alt="mlx::core::Square" shape="rect" coords="186,4704,362,4728"/>
-<area href="classmlx_1_1core_1_1_stop_gradient.html" alt="mlx::core::StopGradient" shape="rect" coords="186,4760,362,4784"/>
-<area href="classmlx_1_1core_1_1_subtract.html" alt="mlx::core::Subtract" shape="rect" coords="186,4816,362,4840"/>
-<area href="classmlx_1_1core_1_1_tan.html" alt="mlx::core::Tan" shape="rect" coords="186,4872,362,4896"/>
-<area href="classmlx_1_1core_1_1_tanh.html" alt="mlx::core::Tanh" shape="rect" coords="186,4928,362,4952"/>
-<area href="classmlx_1_1core_1_1_transpose.html" alt="mlx::core::Transpose" shape="rect" coords="186,4984,362,5008"/>
-<area href="classmlx_1_1core_1_1_uniform.html" alt="mlx::core::Uniform" shape="rect" coords="186,5040,362,5064"/>
-<area href="classmlx_1_1core_1_1_view.html" alt="mlx::core::View" shape="rect" coords="186,5096,362,5120"/>
+<area href="classmlx_1_1core_1_1_contiguous.html" alt="mlx::core::Contiguous" shape="rect" coords="186,1400,362,1424"/>
+<area href="classmlx_1_1core_1_1_convolution.html" alt="mlx::core::Convolution" shape="rect" coords="186,1456,362,1480"/>
+<area href="classmlx_1_1core_1_1_copy.html" alt="mlx::core::Copy" shape="rect" coords="186,1512,362,1536"/>
+<area href="classmlx_1_1core_1_1_cos.html" alt="mlx::core::Cos" shape="rect" coords="186,1568,362,1592"/>
+<area href="classmlx_1_1core_1_1_cosh.html" alt="mlx::core::Cosh" shape="rect" coords="186,1624,362,1648"/>
+<area href="classmlx_1_1core_1_1_divide.html" alt="mlx::core::Divide" shape="rect" coords="186,1680,362,1704"/>
+<area href="classmlx_1_1core_1_1_equal.html" alt="mlx::core::Equal" shape="rect" coords="186,1736,362,1760"/>
+<area href="classmlx_1_1core_1_1_erf.html" alt="mlx::core::Erf" shape="rect" coords="186,1792,362,1816"/>
+<area href="classmlx_1_1core_1_1_erf_inv.html" alt="mlx::core::ErfInv" shape="rect" coords="186,1848,362,1872"/>
+<area href="classmlx_1_1core_1_1_exp.html" alt="mlx::core::Exp" shape="rect" coords="186,1904,362,1928"/>
+<area href="classmlx_1_1core_1_1_expm1.html" alt="mlx::core::Expm1" shape="rect" coords="186,1960,362,1984"/>
+<area href="classmlx_1_1core_1_1_f_f_t.html" alt="mlx::core::FFT" shape="rect" coords="186,2016,362,2040"/>
+<area href="classmlx_1_1core_1_1_floor.html" alt="mlx::core::Floor" shape="rect" coords="186,2072,362,2096"/>
+<area href="classmlx_1_1core_1_1_full.html" alt="mlx::core::Full" shape="rect" coords="186,2128,362,2152"/>
+<area href="classmlx_1_1core_1_1_gather.html" alt="mlx::core::Gather" shape="rect" coords="186,2184,362,2208"/>
+<area href="classmlx_1_1core_1_1_gather_m_m.html" alt="mlx::core::GatherMM" shape="rect" coords="186,2240,362,2264"/>
+<area href="classmlx_1_1core_1_1_gather_q_m_m.html" alt="mlx::core::GatherQMM" shape="rect" coords="186,2296,362,2320"/>
+<area href="classmlx_1_1core_1_1_greater.html" alt="mlx::core::Greater" shape="rect" coords="186,2352,362,2376"/>
+<area href="classmlx_1_1core_1_1_greater_equal.html" alt="mlx::core::GreaterEqual" shape="rect" coords="186,2408,362,2432"/>
+<area href="classmlx_1_1core_1_1_hadamard.html" alt="mlx::core::Hadamard" shape="rect" coords="186,2464,362,2488"/>
+<area href="classmlx_1_1core_1_1_imag.html" alt="mlx::core::Imag" shape="rect" coords="186,2520,362,2544"/>
+<area href="classmlx_1_1core_1_1_inverse.html" alt="mlx::core::Inverse" shape="rect" coords="186,2576,362,2600"/>
+<area href="classmlx_1_1core_1_1_less.html" alt="mlx::core::Less" shape="rect" coords="186,2632,362,2656"/>
+<area href="classmlx_1_1core_1_1_less_equal.html" alt="mlx::core::LessEqual" shape="rect" coords="186,2688,362,2712"/>
+<area href="classmlx_1_1core_1_1_load.html" alt="mlx::core::Load" shape="rect" coords="186,2744,362,2768"/>
+<area href="classmlx_1_1core_1_1_log.html" alt="mlx::core::Log" shape="rect" coords="186,2800,362,2824"/>
+<area href="classmlx_1_1core_1_1_log1p.html" alt="mlx::core::Log1p" shape="rect" coords="186,2856,362,2880"/>
+<area href="classmlx_1_1core_1_1_log_add_exp.html" alt="mlx::core::LogAddExp" shape="rect" coords="186,2912,362,2936"/>
+<area href="classmlx_1_1core_1_1_logical_and.html" alt="mlx::core::LogicalAnd" shape="rect" coords="186,2968,362,2992"/>
+<area href="classmlx_1_1core_1_1_logical_not.html" alt="mlx::core::LogicalNot" shape="rect" coords="186,3024,362,3048"/>
+<area href="classmlx_1_1core_1_1_logical_or.html" alt="mlx::core::LogicalOr" shape="rect" coords="186,3080,362,3104"/>
+<area href="classmlx_1_1core_1_1_matmul.html" alt="mlx::core::Matmul" shape="rect" coords="186,3136,362,3160"/>
+<area href="classmlx_1_1core_1_1_maximum.html" alt="mlx::core::Maximum" shape="rect" coords="186,3192,362,3216"/>
+<area href="classmlx_1_1core_1_1_minimum.html" alt="mlx::core::Minimum" shape="rect" coords="186,3248,362,3272"/>
+<area href="classmlx_1_1core_1_1_multiply.html" alt="mlx::core::Multiply" shape="rect" coords="186,3304,362,3328"/>
+<area href="classmlx_1_1core_1_1_negative.html" alt="mlx::core::Negative" shape="rect" coords="186,3360,362,3384"/>
+<area href="classmlx_1_1core_1_1_not_equal.html" alt="mlx::core::NotEqual" shape="rect" coords="186,3416,362,3440"/>
+<area href="classmlx_1_1core_1_1_number_of_elements.html" alt="mlx::core::NumberOfElements" shape="rect" coords="186,3472,362,3496"/>
+<area href="classmlx_1_1core_1_1_pad.html" alt="mlx::core::Pad" shape="rect" coords="186,3528,362,3552"/>
+<area href="classmlx_1_1core_1_1_partition.html" alt="mlx::core::Partition" shape="rect" coords="186,3584,362,3608"/>
+<area href="classmlx_1_1core_1_1_power.html" alt="mlx::core::Power" shape="rect" coords="186,3640,362,3664"/>
+<area href="classmlx_1_1core_1_1_quantized_matmul.html" alt="mlx::core::QuantizedMatmul" shape="rect" coords="186,3696,362,3720"/>
+<area href="classmlx_1_1core_1_1_random_bits.html" alt="mlx::core::RandomBits" shape="rect" coords="186,3752,362,3776"/>
+<area href="classmlx_1_1core_1_1_real.html" alt="mlx::core::Real" shape="rect" coords="186,3808,362,3832"/>
+<area href="classmlx_1_1core_1_1_reduce.html" alt="mlx::core::Reduce" shape="rect" coords="186,3864,362,3888"/>
+<area href="classmlx_1_1core_1_1_remainder.html" alt="mlx::core::Remainder" shape="rect" coords="186,3920,362,3944"/>
+<area href="classmlx_1_1core_1_1_reshape.html" alt="mlx::core::Reshape" shape="rect" coords="186,3976,362,4000"/>
+<area href="classmlx_1_1core_1_1_round.html" alt="mlx::core::Round" shape="rect" coords="186,4032,362,4056"/>
+<area href="classmlx_1_1core_1_1_scan.html" alt="mlx::core::Scan" shape="rect" coords="186,4088,362,4112"/>
+<area href="classmlx_1_1core_1_1_scatter.html" alt="mlx::core::Scatter" shape="rect" coords="186,4144,362,4168"/>
+<area href="classmlx_1_1core_1_1_select.html" alt="mlx::core::Select" shape="rect" coords="186,4200,362,4224"/>
+<area href="classmlx_1_1core_1_1_sigmoid.html" alt="mlx::core::Sigmoid" shape="rect" coords="186,4256,362,4280"/>
+<area href="classmlx_1_1core_1_1_sign.html" alt="mlx::core::Sign" shape="rect" coords="186,4312,362,4336"/>
+<area href="classmlx_1_1core_1_1_sin.html" alt="mlx::core::Sin" shape="rect" coords="186,4368,362,4392"/>
+<area href="classmlx_1_1core_1_1_sinh.html" alt="mlx::core::Sinh" shape="rect" coords="186,4424,362,4448"/>
+<area href="classmlx_1_1core_1_1_slice.html" alt="mlx::core::Slice" shape="rect" coords="186,4480,362,4504"/>
+<area href="classmlx_1_1core_1_1_slice_update.html" alt="mlx::core::SliceUpdate" shape="rect" coords="186,4536,362,4560"/>
+<area href="classmlx_1_1core_1_1_softmax.html" alt="mlx::core::Softmax" shape="rect" coords="186,4592,362,4616"/>
+<area href="classmlx_1_1core_1_1_sort.html" alt="mlx::core::Sort" shape="rect" coords="186,4648,362,4672"/>
+<area href="classmlx_1_1core_1_1_sqrt.html" alt="mlx::core::Sqrt" shape="rect" coords="186,4704,362,4728"/>
+<area href="classmlx_1_1core_1_1_square.html" alt="mlx::core::Square" shape="rect" coords="186,4760,362,4784"/>
+<area href="classmlx_1_1core_1_1_stop_gradient.html" alt="mlx::core::StopGradient" shape="rect" coords="186,4816,362,4840"/>
+<area href="classmlx_1_1core_1_1_subtract.html" alt="mlx::core::Subtract" shape="rect" coords="186,4872,362,4896"/>
+<area href="classmlx_1_1core_1_1_tan.html" alt="mlx::core::Tan" shape="rect" coords="186,4928,362,4952"/>
+<area href="classmlx_1_1core_1_1_tanh.html" alt="mlx::core::Tanh" shape="rect" coords="186,4984,362,5008"/>
+<area href="classmlx_1_1core_1_1_transpose.html" alt="mlx::core::Transpose" shape="rect" coords="186,5040,362,5064"/>
+<area href="classmlx_1_1core_1_1_uniform.html" alt="mlx::core::Uniform" shape="rect" coords="186,5096,362,5120"/>
+<area href="classmlx_1_1core_1_1_view.html" alt="mlx::core::View" shape="rect" coords="186,5152,362,5176"/>
   </map>
 </div></div>
 <table class="memberdecls">
@@ -389,7 +390,7 @@ Public Member Functions</h2></td></tr>
 </table>
 </div><div class="memdoc">
 
-<p>Implemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">mlx::core::Floor</a>, 
<a class="el" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">mlx::core::LogicalNot</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">mlx::core::Reshape</a>, <a class="el" 
href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">mlx::core::Tanh</a>, <a class="el" 
href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">mlx::core::View</a>.</p>
+<p>Implemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">mlx::core::Expm1</a>, <a class="el" 
href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">mlx::core::LogAddExp</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">mlx::core::Reduce</a>, <a class="el" 
href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">mlx::core::Subtract</a>, <a 
class="el" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">mlx::core::View</a>.</p>
 
 </div>
 </div>
@@ -454,7 +455,7 @@ Public Member Functions</h2></td></tr>
 </table>
 </div><div class="memdoc">
 
-<p>Implemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">mlx::core::Floor</a>, 
<a class="el" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">mlx::core::LogicalNot</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">mlx::core::Reshape</a>, <a class="el" 
href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">mlx::core::Tanh</a>, <a class="el" 
href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">mlx::core::View</a>.</p>
+<p>Implemented in <a class="el" href="classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">mlx::core::Expm1</a>, <a class="el" 
href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">mlx::core::LogAddExp</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">mlx::core::Reduce</a>, <a class="el" 
href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">mlx::core::Subtract</a>, <a 
class="el" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">mlx::core::Uniform</a>, and <a class="el" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">mlx::core::View</a>.</p>
 
 </div>
 </div>
diff --git a/docs/build/html/classmlx_1_1core_1_1_unary_primitive.png b/docs/build/html/classmlx_1_1core_1_1_unary_primitive.png
index fa6e8dc0f..d36a3f474 100644
Binary files a/docs/build/html/classmlx_1_1core_1_1_unary_primitive.png and b/docs/build/html/classmlx_1_1core_1_1_unary_primitive.png differ
diff --git a/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize-members.html b/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize-members.html
index 11e1dd754..adeb73e45 100644
--- a/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize-members.html
+++ b/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize-members.html
@@ -98,7 +98,7 @@ $(function(){ initResizable(false); });
   <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a4186fea23f7156c38960426821fca313">Custom</a>(Stream stream, std::function&lt; std::vector&lt; array &gt;(std::vector&lt; array &gt;)&gt; fallback)</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html">mlx::core::fast::Custom</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">explicit</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587">DEFINE_PRINT</a>(AffineQuantize)</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">mlx::core::fast::AffineQuantize</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a8ae61e3289c4134232a69295268f8261">device</a>()</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">eval_cpu</a>(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">mlx::core::fast::AffineQuantize</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">eval_cpu</a>(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">mlx::core::fast::AffineQuantize</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">eval_gpu</a>(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">mlx::core::fast::AffineQuantize</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd">is_equivalent</a>(const Primitive &amp;other) const</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">jvp</a>(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;tangents, const std::vector&lt; int &gt; &amp;argnums) override</td><td class="entry"><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html">mlx::core::fast::Custom</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
diff --git a/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize.html b/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize.html
index 940fe29c3..ca6128e8e 100644
--- a/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize.html
+++ b/docs/build/html/classmlx_1_1core_1_1fast_1_1_affine_quantize.html
@@ -244,7 +244,7 @@ Public Member Functions</h2></td></tr>
       </table>
   </td>
   <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
+<span class="mlabels"><span class="mlabel">override</span><span class="mlabel">virtual</span></span>  </td>
   </tr>
 </table>
 </div><div class="memdoc">
diff --git a/docs/build/html/compiled_8h.html b/docs/build/html/compiled_8h.html
index 0f20b7922..5727d9696 100644
--- a/docs/build/html/compiled_8h.html
+++ b/docs/build/html/compiled_8h.html
@@ -132,7 +132,7 @@ Functions</h2></td></tr>
 <tr class="separator:a985c60929757190e0b4ec51f57c767d0"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a3b900ab319948c5a01a3ecd30a709027" id="r_a3b900ab319948c5a01a3ecd30a709027"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a3b900ab319948c5a01a3ecd30a709027">mlx::core::compiled_check_contiguity</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, const std::vector&lt; int &gt; &amp;shape)</td></tr>
 <tr class="separator:a3b900ab319948c5a01a3ecd30a709027"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab8c3c4fc05745f586de922c8266f4fce" id="r_ab8c3c4fc05745f586de922c8266f4fce"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#ab8c3c4fc05745f586de922c8266f4fce">mlx::core::compiled_allocate_outputs</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs, const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs_, const std::unordered_set&lt; uintptr_t &gt; &amp;constant_ids_, bool contiguous, bool move_buffers=false)</td></tr>
+<tr class="memitem:ab8c3c4fc05745f586de922c8266f4fce" id="r_ab8c3c4fc05745f586de922c8266f4fce"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#ab8c3c4fc05745f586de922c8266f4fce">mlx::core::compiled_allocate_outputs</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs, const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs_, const std::unordered_set&lt; uintptr_t &gt; &amp;constant_ids_, bool <a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a>, bool move_buffers=false)</td></tr>
 <tr class="separator:ab8c3c4fc05745f586de922c8266f4fce"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 </div><!-- contents -->
diff --git a/docs/build/html/compiled_8h_source.html b/docs/build/html/compiled_8h_source.html
index 020b90fd1..ba238defa 100644
--- a/docs/build/html/compiled_8h_source.html
+++ b/docs/build/html/compiled_8h_source.html
@@ -167,20 +167,21 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    std::vector&lt;array&gt;&amp; outputs,</div>
 <div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>    <span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs_,</div>
 <div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>    <span class="keyword">const</span> std::unordered_set&lt;uintptr_t&gt;&amp; constant_ids_,</div>
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    <span class="keywordtype">bool</span> contiguous,</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    <span class="keywordtype">bool</span> <a class="code hl_function" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a>,</div>
 <div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    <span class="keywordtype">bool</span> move_buffers = <span class="keyword">false</span>);</div>
 <div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span> </div>
 <div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_as_type_html"><div class="ttname"><a href="classmlx_1_1core_1_1_as_type.html">mlx::core::AsType</a></div><div class="ttdef"><b>Definition</b> primitives.h:418</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_broadcast_html"><div class="ttname"><a href="classmlx_1_1core_1_1_broadcast.html">mlx::core::Broadcast</a></div><div class="ttdef"><b>Definition</b> primitives.h:528</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_copy_html"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html">mlx::core::Copy</a></div><div class="ttdef"><b>Definition</b> primitives.h:683</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_copy_html"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html">mlx::core::Copy</a></div><div class="ttdef"><b>Definition</b> primitives.h:702</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></div><div class="ttdef"><b>Definition</b> primitives.h:48</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html">mlx::core::StopGradient</a></div><div class="ttdef"><b>Definition</b> primitives.h:2015</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html">mlx::core::StopGradient</a></div><div class="ttdef"><b>Definition</b> primitives.h:2034</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html_a53006e77d13d9d88b525ef577748939f"><div class="ttname"><a href="classmlx_1_1core_1_1array.html#a53006e77d13d9d88b525ef577748939f">mlx::core::array::ndim</a></div><div class="ttdeci">size_t ndim() const</div><div class="ttdoc">The number of dimensions of the array.</div><div class="ttdef"><b>Definition</b> array.h:94</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html_a90c5afddc2fa3028c0f8099bd64c8a99"><div class="ttname"><a href="classmlx_1_1core_1_1array.html#a90c5afddc2fa3028c0f8099bd64c8a99">mlx::core::array::item</a></div><div class="ttdeci">T item()</div><div class="ttdoc">Get the value from a scalar array.</div><div class="ttdef"><b>Definition</b> array.h:535</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html_ae29e7d6fbfbea1e5e321a8d1ea3cfacd"><div class="ttname"><a href="classmlx_1_1core_1_1array.html#ae29e7d6fbfbea1e5e321a8d1ea3cfacd">mlx::core::array::dtype</a></div><div class="ttdeci">Dtype dtype() const</div><div class="ttdoc">Get the arrays data type.</div><div class="ttdef"><b>Definition</b> array.h:127</div></div>
+<div class="ttc" id="agroup__ops_html_ga8ab10aa6c41416d739791164a52b25d5"><div class="ttname"><a href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">mlx::core::contiguous</a></div><div class="ttdeci">array contiguous(const array &amp;a, bool allow_col_major=false, StreamOrDevice s={})</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a2b78f270942c6eb185e8045f1c5b4286"><div class="ttname"><a href="namespacemlx_1_1core.html#a2b78f270942c6eb185e8045f1c5b4286">mlx::core::print_complex_constant</a></div><div class="ttdeci">void print_complex_constant(std::ostream &amp;os, const array &amp;x)</div><div class="ttdef"><b>Definition</b> compiled.h:40</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a3b900ab319948c5a01a3ecd30a709027"><div class="ttname"><a href="namespacemlx_1_1core.html#a3b900ab319948c5a01a3ecd30a709027">mlx::core::compiled_check_contiguity</a></div><div class="ttdeci">bool compiled_check_contiguity(const std::vector&lt; array &gt; &amp;inputs, const std::vector&lt; int &gt; &amp;shape)</div></div>
diff --git a/docs/build/html/conv_8h.html b/docs/build/html/conv_8h.html
index f673ac747..f35a8c3e4 100644
--- a/docs/build/html/conv_8h.html
+++ b/docs/build/html/conv_8h.html
@@ -95,7 +95,7 @@ $(function(){ initResizable(false); });
 <code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2utils_8h_source.html">mlx/backend/metal/kernels/steel/utils.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="conv_2loader_8h_source.html">mlx/backend/metal/kernels/steel/conv/loader.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="conv_2params_8h_source.html">mlx/backend/metal/kernels/steel/conv/params.h</a>&quot;</code><br />
-<code>#include &quot;<a class="el" href="mma_8h_source.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="gemm_2mma_8h_source.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</code><br />
 </div>
 <p><a href="conv_8h_source.html">Go to the source code of this file.</a></p>
 </div><!-- contents -->
diff --git a/docs/build/html/conv_8h_source.html b/docs/build/html/conv_8h_source.html
index df66e3310..17bfc4bcb 100644
--- a/docs/build/html/conv_8h_source.html
+++ b/docs/build/html/conv_8h_source.html
@@ -100,16 +100,16 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span> </div>
 <div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="preprocessor">#include &quot;<a class="code" href="conv_2loader_8h.html">mlx/backend/metal/kernels/steel/conv/loader.h</a>&quot;</span></div>
 <div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="preprocessor">#include &quot;<a class="code" href="conv_2params_8h.html">mlx/backend/metal/kernels/steel/conv/params.h</a>&quot;</span></div>
-<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="preprocessor">#include &quot;<a class="code" href="mma_8h.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</span></div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="preprocessor">#include &quot;<a class="code" href="gemm_2mma_8h.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</span></div>
 <div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span> </div>
 <div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><span class="keyword">using namespace </span><a class="code hl_namespace" href="namespacemetal.html">metal</a>;</div>
 <div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="keyword">using namespace </span><a class="code hl_namespace" href="namespacemlx_1_1steel.html">mlx::steel</a>;</div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html">utils.h</a></div></div>
 <div class="ttc" id="aconv_2loader_8h_html"><div class="ttname"><a href="conv_2loader_8h.html">loader.h</a></div></div>
 <div class="ttc" id="aconv_2params_8h_html"><div class="ttname"><a href="conv_2params_8h.html">params.h</a></div></div>
-<div class="ttc" id="amma_8h_html"><div class="ttname"><a href="mma_8h.html">mma.h</a></div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> loader_channel_l.h:14</div></div>
+<div class="ttc" id="agemm_2mma_8h_html"><div class="ttname"><a href="gemm_2mma_8h.html">mma.h</a></div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> attn.h:19</div></div>
 <div class="ttc" id="asteel_2defines_8h_html"><div class="ttname"><a href="steel_2defines_8h.html">defines.h</a></div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/cpp/ops.html b/docs/build/html/cpp/ops.html
index f8e1ac230..71ad84825 100644
--- a/docs/build/html/cpp/ops.html
+++ b/docs/build/html/cpp/ops.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Operations &#8212; MLX 0.20.0 documentation</title>
+    <title>Operations &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.utils.tree_reduce" href="../python/_autosummary/mlx.utils.tree_reduce.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -1157,6 +1158,7 @@
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">roll()</span></code></a></li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv44realRK5array14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">real()</span></code></a></li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv44imagRK5array14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">imag()</span></code></a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv410contiguousRK5arrayb14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">contiguous()</span></code></a></li>
 </ul>
             </nav>
         </div>
@@ -2966,6 +2968,11 @@
 <span id="_CPPv34imagRK5array14StreamOrDevice"></span><span id="_CPPv24imagRK5array14StreamOrDevice"></span><span id="imag__arrayCR.StreamOrDevice"></span><span class="target" id="group__ops_1ga7ff592a64d528f0cf4f3d098465da029"></span><span class="n"><span class="pre">array</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">imag</span></span></span><span class="sig-paren">(</span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="n"><span class="pre">array</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">a</span></span>, <span class="n"><span class="pre">StreamOrDevice</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">s</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="p"><span class="pre">{</span></span><span class="p"><span class="pre">}</span></span><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv44imagRK5array14StreamOrDevice" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv410contiguousRK5arrayb14StreamOrDevice">
+<span id="_CPPv310contiguousRK5arrayb14StreamOrDevice"></span><span id="_CPPv210contiguousRK5arrayb14StreamOrDevice"></span><span id="contiguous__arrayCR.b.StreamOrDevice"></span><span class="target" id="group__ops_1ga8ab10aa6c41416d739791164a52b25d5"></span><span class="n"><span class="pre">array</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">contiguous</span></span></span><span class="sig-paren">(</span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="n"><span class="pre">array</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">a</span></span>, <span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">allow_col_major</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="k"><span class="pre">false</span></span>, <span class="n"><span class="pre">StreamOrDevice</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">s</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="p"><span class="pre">{</span></span><span class="p"><span class="pre">}</span></span><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv410contiguousRK5arrayb14StreamOrDevice" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 </section>
 
 
@@ -3322,6 +3329,7 @@
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">roll()</span></code></a></li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv44realRK5array14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">real()</span></code></a></li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv44imagRK5array14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">imag()</span></code></a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv410contiguousRK5arrayb14StreamOrDevice"><code class="docutils literal notranslate"><span class="pre">contiguous()</span></code></a></li>
 </ul>
   </nav></div>
 
diff --git a/docs/build/html/dev/custom_metal_kernels.html b/docs/build/html/dev/custom_metal_kernels.html
index 41ee05ff9..c342bdf55 100644
--- a/docs/build/html/dev/custom_metal_kernels.html
+++ b/docs/build/html/dev/custom_metal_kernels.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Custom Metal Kernels &#8212; MLX 0.20.0 documentation</title>
+    <title>Custom Metal Kernels &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -50,7 +50,7 @@
     <link rel="prev" title="Metal Debugger" href="metal_debugger.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -129,8 +129,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -443,7 +443,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -520,6 +519,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -549,6 +549,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/dev/extensions.html b/docs/build/html/dev/extensions.html
index 61d2c14e9..0268bea8b 100644
--- a/docs/build/html/dev/extensions.html
+++ b/docs/build/html/dev/extensions.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Custom Extensions in MLX &#8212; MLX 0.20.0 documentation</title>
+    <title>Custom Extensions in MLX &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Operations" href="../cpp/ops.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -1342,7 +1343,7 @@ below.</p>
 
 <span class="w">    </span><span class="c1">// Prepare to encode kernel</span>
 <span class="w">    </span><span class="k">auto</span><span class="o">&amp;</span><span class="w"> </span><span class="n">compute_encoder</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">d</span><span class="p">.</span><span class="n">get_command_encoder</span><span class="p">(</span><span class="n">s</span><span class="p">.</span><span class="n">index</span><span class="p">);</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="o">-&gt;</span><span class="n">setComputePipelineState</span><span class="p">(</span><span class="n">kernel</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_compute_pipeline_state</span><span class="p">(</span><span class="n">kernel</span><span class="p">);</span>
 
 <span class="w">    </span><span class="c1">// Kernel parameters are registered with buffer indices corresponding to</span>
 <span class="w">    </span><span class="c1">// those in the kernel declaration at axpby.metal</span>
@@ -1357,14 +1358,14 @@ below.</p>
 <span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_output_array</span><span class="p">(</span><span class="n">out</span><span class="p">,</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span>
 
 <span class="w">    </span><span class="c1">// Encode alpha and beta</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="o">-&gt;</span><span class="n">setBytes</span><span class="p">(</span><span class="o">&amp;</span><span class="n">alpha_</span><span class="p">,</span><span class="w"> </span><span class="k">sizeof</span><span class="p">(</span><span class="kt">float</span><span class="p">),</span><span class="w"> </span><span class="mi">3</span><span class="p">);</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="o">-&gt;</span><span class="n">setBytes</span><span class="p">(</span><span class="o">&amp;</span><span class="n">beta_</span><span class="p">,</span><span class="w"> </span><span class="k">sizeof</span><span class="p">(</span><span class="kt">float</span><span class="p">),</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_bytes</span><span class="p">(</span><span class="n">alpha_</span><span class="p">,</span><span class="w"> </span><span class="mi">3</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_bytes</span><span class="p">(</span><span class="n">beta_</span><span class="p">,</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span>
 
 <span class="w">    </span><span class="c1">// Encode shape, strides and ndim</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="o">-&gt;</span><span class="n">setBytes</span><span class="p">(</span><span class="n">x</span><span class="p">.</span><span class="n">shape</span><span class="p">().</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">ndim</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="k">sizeof</span><span class="p">(</span><span class="kt">int</span><span class="p">),</span><span class="w"> </span><span class="mi">5</span><span class="p">);</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="o">-&gt;</span><span class="n">setBytes</span><span class="p">(</span><span class="n">x</span><span class="p">.</span><span class="n">strides</span><span class="p">().</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">ndim</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="k">sizeof</span><span class="p">(</span><span class="kt">size_t</span><span class="p">),</span><span class="w"> </span><span class="mi">6</span><span class="p">);</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="o">-&gt;</span><span class="n">setBytes</span><span class="p">(</span><span class="n">y</span><span class="p">.</span><span class="n">strides</span><span class="p">().</span><span class="n">data</span><span class="p">(),</span><span class="w"> </span><span class="n">ndim</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="k">sizeof</span><span class="p">(</span><span class="kt">size_t</span><span class="p">),</span><span class="w"> </span><span class="mi">7</span><span class="p">);</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="o">-&gt;</span><span class="n">setBytes</span><span class="p">(</span><span class="o">&amp;</span><span class="n">ndim</span><span class="p">,</span><span class="w"> </span><span class="k">sizeof</span><span class="p">(</span><span class="kt">int</span><span class="p">),</span><span class="w"> </span><span class="mi">8</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_vector_bytes</span><span class="p">(</span><span class="n">x</span><span class="p">.</span><span class="n">shape</span><span class="p">(),</span><span class="w"> </span><span class="mi">5</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_vector_bytes</span><span class="p">(</span><span class="n">x</span><span class="p">.</span><span class="n">strides</span><span class="p">(),</span><span class="w"> </span><span class="mi">6</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_vector_bytes</span><span class="p">(</span><span class="n">y</span><span class="p">.</span><span class="n">strides</span><span class="p">(),</span><span class="w"> </span><span class="mi">7</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">set_bytes</span><span class="p">(</span><span class="n">ndim</span><span class="p">,</span><span class="w"> </span><span class="mi">8</span><span class="p">);</span>
 
 <span class="w">    </span><span class="c1">// We launch 1 thread for each input and make sure that the number of</span>
 <span class="w">    </span><span class="c1">// threads in any given threadgroup is not higher than the max allowed</span>
@@ -1378,7 +1379,7 @@ below.</p>
 
 <span class="w">    </span><span class="c1">// Launch the grid with the given number of threads divided among</span>
 <span class="w">    </span><span class="c1">// the given threadgroups</span>
-<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">dispatchThreads</span><span class="p">(</span><span class="n">grid_dims</span><span class="p">,</span><span class="w"> </span><span class="n">group_dims</span><span class="p">);</span>
+<span class="w">    </span><span class="n">compute_encoder</span><span class="p">.</span><span class="n">dispatch_threads</span><span class="p">(</span><span class="n">grid_dims</span><span class="p">,</span><span class="w"> </span><span class="n">group_dims</span><span class="p">);</span>
 <span class="p">}</span>
 </pre></div>
 </div>
diff --git a/docs/build/html/dev/metal_debugger.html b/docs/build/html/dev/metal_debugger.html
index 31fe3dfbe..d7ef311ba 100644
--- a/docs/build/html/dev/metal_debugger.html
+++ b/docs/build/html/dev/metal_debugger.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Metal Debugger &#8212; MLX 0.20.0 documentation</title>
+    <title>Metal Debugger &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Custom Extensions in MLX" href="extensions.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/scaled__dot__product__attention__params_8h.html b/docs/build/html/dir_5aea41cce495e77a0857a0aecf063e33.html
similarity index 81%
rename from docs/build/html/scaled__dot__product__attention__params_8h.html
rename to docs/build/html/dir_5aea41cce495e77a0857a0aecf063e33.html
index c2695c4c0..2001241f8 100644
--- a/docs/build/html/scaled__dot__product__attention__params_8h.html
+++ b/docs/build/html/dir_5aea41cce495e77a0857a0aecf063e33.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=11"/>
 <meta name="generator" content="Doxygen 1.12.0"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: mlx/backend/metal/kernels/scaled_dot_product_attention_params.h File Reference</title>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/kernels Directory Reference</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -83,24 +83,18 @@ $(function(){ initResizable(false); });
 
 <div id="nav-path" class="navpath">
   <ul>
-<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li>  </ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li><li class="navelem"><a class="el" href="dir_5aea41cce495e77a0857a0aecf063e33.html">kernels</a></li>  </ul>
 </div>
 </div><!-- top -->
 <div id="doc-content">
 <div class="header">
-  <div class="summary">
-<a href="#nested-classes">Classes</a>  </div>
-  <div class="headertitle"><div class="title">scaled_dot_product_attention_params.h File Reference</div></div>
+  <div class="headertitle"><div class="title">kernels Directory Reference</div></div>
 </div><!--header-->
 <div class="contents">
-
-<p><a href="scaled__dot__product__attention__params_8h_source.html">Go to the source code of this file.</a></p>
 <table class="memberdecls">
-<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
-Classes</h2></td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td></tr>
-<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></td></tr>
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="files" name="files"></a>
+Files</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="steel__attention_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="steel__attention_8h.html">steel_attention.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 </div><!-- contents -->
diff --git a/docs/build/html/dir_6768c99e6145fb9510ccdb40db8ede25.html b/docs/build/html/dir_6768c99e6145fb9510ccdb40db8ede25.html
index 2c23d6630..4bc85b3b1 100644
--- a/docs/build/html/dir_6768c99e6145fb9510ccdb40db8ede25.html
+++ b/docs/build/html/dir_6768c99e6145fb9510ccdb40db8ede25.html
@@ -103,7 +103,7 @@ Files</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="gemm_2loader_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_2loader_8h.html">loader.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="mma_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="mma_8h.html">mma.h</a></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="gemm_2mma_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_2mma_8h.html">mma.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="gemm_2params_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="gemm_2params_8h.html">params.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
diff --git a/docs/build/html/dir_70a37effa88bcbd6b791977fa1e64356.html b/docs/build/html/dir_70a37effa88bcbd6b791977fa1e64356.html
index 85eb93c5e..e88fa7636 100644
--- a/docs/build/html/dir_70a37effa88bcbd6b791977fa1e64356.html
+++ b/docs/build/html/dir_70a37effa88bcbd6b791977fa1e64356.html
@@ -96,6 +96,12 @@ $(function(){ initResizable(false); });
 Directories</h2></td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_8c751ccfa9f494753d976761a9d60a84.html">fft</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_fb5e52e7ad5a84a63db2993d12f7610c.html">jit</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_d36f9e79442ec4bd53287b83bdefe7e5.html">metal_3_0</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_83367edb60e23ad59b1a493d8c883287.html">metal_3_1</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_f60cd69d27fd3faa641c79056fff0e2d.html">reduction</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></td></tr>
@@ -107,8 +113,6 @@ Files</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="atomic_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="atomic_8h.html">atomic.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="backend_2metal_2kernels_2bf16_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html">bf16.h</a></td></tr>
-<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="bf16__math_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="bf16__math_8h.html">bf16_math.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="metal_2kernels_2binary_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="metal_2kernels_2binary_8h.html">binary.h</a></td></tr>
@@ -143,8 +147,6 @@ Files</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="reduce__utils_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="reduce__utils_8h.html">reduce_utils.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="scaled__dot__product__attention__params_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="scaled__dot__product__attention__params_8h.html">scaled_dot_product_attention_params.h</a></td></tr>
-<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="scan_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="scan_8h.html">scan.h</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="scatter_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="scatter_8h.html">scatter.h</a></td></tr>
diff --git a/docs/build/html/dir_76215a6c54e2b67053e723fc2395583c.html b/docs/build/html/dir_76215a6c54e2b67053e723fc2395583c.html
index a10d315e1..374387cf7 100644
--- a/docs/build/html/dir_76215a6c54e2b67053e723fc2395583c.html
+++ b/docs/build/html/dir_76215a6c54e2b67053e723fc2395583c.html
@@ -94,6 +94,8 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="subdirs" name="subdirs"></a>
 Directories</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_df9494e83ef22ae6150a0e080d9709ed.html">conv</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_6768c99e6145fb9510ccdb40db8ede25.html">gemm</a></td></tr>
diff --git a/docs/build/html/dir_83367edb60e23ad59b1a493d8c883287.html b/docs/build/html/dir_83367edb60e23ad59b1a493d8c883287.html
new file mode 100644
index 000000000..7383c68c1
--- /dev/null
+++ b/docs/build/html/dir_83367edb60e23ad59b1a493d8c883287.html
@@ -0,0 +1,107 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/metal_3_1 Directory Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_83367edb60e23ad59b1a493d8c883287.html">metal_3_1</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">metal_3_1 Directory Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="files" name="files"></a>
+Files</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html">bf16.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/dir_d36f9e79442ec4bd53287b83bdefe7e5.html b/docs/build/html/dir_d36f9e79442ec4bd53287b83bdefe7e5.html
new file mode 100644
index 000000000..bd8458eac
--- /dev/null
+++ b/docs/build/html/dir_d36f9e79442ec4bd53287b83bdefe7e5.html
@@ -0,0 +1,107 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/metal_3_0 Directory Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_d36f9e79442ec4bd53287b83bdefe7e5.html">metal_3_0</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">metal_3_0 Directory Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="files" name="files"></a>
+Files</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html">bf16.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/dir_e1756c7634b0c14aead026895ad71c6d.html b/docs/build/html/dir_e1756c7634b0c14aead026895ad71c6d.html
new file mode 100644
index 000000000..904ba2fa9
--- /dev/null
+++ b/docs/build/html/dir_e1756c7634b0c14aead026895ad71c6d.html
@@ -0,0 +1,120 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn Directory Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">attn Directory Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="subdirs" name="subdirs"></a>
+Directories</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><span class="iconfclosed"></span>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="dir_5aea41cce495e77a0857a0aecf063e33.html">kernels</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="files" name="files"></a>
+Files</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="attn_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="attn_8h.html">attn.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="attn_2loader_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="attn_2loader_8h.html">loader.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="attn_2mma_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="attn_2mma_8h.html">mma.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="attn_2params_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="attn_2params_8h.html">params.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html">transforms.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/dir_fb5e52e7ad5a84a63db2993d12f7610c.html b/docs/build/html/dir_fb5e52e7ad5a84a63db2993d12f7610c.html
new file mode 100644
index 000000000..000f91aaa
--- /dev/null
+++ b/docs/build/html/dir_fb5e52e7ad5a84a63db2993d12f7610c.html
@@ -0,0 +1,107 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/jit Directory Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_fb5e52e7ad5a84a63db2993d12f7610c.html">jit</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">jit Directory Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="files" name="files"></a>
+Files</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top"><a href="backend_2metal_2kernels_2jit_2bf16_8h_source.html"><span class="icondoc"></span></a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html">bf16.h</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/doxygen_crawl.html b/docs/build/html/doxygen_crawl.html
index 0236914c1..a4d1775a8 100644
--- a/docs/build/html/doxygen_crawl.html
+++ b/docs/build/html/doxygen_crawl.html
@@ -53,22 +53,25 @@
 <a href="reduce__col_8h_source.html"/>
 <a href="reduce__init_8h_source.html"/>
 <a href="reduce__row_8h_source.html"/>
-<a href="scaled__dot__product__attention__params_8h_source.html"/>
 <a href="scan_8h_source.html"/>
 <a href="scatter_8h_source.html"/>
 <a href="sdpa__vector_8h_source.html"/>
 <a href="jit_2softmax_8h_source.html"/>
 <a href="kernels_2softmax_8h_source.html"/>
 <a href="sort_8h_source.html"/>
+<a href="attn_8h_source.html"/>
+<a href="steel__attention_8h_source.html"/>
 <a href="conv_8h_source.html"/>
 <a href="jit_2steel__conv_8h_source.html"/>
 <a href="kernels_2steel_2conv_2kernels_2steel__conv_8h_source.html"/>
 <a href="steel__conv__general_8h_source.html"/>
+<a href="attn_2loader_8h_source.html"/>
 <a href="conv_2loader_8h_source.html"/>
 <a href="gemm_2loader_8h_source.html"/>
 <a href="loader__channel__l_8h_source.html"/>
 <a href="loader__channel__n_8h_source.html"/>
 <a href="loader__general_8h_source.html"/>
+<a href="attn_2params_8h_source.html"/>
 <a href="conv_2params_8h_source.html"/>
 <a href="gemm_2params_8h_source.html"/>
 <a href="defines_8h_source.html"/>
@@ -77,7 +80,9 @@
 <a href="steel__gemm__fused_8h_source.html"/>
 <a href="steel__gemm__masked_8h_source.html"/>
 <a href="steel__gemm__splitk_8h_source.html"/>
-<a href="mma_8h_source.html"/>
+<a href="attn_2mma_8h_source.html"/>
+<a href="gemm_2mma_8h_source.html"/>
+<a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html"/>
 <a href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html"/>
 <a href="transforms_8h_source.html"/>
 <a href="integral__constant_8h_source.html"/>
@@ -127,7 +132,9 @@
 <a href="scheduler_8h_source.html"/>
 <a href="stream_8h_source.html"/>
 <a href="transforms__impl_8h_source.html"/>
-<a href="backend_2metal_2kernels_2bf16_8h_source.html"/>
+<a href="backend_2metal_2kernels_2jit_2bf16_8h_source.html"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html"/>
+<a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html"/>
 <a href="types_2bf16_8h_source.html"/>
 <a href="backend_2metal_2kernels_2complex_8h_source.html"/>
 <a href="types_2complex_8h_source.html"/>
@@ -184,22 +191,25 @@
 <a href="reduce__col_8h.html"/>
 <a href="reduce__init_8h.html"/>
 <a href="reduce__row_8h.html"/>
-<a href="scaled__dot__product__attention__params_8h.html"/>
 <a href="scan_8h.html"/>
 <a href="scatter_8h.html"/>
 <a href="sdpa__vector_8h.html"/>
 <a href="jit_2softmax_8h.html"/>
 <a href="kernels_2softmax_8h.html"/>
 <a href="sort_8h.html"/>
+<a href="attn_8h.html"/>
+<a href="steel__attention_8h.html"/>
 <a href="conv_8h.html"/>
 <a href="jit_2steel__conv_8h.html"/>
 <a href="kernels_2steel_2conv_2kernels_2steel__conv_8h.html"/>
 <a href="steel__conv__general_8h.html"/>
+<a href="attn_2loader_8h.html"/>
 <a href="conv_2loader_8h.html"/>
 <a href="gemm_2loader_8h.html"/>
 <a href="loader__channel__l_8h.html"/>
 <a href="loader__channel__n_8h.html"/>
 <a href="loader__general_8h.html"/>
+<a href="attn_2params_8h.html"/>
 <a href="conv_2params_8h.html"/>
 <a href="gemm_2params_8h.html"/>
 <a href="defines_8h.html"/>
@@ -208,7 +218,9 @@
 <a href="steel__gemm__fused_8h.html"/>
 <a href="steel__gemm__masked_8h.html"/>
 <a href="steel__gemm__splitk_8h.html"/>
-<a href="mma_8h.html"/>
+<a href="attn_2mma_8h.html"/>
+<a href="gemm_2mma_8h.html"/>
+<a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html"/>
 <a href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html"/>
 <a href="transforms_8h.html"/>
 <a href="integral__constant_8h.html"/>
@@ -258,7 +270,9 @@
 <a href="scheduler_8h.html"/>
 <a href="stream_8h.html"/>
 <a href="transforms__impl_8h.html"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html"/>
+<a href="backend_2metal_2kernels_2jit_2bf16_8h.html"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html"/>
+<a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html"/>
 <a href="types_2bf16_8h.html"/>
 <a href="backend_2metal_2kernels_2complex_8h.html"/>
 <a href="types_2complex_8h.html"/>
@@ -327,6 +341,8 @@
 <a href="struct_divide-members.html"/>
 <a href="struct_div_mod.html"/>
 <a href="struct_div_mod-members.html"/>
+<a href="struct_div_op.html"/>
+<a href="struct_div_op-members.html"/>
 <a href="struct_equal.html"/>
 <a href="struct_equal-members.html"/>
 <a href="struct_erf.html"/>
@@ -337,6 +353,8 @@
 <a href="struct_exp-members.html"/>
 <a href="struct_expm1.html"/>
 <a href="struct_expm1-members.html"/>
+<a href="struct_exp_sub_op.html"/>
+<a href="struct_exp_sub_op-members.html"/>
 <a href="struct_floor.html"/>
 <a href="struct_floor-members.html"/>
 <a href="struct_floor_divide.html"/>
@@ -409,16 +427,18 @@
 <a href="struct_logical_not-members.html"/>
 <a href="struct_logical_or.html"/>
 <a href="struct_logical_or-members.html"/>
-<a href="structlooped__elem__to__loc.html"/>
-<a href="structlooped__elem__to__loc-members.html"/>
-<a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html"/>
-<a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4-members.html"/>
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html"/>
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4-members.html"/>
+<a href="struct_looped_elem_to_loc.html"/>
+<a href="struct_looped_elem_to_loc-members.html"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4-members.html"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4-members.html"/>
 <a href="struct_max.html"/>
 <a href="struct_max-members.html"/>
 <a href="struct_maximum.html"/>
 <a href="struct_maximum-members.html"/>
+<a href="struct_max_op.html"/>
+<a href="struct_max_op-members.html"/>
 <a href="struct_min.html"/>
 <a href="struct_minimum.html"/>
 <a href="struct_minimum-members.html"/>
@@ -428,10 +448,8 @@
 <a href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4-members.html"/>
 <a href="struct_m_l_x_conv_params.html"/>
 <a href="struct_m_l_x_conv_params-members.html"/>
-<a href="struct_m_l_x_fast_attention_params.html"/>
-<a href="struct_m_l_x_fast_attention_params-members.html"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params.html"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params-members.html"/>
+<a href="struct_mul_op.html"/>
+<a href="struct_mul_op-members.html"/>
 <a href="struct_multiply.html"/>
 <a href="struct_multiply-members.html"/>
 <a href="struct_na_n_equal.html"/>
@@ -476,9 +494,13 @@
 <a href="struct_sqrt-members.html"/>
 <a href="struct_square.html"/>
 <a href="struct_square-members.html"/>
+<a href="struct_sub_op.html"/>
+<a href="struct_sub_op-members.html"/>
 <a href="struct_subtract.html"/>
 <a href="struct_subtract-members.html"/>
 <a href="struct_sum.html"/>
+<a href="struct_sum_op.html"/>
+<a href="struct_sum_op-members.html"/>
 <a href="struct_tan.html"/>
 <a href="struct_tan-members.html"/>
 <a href="struct_tanh.html"/>
@@ -487,6 +509,8 @@
 <a href="class_thread_pool-members.html"/>
 <a href="struct_thread_sort.html"/>
 <a href="struct_thread_sort-members.html"/>
+<a href="struct_transform_scale.html"/>
+<a href="struct_transform_scale-members.html"/>
 <a href="namespacemetal.html"/>
 <a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html"/>
 <a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4-members.html"/>
@@ -571,6 +595,8 @@
 <a href="classmlx_1_1core_1_1_concatenate-members.html"/>
 <a href="classmlx_1_1core_1_1_conjugate.html"/>
 <a href="classmlx_1_1core_1_1_conjugate-members.html"/>
+<a href="classmlx_1_1core_1_1_contiguous.html"/>
+<a href="classmlx_1_1core_1_1_contiguous-members.html"/>
 <a href="structmlx_1_1core_1_1_contiguous_iterator.html"/>
 <a href="structmlx_1_1core_1_1_contiguous_iterator-members.html"/>
 <a href="classmlx_1_1core_1_1_convolution.html"/>
@@ -893,6 +919,7 @@
 <a href="classmlx_1_1core_1_1distributed_1_1_send.html"/>
 <a href="classmlx_1_1core_1_1distributed_1_1_send-members.html"/>
 <a href="namespacemlx_1_1core_1_1distributed_1_1detail.html"/>
+<a href="namespacemlx_1_1core_1_1env.html"/>
 <a href="namespacemlx_1_1core_1_1fast.html"/>
 <a href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html"/>
 <a href="classmlx_1_1core_1_1fast_1_1_affine_quantize-members.html"/>
@@ -951,6 +978,8 @@
 <a href="namespacemlx_1_1steel.html"/>
 <a href="structmlx_1_1steel_1_1_accum_helper.html"/>
 <a href="structmlx_1_1steel_1_1_accum_helper-members.html"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html"/>
+<a href="structmlx_1_1steel_1_1_attn_params-members.html"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag.html"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4-members.html"/>
@@ -958,6 +987,8 @@
 <a href="structmlx_1_1steel_1_1_block_loader-members.html"/>
 <a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html"/>
 <a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector-members.html"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t-members.html"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a-members.html"/>
 <a href="structmlx_1_1steel_1_1_block_swizzle.html"/>
@@ -990,6 +1021,8 @@
 <a href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general-members.html"/>
 <a href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html"/>
 <a href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels-members.html"/>
+<a href="structmlx_1_1steel_1_1_c_shape.html"/>
+<a href="structmlx_1_1steel_1_1_c_shape-members.html"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params-members.html"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html"/>
@@ -1006,9 +1039,13 @@
 <a href="structmlx_1_1steel_1_1is__integral-members.html"/>
 <a href="structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html"/>
 <a href="structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4-members.html"/>
+<a href="structmlx_1_1steel_1_1_layout2_d.html"/>
+<a href="structmlx_1_1steel_1_1_layout2_d-members.html"/>
 <a href="structmlx_1_1steel_1_1_loop_alignment.html"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile-members.html"/>
+<a href="structmlx_1_1steel_1_1_shape2_d.html"/>
+<a href="structmlx_1_1steel_1_1_shape2_d-members.html"/>
 <a href="structmlx_1_1steel_1_1_transform_add.html"/>
 <a href="structmlx_1_1steel_1_1_transform_add-members.html"/>
 <a href="structmlx_1_1steel_1_1_transform_axpby.html"/>
@@ -1081,6 +1118,7 @@
 <a href="classpocketfft_1_1detail_1_1threading_1_1thread__pool-members.html"/>
 <a href="dir_ad00dcd1517bfdbe01f68ec9b4eff877.html"/>
 <a href="dir_86b95e7b1d0d6e25466bb9213752d32f.html"/>
+<a href="dir_e1756c7634b0c14aead026895ad71c6d.html"/>
 <a href="dir_1d446c9bd3c99228254c9484e0bc5c06.html"/>
 <a href="dir_f149b24a1b5be11cd70151abe517e3f8.html"/>
 <a href="dir_df9494e83ef22ae6150a0e080d9709ed.html"/>
@@ -1089,11 +1127,15 @@
 <a href="dir_6768c99e6145fb9510ccdb40db8ede25.html"/>
 <a href="dir_2193406f5b2eae6fc53753d8a9a80df3.html"/>
 <a href="dir_4336740ec0075891704443b417fef6cb.html"/>
+<a href="dir_fb5e52e7ad5a84a63db2993d12f7610c.html"/>
 <a href="dir_70a37effa88bcbd6b791977fa1e64356.html"/>
+<a href="dir_5aea41cce495e77a0857a0aecf063e33.html"/>
 <a href="dir_6379e541ea5051a09bc0e3fdd92fcd3b.html"/>
 <a href="dir_9c555e3d0f5b8c3fb3a7397c81fd5bf9.html"/>
 <a href="dir_ba4426224ef60f409462a2a12fa18f06.html"/>
 <a href="dir_d0c977ea65824390717cdb7efc36c157.html"/>
+<a href="dir_d36f9e79442ec4bd53287b83bdefe7e5.html"/>
+<a href="dir_83367edb60e23ad59b1a493d8c883287.html"/>
 <a href="dir_938ab0ecf10b8b860ff766c820f665fd.html"/>
 <a href="dir_f60cd69d27fd3faa641c79056fff0e2d.html"/>
 <a href="dir_76215a6c54e2b67053e723fc2395583c.html"/>
@@ -1265,6 +1307,7 @@
 <a href="functions_type.html"/>
 <a href="functions_type.html"/>
 <a href="functions_type.html"/>
+<a href="functions_type.html"/>
 <a href="functions_enum.html"/>
 <a href="functions_enum.html"/>
 <a href="functions_enum.html"/>
@@ -1298,6 +1341,7 @@
 <a href="globals_g.html"/>
 <a href="globals_h.html"/>
 <a href="globals_i.html"/>
+<a href="globals_j.html"/>
 <a href="globals_l.html"/>
 <a href="globals_m.html"/>
 <a href="globals_n.html"/>
@@ -1389,6 +1433,11 @@
 <a href="atomic_8h.html#ac480f2b459a8ad9095cee353e152d00c"/>
 <a href="atomic_8h.html#ad7f32327ff66354cfa2f0cfdac79316f"/>
 <a href="atomic_8h.html#adfdbea60436f14f1af9ce36e2a0a77a3"/>
+<a href="attn_2loader_8h.html"/>
+<a href="attn_2mma_8h.html"/>
+<a href="attn_2mma_8h.html#ad583e6038efc119542410f43b603d4ad"/>
+<a href="attn_2params_8h.html"/>
+<a href="attn_8h.html"/>
 <a href="backend_2accelerate_2utils_8h.html"/>
 <a href="backend_2accelerate_2utils_8h.html#a7a4193f37b1de9c33c31d1da09c77edb"/>
 <a href="backend_2common_2load_8h.html"/>
@@ -1404,8 +1453,10 @@
 <a href="backend_2common_2utils_8h.html#a3ba20a804c306067b7023259429e0e48"/>
 <a href="backend_2common_2utils_8h.html#a4ee50bfb240512d0c0ce151dfe2c74ef"/>
 <a href="backend_2common_2utils_8h.html#a77657cb50fd9392f7f4c64e43843c2b3"/>
+<a href="backend_2common_2utils_8h.html#a830a47d8a317dffb0c88e5a7afe6aee2"/>
 <a href="backend_2common_2utils_8h.html#a90e2b6edc0fe82230cb93f5ea39febb4"/>
 <a href="backend_2common_2utils_8h.html#aab3cc7f3808934ae0727b920eba231bd"/>
+<a href="backend_2common_2utils_8h.html#aae1e770954edf1f9a35d19e0de4d857a"/>
 <a href="backend_2common_2utils_8h.html#ac813412cce77fc1340dcfefc6e099276"/>
 <a href="backend_2common_2utils_8h.html#ad7e4f40eb351b554bbfabb6d7d600d06"/>
 <a href="backend_2common_2utils_8h.html#af2895f9b0083efd8221275eb8cadccbe"/>
@@ -1419,357 +1470,6 @@
 <a href="backend_2metal_2device_8h.html#a5fd6ba2040e53a254b9d71ae7ebd315f"/>
 <a href="backend_2metal_2device_8h.html#a616e09a1ef321d527770721cef264c54"/>
 <a href="backend_2metal_2device_8h.html#a910797b74824e6ee576fbb533dee8b57"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a006763fae6e0577fc168ec9446f0f747"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a007f58508b98bb79e5c323ed0dec89b6"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0197e039d4c65bf49649a6f250c2d436"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a05a4f197a71d0f16879032f44492bb79"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0736a76f56578d26ba1422dc8b744a18"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a08b6071245513e1726ec68e3b63edc53"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a08c1f916302eb9d48c93f8b7260538fe"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a08c7d12a0d16565fbf052dba2db8b22d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a08d2460e259b9106d90d889481ad60d5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a08e778be18e4a291c108fcc528b981d3"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a09c1a797eb7f43742578680899932f50"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0a5bfe15d95ba540795f4c25ebfa4f07"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0aa3bfcfab53700488e5f386e6de60d5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0aa9ffe056f49fda181bbacbd60556ea"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0b8736e2ae24758b6e24ea72668df5b4"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0e2c2c2cb50b3a55ff213f18978aca35"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0e4377b120d6305335d296e031ee5b30"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0f360806708b95a3be400af0b8871b57"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0f486bf02c6ad5b9b6a96d3450f03e47"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a0f7fd418408806ef498745c6fdb2c062"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a104cf94cb9e359d1b6ef92ced2ce0c27"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a12a47e8ac0be788edff57ae0a96d7830"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a12a98d71d670b409b8065e0d61672d55"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a13b3338935440ae51ecc4a356093efc5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a1457da931c28fa4e2500daa4e6441e8b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a14b56c687053ee2432398a25663c068f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a152366ab4e2ccc867e919af6c74ced91"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a15573fefd880adefbba079b1c1bd8082"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a168300bbd04d8e97c5e4218cb14ae378"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a16978f4b16d954ef4d4cf0f32f6c0b94"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a17f47ec9cff60f8e1b3477a2793b7ac0"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a18963246f2b640874bef6dca7049f64d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a190e27077f0fba642a86f5c8f488bcc2"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a194a6670cc25ade35a24b566f31af785"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a1a788f82212afad30e4c2ee40f1c313c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a1b029e4ca72125a5f9471f582c819705"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a1dac193d9f1c8c0eb4473441895f8c58"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a1f4e90909ac1c7280f4c7d1977c55fb7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a204d13a881ae8d337f6efbb98673790c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2179abbc91ce8763e96e39e1917bfa6e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a24381d991c2d570aa953694f396a69b5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2486f3b5de85b0d57f458d8f21f82b42"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a24b1fa8998c892f90f8dde7c34fb10a5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a251780ac4592cc2b1a543e417ff57770"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a25e7c5d2ecf3375756d59074f333858f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2721c088adfc9d73cde442d6badd2a6c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a279d09ab8542f1c1a8dc8173b65946b6"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2807fa6862b0f9689c81199b1e695ed8"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2826bd301bb5393473ccd363f2052c0d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a284dfc702f0f67b9c233b87162eeabdd"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a28d297705e29009197418546ef435393"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a28f8d21c5eef047c701cf690ce9c2ef0"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2b9de9624c0a507b4ead85f898ad9daf"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2bbdcece13148826d3fe33af727bb79b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2c3c5f793b3d957d7295d7f1faabebee"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2d37130b6fd79b425f5ba92b65e36bed"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a2f6286d222e2176bcbdc824c5d598100"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a303dfcc81ffd355f866f863d7d9f0fa5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3165e37d393be50c2cfa9ddcba153684"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a31a3d8f2ff8038f7e0d717845c039808"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a31bbdbe0b62b90a4d6ea4bb0a7db586b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a323a80492cd17a49e2c3dd18f8c8b5cc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a33ea086b561c652f25833a5e1ded34dd"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a347c9bbf816bad2e9e5e91aa448f8b65"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a359edd4bcb8776861ceb26a3005624c0"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3602117b4c61d5cd4fd72fb8e5f68bd6"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3796dcf819adb1ef8152f57ba63ff6b1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3816a35f8468156d59c239256c12dcf3"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a383165ea838cc3feeee4d9cf54aa77cc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a38bb89f925eca4f9c042f6ee7a2c0193"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3936148781ab1c4f33f58d12c116f370"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3a0a3edbf1ba2314551454059c3f422b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3ae2091ada1e39e857fbc53c97bdb79f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3c62ac679d6aa515144d40ebafe4a188"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3d0d689516c99003659c5d026847bd2e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3eefe9a7f5fb226335ea687012f32d5c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a3ff4ff59f411010ac8502cfabda4bd6f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4126fb7ed5bbb27a2332c543cf56a337"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4154f90ab7857ca856f9e15fe1bf5acf"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a41d55d167e9dc63bf29d15e0ff004869"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a42bead8ef0beb9f3452128d64cd4df9d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4358ee606e66ba2081fcf94f9c3b5915"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a435a2aec4c777b4b184ff5d24992e8a1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a435f2f4256aadb1b57fd62bb7f733cf7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a43a225e7e548bb041f3a5d844faaf0da"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a44522c2304c6396bbe6b9d32000f4b6f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4611728172afea51860a77fdb06cafa0"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a467a88531150a4d9d30fce07c49c126e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4720cc79ab2b8e39952ea9ef20e51250"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a491dadfae957cd7cc0c36188d910f6f6"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a495ae2d9be5d97c4c6448fc4e50a03e1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4a0023e2fd08875156cd6ef747fbb5cd"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4ae4a80fde67eea9a0a37b2803946544"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4bd92db6c8b9b5dc96332c7ae3eff8c7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4ca11d43174baf0a729f93b35eabcbea"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a4dd3cf0e5aa116ff330352a50c18cde7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a501cc01d5bf15d9f03aa28545f9624ea"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a513501355a5912a1263fd8b10864142b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a523eda93c809733368e2b45382d2add6"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5263b2463fecdc97f9521d00bffea059"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a542affc376726840647a6e93acf2c1a7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a54ae7216b82c5cea362f6b83e1df3a9b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a551b970f73bb4a3b287653021d000b60"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a55600f3b9859e2891e0e0b5690867b72"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a584a513596de20663dad951a5b81695e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5868c85c988ec3432cf86d7df40e464d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a588ef0f7e03f306758524d378278976f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a59515695ebc48844345fa5120511aed1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5a0cb8544b4ebd2906ba8e7f2868e8de"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5a4b98a0a11db5b77cf9168df37c8bc7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5a81eae168dfafd299c2b94e3e8558cf"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5bd875a54b79b2dcedf674807c3e53c5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5be23e296bbed3a885586a6424b1666e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5cc74ad3e522d7104e6e2117751151ad"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5d00eb2ec2b0e15b2753d100694c45ae"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5d628a5bc4fa755610392f47a523a1f1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a5f997839cf49c24ab594a0dff486a7bc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a625dcb133f1f953f263e6200399866c6"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a6278bd2e0e2805090b33ef666bf7f6bb"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a62a512d0edd894759c69f724b970fbdb"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a62f891b7dbba0000749cf338f594bedb"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a64f1136b17006f168ef837e17240814f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a64f6787a96386246f83a8981d274150e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a65f30a2dc199134e35bc7c5d431b2263"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a68125e66f74eaffe5ea9267638ce870d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a696978d9401e09200045b2d8aad045c2"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a69984aaa05ae1d4fccccf7f57e8ecb4a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a6b9e49ad9ea256d2d0220c0d81552602"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a6baa722c22d66c7510786bb275cb8cc2"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7003e1e5881e3d106257f22b6a3e59fe"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a711693988c437c2fb4d7da505982fe21"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a715c824ee8c87e0256114a85624d9949"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7167343d90eb70e5a0d5fa9ec5398e94"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7172d84db640e6c49dff0d08dd64b53e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a726cecf778b8584b6f7c37db1b064576"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a72d10ec0e62949247da129eb3a83fb9b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a73416a7415f3fe31525e33419e5e8aab"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a738078eb7d5ff94ff48156a555d763a5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a744f72ba83522fe3cc2a49a007b42543"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a74751abec7086f85f4f26ced44f1ca1f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a74e477567c9477c2cf0684f81ef4498f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7595740d4cc12924905d6bd1b99ee4da"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a76f5bd895b7214cbc3cea3440992718a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a77bab4481b41be50297b257e95058706"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a77c678665b34df7652dcde053ca73185"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7816a97d16b1d2f8a90227bb1da2f6ac"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7ad7ff44a3200853711869f7a577d931"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7b134429ea0c8493800ff8b465410f9c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7b3bce3f6f17089d87e13e91f580a581"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7bc91aaaf476a37063264d1d53d862cc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7c56980c234a04260b8b19298085e526"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7c790442f77f2437b482c4a55e224fc3"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7cd44d27fa9a4f13df39894c34fdb348"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7dbf0c75df4817cb4ef8b60c417a89d0"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7e1a6056f9c96f3c89fe204dbf103be5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7e1bcf3bc06cbcbc304c0cdf729802bc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7eac96f64ca42991caf819c8e8c8d2bc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7f1b84352a3ed6171444a43da1fc7e92"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a7f601b22ecc480132d82ad782e5363bf"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a80d288f22cadfdf5e904410349e616a1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a8110fae7bcc34a0de5927546b24aa935"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a83320ba983d90dd1fa5847b6940dc0bb"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a852689073c17596de4fb545bc046b380"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a86b2a001cbec0d3a8d762a3c7ff47b0b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a87ab4b7a502430da664ccb8abd383058"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a8859b5b8dc241e4f58243c85d2630cc8"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a88c11cd37600de5480570da3d2ae5732"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a891aa4bf46c20a26a55061736aba25f1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a8ad16afd7f1711de83c0cec5af868f76"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a8b6c3fd9d068a2159084359df8b9b449"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a8c8ac6736440fdca366ebdefe2a12b9f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a8cd55d1a579540eb450e12a8a8a950be"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a8f06316063fc91747533105f256b55b5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a90a1c5130db515db48624d8587edbb91"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a912393b7208fa45bd1e87f30b218b68b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a917354f77eac26189da8a2f610a00074"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a92d1348f201d78fcd474f75d5b23ef68"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a94686039356dfa9aa45608a8b0562fdc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a948579a4d9ba276523190b03b09578fb"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a967a1d7b5664f616e5b6f2d257367f0c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a979374b1dd4e0eaf602326fa901336d1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a998b1ba877a606aedf722ab46b290403"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9a837c3b9c4e42f53d7cd1ed0d266e2f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9b31c363ebc93d592b6fa0e27b00335a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9d06cceea5c179bcc608452188bd7d6a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9e21c5ea9dd724dc2ca8c54ad908f09c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9f835a0a80c411580c97b65fdc5bdfd3"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#a9ff5ab3aef1057fa083b53a65c8aba03"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa04cfcb52191fd23205a1a3572b46ae0"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa251d6483d3b099d1b5311fbe6f0bce2"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa3277ae33976c70f7bd937ddff027b72"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa332fae098e7c6dc23b98bc0026f1070"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa415ce182fe7582d885fe633fc3527ce"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa504a474ab6e00ebe2b1b7ed2f7d1ffb"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa5fa1a8f2b39c3508fe38205469756d1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa6b99cde403405df1865c989e4ce845a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa705d87cf4b78e9d7c6b07dd0c66cac6"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa708a970a200822c99c0489f389469fa"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa7198e580e2a83c1fd01a4b6fdf86a80"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa846fde89c7d2d18b18ef180a8a9c8a3"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa8d9f01582a0a9f01a666d110c74db2a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aa95f9ebfdab3c5f524775651362ce914"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aaa66dc6d7b2c5efbfaa97ca9c7872bd8"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aaab79d0b4c9e9bdc059ace6ec58c5b00"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aab02c65bc38ea66335b2192ead4095a8"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aab74ec4d33a64b92b908717d500f1ecf"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aacaedf12f862c76457133336dd6fc446"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab02f8646b47806e1d2038f248df03f06"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab070ea4676d10a10ff3e9379a4068a57"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab092d9790ef20fc0386707530aee89db"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab155f418f15cabd86ff942c6f9472ddb"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab1de7e7e7304ff3598925d2e69134764"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab27b26182c7c6e08af37e6d511fd9253"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab3b594321fb42b0c2da99954d1e0976c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab43932322f81bf322aa1b0deeee9a987"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab4e9ad547aa23daa351075e0ecc58fa2"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab4f4ecd62c3d8b3363d02019573dc9f1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab706af260b61f735b28464877d02137c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab789f8a400512ff27e36b3373170f0c5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab840ff9de0cdd0e9afffb8baa2a850a3"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab8f211ea896fc5190004f3ad6ad8932f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab933bc3cdf9adfea10ab9dba5292c812"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab93ce536eb7998bee00de4af868e31a9"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab986ae2cec780a1f494b7b4468b7ba11"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab9ae6a51e2027b02cac9966e05f3ba68"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ab9cd098786d2f4c855c42e4a6f30ab3e"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abb884888f14086cc674657677cb4b8bc"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abbe42648a46092137b303ccd08f7df86"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abce5ab327110c164f054b43ed47f79a0"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abd3d82e2dec1847e97eb8fc3bab2985a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abdd04257e6a73883b5f56f1186d0e906"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abec53064aa96265385ecc57de5fbc74c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abf5f3040227f021a5b84cf2eda248b2f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#abff1fd2439e31e6e64a3d2fdee3c7821"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac03f6eefb836373d37dc280b0d813d78"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac057d95a2bf087575584aa6f9a2c6bf5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac06eb2fea47a09a8a8abdaa1aa9b4603"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac1498acb8c3623b5f412f70ab6a6528b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac188bd19f236b098d603b0d8acd08921"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac244d140c6149726ea44174d3e836ca3"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac2f1e1f2365cfa531b1519aa9ff67695"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac30a2c1fa6f172af903fdeb6a8632606"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac30c580713f354916088a7dc049ae4cd"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac45e9ca0c7155caebe3d0f7261518077"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac66657077d55e94197b52b63acb50b7d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac703495cb370b52526a5a2d36ae26038"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac766839f8f9e4863e8e18418c342c875"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac7b4d295f3c7b1e09964f24f306422da"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac815eec2c1b15a47b1c6ea6790e77d24"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ac89eb6b29edad8cca63727ab97171c29"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#acb9f0aef9fbdfde8a4f46e33b0d6c52f"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#acba9efe192d22b7781b4622103c7a944"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#acd15d46ea5827a2a39898ccbb8352eb8"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#acf7af2284269544064b68e807064bba4"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#acf7cb9927bf09022088401923f2e1916"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad0125b6baba3065a87a174ec27aa9a61"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad03ef47e6cc7521bbfb45740dee20f88"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad0ae9e2b4874f991a2c853e1c1fe735d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad1a559ab88dbbb4fd2c7509d2c94e55b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad1e28448e35f4934075b397c34ba3d66"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad1e7ef6f065695d4b1d017547b60ef62"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad2817d53fdd4b112babfb6f0b38c8f39"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad30726cc8b69fd300d33c2a46e123c28"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad3565cc6fd1e088d052b1108aa065851"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad6399ba2b8708899739b4cdbb44add8d"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad684bc2ae1a2a627cd3e4a4c641e2d77"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad6859b04680d0d26d75fd6c4dd74ee24"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad8f7b11669736fbd6ed2e28211d877d4"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ad920df9579603f0b0ee2689eba330617"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ada5685d99c2d6708d1c4ef826d68e879"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#adb465776d3868bda0525d632ffc4d129"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#adc268cdbc30500f3009f5de2b2f0f67a"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#adc8e82b8f593b12c6d405e2250ab0f62"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#adc9f32cc6f40768df4285fba2e4783c7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#add18cfe4c0d38e95c6dff6bab3e7a932"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#adda64cae388baac1f138b06dc8595237"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ade65ebca11e38d56408c512df89b99f4"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#adf0cfd9a608a6fb3d57933e32e7d81d2"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae0614b6b199d8a65ae95d4621b118b82"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae0c70198e236ffe1a98f79987c686419"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae46d75b8046d557452d74513f1106710"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae4acef3e7ae7dfe359422503f894e885"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae71f66d814a03f6377c9d86cf0a2b5d7"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae74bb0a3c12cd1a23f3d29ce307d6fb1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae753526b669fba27771089dc809abd66"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae86f5917847b1ec9f313996250f2e0be"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae88617c4a012c5dc12781a349a28c886"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae91686513e284bcc9635833744bbdda1"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#ae998d8f423a9fb73405cfbd4b836bc72"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aeadc1f36c6bdc219294ce9341d80afa5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aeb1efa47c5f22cc0b35d49ccce73c406"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aecf703522d9ce32dfeefe1e6e903db06"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aede0cc4179507b739849948f1a2fed4b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aee1bdf0ab2e445293708b476e8cfde3b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aee3ae0d0d1f941463b06eca0bf041b2b"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aee64dc1890abb6d1035361cb8c751f96"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aef62c7e3e494b6a511a7833c0d942a60"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aef8e7e499ea9d432aa743d83c076f945"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aef9fa600d107b509f2e3df7d6b080e01"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af1983edd26245e6e51c6e47354095e32"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af20874a61c6c3f4c3fd045a96e806644"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af2737d09c887ee8cd43fdeabceddbe82"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af4348ce3425dd99d069e8fdf06e25a3c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af5713afb3a62967a02c3c20661951ee4"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af69ef8f1d8ecae0e6f755bf1c46cf075"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af725f935bfa0405e5ff17ede3ac47283"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af802541c4c65ee4442acd495de4d27fe"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af80ff2020ec2c4b406c5fdae3fe55e63"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#af87dfa2122e9c76042dc41fb7f338a87"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#afb3cd302e0b78902c62111dce4494fe8"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#afb9a0e18c0e40c77e6143fb7d84ebfba"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#afba39221eb54e272aae79910b3cd7ef5"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#afd7cdb8ed2a9820efe9cf322c06f188c"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#afe5988aa8147be2bafda6a5b7792fe15"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aff100489cc40ad276c2d5d67a9df67db"/>
-<a href="backend_2metal_2kernels_2bf16_8h.html#aff19193e1b2cee29a8737318e95cc74a"/>
 <a href="backend_2metal_2kernels_2complex_8h.html"/>
 <a href="backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995"/>
 <a href="backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b"/>
@@ -1818,62 +1518,424 @@
 <a href="backend_2metal_2kernels_2fft_8h.html#ad395c11e6f2aee72cd1928fba93a35a3"/>
 <a href="backend_2metal_2kernels_2fft_8h.html#adb129cc3808c08fd95af9795bfc7ae63"/>
 <a href="backend_2metal_2kernels_2fft_8h.html#afea05e9a7105bafbaafca25042f4d1b4"/>
+<a href="backend_2metal_2kernels_2jit_2bf16_8h.html"/>
+<a href="backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3"/>
+<a href="backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56"/>
+<a href="backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a006763fae6e0577fc168ec9446f0f747"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a007f58508b98bb79e5c323ed0dec89b6"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0197e039d4c65bf49649a6f250c2d436"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a05a4f197a71d0f16879032f44492bb79"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0736a76f56578d26ba1422dc8b744a18"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08b6071245513e1726ec68e3b63edc53"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08c1f916302eb9d48c93f8b7260538fe"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08c7d12a0d16565fbf052dba2db8b22d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08d2460e259b9106d90d889481ad60d5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08e778be18e4a291c108fcc528b981d3"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a09c1a797eb7f43742578680899932f50"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0a5bfe15d95ba540795f4c25ebfa4f07"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0aa3bfcfab53700488e5f386e6de60d5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0aa9ffe056f49fda181bbacbd60556ea"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0b8736e2ae24758b6e24ea72668df5b4"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0e2c2c2cb50b3a55ff213f18978aca35"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0e4377b120d6305335d296e031ee5b30"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f360806708b95a3be400af0b8871b57"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f486bf02c6ad5b9b6a96d3450f03e47"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f7fd418408806ef498745c6fdb2c062"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a104cf94cb9e359d1b6ef92ced2ce0c27"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a12a47e8ac0be788edff57ae0a96d7830"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a12a98d71d670b409b8065e0d61672d55"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a13b3338935440ae51ecc4a356093efc5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1457da931c28fa4e2500daa4e6441e8b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a14b56c687053ee2432398a25663c068f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a152366ab4e2ccc867e919af6c74ced91"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a15573fefd880adefbba079b1c1bd8082"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a168300bbd04d8e97c5e4218cb14ae378"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a16978f4b16d954ef4d4cf0f32f6c0b94"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a17f47ec9cff60f8e1b3477a2793b7ac0"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a18963246f2b640874bef6dca7049f64d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a190e27077f0fba642a86f5c8f488bcc2"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a194a6670cc25ade35a24b566f31af785"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1a788f82212afad30e4c2ee40f1c313c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1b029e4ca72125a5f9471f582c819705"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1dac193d9f1c8c0eb4473441895f8c58"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1f4e90909ac1c7280f4c7d1977c55fb7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a204d13a881ae8d337f6efbb98673790c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2179abbc91ce8763e96e39e1917bfa6e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24381d991c2d570aa953694f396a69b5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2486f3b5de85b0d57f458d8f21f82b42"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24b1fa8998c892f90f8dde7c34fb10a5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a251780ac4592cc2b1a543e417ff57770"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a25e7c5d2ecf3375756d59074f333858f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2721c088adfc9d73cde442d6badd2a6c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a279d09ab8542f1c1a8dc8173b65946b6"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2807fa6862b0f9689c81199b1e695ed8"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2826bd301bb5393473ccd363f2052c0d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a284dfc702f0f67b9c233b87162eeabdd"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a28d297705e29009197418546ef435393"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a28f8d21c5eef047c701cf690ce9c2ef0"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2b9de9624c0a507b4ead85f898ad9daf"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2bbdcece13148826d3fe33af727bb79b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2c3c5f793b3d957d7295d7f1faabebee"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2d37130b6fd79b425f5ba92b65e36bed"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2f6286d222e2176bcbdc824c5d598100"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a303dfcc81ffd355f866f863d7d9f0fa5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3165e37d393be50c2cfa9ddcba153684"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31a3d8f2ff8038f7e0d717845c039808"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31bbdbe0b62b90a4d6ea4bb0a7db586b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a323a80492cd17a49e2c3dd18f8c8b5cc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a33ea086b561c652f25833a5e1ded34dd"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a347c9bbf816bad2e9e5e91aa448f8b65"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a359edd4bcb8776861ceb26a3005624c0"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3602117b4c61d5cd4fd72fb8e5f68bd6"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3796dcf819adb1ef8152f57ba63ff6b1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3816a35f8468156d59c239256c12dcf3"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a383165ea838cc3feeee4d9cf54aa77cc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a38bb89f925eca4f9c042f6ee7a2c0193"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3936148781ab1c4f33f58d12c116f370"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3a0a3edbf1ba2314551454059c3f422b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3ae2091ada1e39e857fbc53c97bdb79f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3c62ac679d6aa515144d40ebafe4a188"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3d0d689516c99003659c5d026847bd2e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3eefe9a7f5fb226335ea687012f32d5c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3ff4ff59f411010ac8502cfabda4bd6f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4126fb7ed5bbb27a2332c543cf56a337"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4154f90ab7857ca856f9e15fe1bf5acf"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a41d55d167e9dc63bf29d15e0ff004869"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a42bead8ef0beb9f3452128d64cd4df9d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4358ee606e66ba2081fcf94f9c3b5915"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a435a2aec4c777b4b184ff5d24992e8a1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a435f2f4256aadb1b57fd62bb7f733cf7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a43a225e7e548bb041f3a5d844faaf0da"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a44522c2304c6396bbe6b9d32000f4b6f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4611728172afea51860a77fdb06cafa0"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a467a88531150a4d9d30fce07c49c126e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4720cc79ab2b8e39952ea9ef20e51250"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a491dadfae957cd7cc0c36188d910f6f6"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a495ae2d9be5d97c4c6448fc4e50a03e1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4a0023e2fd08875156cd6ef747fbb5cd"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ae4a80fde67eea9a0a37b2803946544"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4bd92db6c8b9b5dc96332c7ae3eff8c7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ca11d43174baf0a729f93b35eabcbea"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4dd3cf0e5aa116ff330352a50c18cde7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a501cc01d5bf15d9f03aa28545f9624ea"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a513501355a5912a1263fd8b10864142b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a523eda93c809733368e2b45382d2add6"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5263b2463fecdc97f9521d00bffea059"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a542affc376726840647a6e93acf2c1a7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a54ae7216b82c5cea362f6b83e1df3a9b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a551b970f73bb4a3b287653021d000b60"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a55600f3b9859e2891e0e0b5690867b72"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a584a513596de20663dad951a5b81695e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5868c85c988ec3432cf86d7df40e464d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a588ef0f7e03f306758524d378278976f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a59515695ebc48844345fa5120511aed1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a0cb8544b4ebd2906ba8e7f2868e8de"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a4b98a0a11db5b77cf9168df37c8bc7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a81eae168dfafd299c2b94e3e8558cf"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5bd875a54b79b2dcedf674807c3e53c5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5be23e296bbed3a885586a6424b1666e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5cc74ad3e522d7104e6e2117751151ad"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5d00eb2ec2b0e15b2753d100694c45ae"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5d628a5bc4fa755610392f47a523a1f1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5f997839cf49c24ab594a0dff486a7bc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a625dcb133f1f953f263e6200399866c6"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6278bd2e0e2805090b33ef666bf7f6bb"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a62a512d0edd894759c69f724b970fbdb"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a62f891b7dbba0000749cf338f594bedb"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a64f1136b17006f168ef837e17240814f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a64f6787a96386246f83a8981d274150e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a65f30a2dc199134e35bc7c5d431b2263"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a68125e66f74eaffe5ea9267638ce870d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a696978d9401e09200045b2d8aad045c2"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a69984aaa05ae1d4fccccf7f57e8ecb4a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6b9e49ad9ea256d2d0220c0d81552602"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6baa722c22d66c7510786bb275cb8cc2"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7003e1e5881e3d106257f22b6a3e59fe"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a711693988c437c2fb4d7da505982fe21"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a715c824ee8c87e0256114a85624d9949"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7167343d90eb70e5a0d5fa9ec5398e94"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7172d84db640e6c49dff0d08dd64b53e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a726cecf778b8584b6f7c37db1b064576"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a72d10ec0e62949247da129eb3a83fb9b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a73416a7415f3fe31525e33419e5e8aab"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a738078eb7d5ff94ff48156a555d763a5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a744f72ba83522fe3cc2a49a007b42543"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a74751abec7086f85f4f26ced44f1ca1f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a74e477567c9477c2cf0684f81ef4498f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7595740d4cc12924905d6bd1b99ee4da"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a76f5bd895b7214cbc3cea3440992718a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a77bab4481b41be50297b257e95058706"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a77c678665b34df7652dcde053ca73185"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7816a97d16b1d2f8a90227bb1da2f6ac"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7ad7ff44a3200853711869f7a577d931"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7b134429ea0c8493800ff8b465410f9c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7b3bce3f6f17089d87e13e91f580a581"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7bc91aaaf476a37063264d1d53d862cc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7c56980c234a04260b8b19298085e526"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7c790442f77f2437b482c4a55e224fc3"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7cd44d27fa9a4f13df39894c34fdb348"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7dbf0c75df4817cb4ef8b60c417a89d0"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e1a6056f9c96f3c89fe204dbf103be5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e1bcf3bc06cbcbc304c0cdf729802bc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7eac96f64ca42991caf819c8e8c8d2bc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7f1b84352a3ed6171444a43da1fc7e92"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7f601b22ecc480132d82ad782e5363bf"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a80d288f22cadfdf5e904410349e616a1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8110fae7bcc34a0de5927546b24aa935"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a83320ba983d90dd1fa5847b6940dc0bb"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a852689073c17596de4fb545bc046b380"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a86b2a001cbec0d3a8d762a3c7ff47b0b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a87ab4b7a502430da664ccb8abd383058"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8859b5b8dc241e4f58243c85d2630cc8"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a88c11cd37600de5480570da3d2ae5732"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a891aa4bf46c20a26a55061736aba25f1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8ad16afd7f1711de83c0cec5af868f76"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8b6c3fd9d068a2159084359df8b9b449"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8c8ac6736440fdca366ebdefe2a12b9f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8cd55d1a579540eb450e12a8a8a950be"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8f06316063fc91747533105f256b55b5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a90a1c5130db515db48624d8587edbb91"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a912393b7208fa45bd1e87f30b218b68b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a917354f77eac26189da8a2f610a00074"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a92d1348f201d78fcd474f75d5b23ef68"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a94686039356dfa9aa45608a8b0562fdc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a948579a4d9ba276523190b03b09578fb"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a967a1d7b5664f616e5b6f2d257367f0c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a979374b1dd4e0eaf602326fa901336d1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a998b1ba877a606aedf722ab46b290403"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9a837c3b9c4e42f53d7cd1ed0d266e2f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9b31c363ebc93d592b6fa0e27b00335a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9d06cceea5c179bcc608452188bd7d6a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9e21c5ea9dd724dc2ca8c54ad908f09c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f835a0a80c411580c97b65fdc5bdfd3"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ff5ab3aef1057fa083b53a65c8aba03"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa04cfcb52191fd23205a1a3572b46ae0"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa251d6483d3b099d1b5311fbe6f0bce2"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa3277ae33976c70f7bd937ddff027b72"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa332fae098e7c6dc23b98bc0026f1070"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa415ce182fe7582d885fe633fc3527ce"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa504a474ab6e00ebe2b1b7ed2f7d1ffb"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa5fa1a8f2b39c3508fe38205469756d1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa6b99cde403405df1865c989e4ce845a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa705d87cf4b78e9d7c6b07dd0c66cac6"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa708a970a200822c99c0489f389469fa"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa7198e580e2a83c1fd01a4b6fdf86a80"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa846fde89c7d2d18b18ef180a8a9c8a3"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa8d9f01582a0a9f01a666d110c74db2a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa95f9ebfdab3c5f524775651362ce914"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aaa66dc6d7b2c5efbfaa97ca9c7872bd8"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aaab79d0b4c9e9bdc059ace6ec58c5b00"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aab02c65bc38ea66335b2192ead4095a8"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aab74ec4d33a64b92b908717d500f1ecf"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aacaedf12f862c76457133336dd6fc446"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab02f8646b47806e1d2038f248df03f06"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab070ea4676d10a10ff3e9379a4068a57"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab092d9790ef20fc0386707530aee89db"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab155f418f15cabd86ff942c6f9472ddb"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab1de7e7e7304ff3598925d2e69134764"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab27b26182c7c6e08af37e6d511fd9253"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab3b594321fb42b0c2da99954d1e0976c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab43932322f81bf322aa1b0deeee9a987"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab4e9ad547aa23daa351075e0ecc58fa2"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab4f4ecd62c3d8b3363d02019573dc9f1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab706af260b61f735b28464877d02137c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab789f8a400512ff27e36b3373170f0c5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab840ff9de0cdd0e9afffb8baa2a850a3"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab8f211ea896fc5190004f3ad6ad8932f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab933bc3cdf9adfea10ab9dba5292c812"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab93ce536eb7998bee00de4af868e31a9"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab986ae2cec780a1f494b7b4468b7ba11"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab9ae6a51e2027b02cac9966e05f3ba68"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab9cd098786d2f4c855c42e4a6f30ab3e"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abb884888f14086cc674657677cb4b8bc"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abbe42648a46092137b303ccd08f7df86"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abce5ab327110c164f054b43ed47f79a0"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abd3d82e2dec1847e97eb8fc3bab2985a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abdd04257e6a73883b5f56f1186d0e906"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abec53064aa96265385ecc57de5fbc74c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abf5f3040227f021a5b84cf2eda248b2f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abff1fd2439e31e6e64a3d2fdee3c7821"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac03f6eefb836373d37dc280b0d813d78"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac057d95a2bf087575584aa6f9a2c6bf5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac06eb2fea47a09a8a8abdaa1aa9b4603"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac1498acb8c3623b5f412f70ab6a6528b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac188bd19f236b098d603b0d8acd08921"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac244d140c6149726ea44174d3e836ca3"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac2f1e1f2365cfa531b1519aa9ff67695"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac30a2c1fa6f172af903fdeb6a8632606"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac30c580713f354916088a7dc049ae4cd"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac45e9ca0c7155caebe3d0f7261518077"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac66657077d55e94197b52b63acb50b7d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac703495cb370b52526a5a2d36ae26038"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac766839f8f9e4863e8e18418c342c875"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7b4d295f3c7b1e09964f24f306422da"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac815eec2c1b15a47b1c6ea6790e77d24"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac89eb6b29edad8cca63727ab97171c29"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acb9f0aef9fbdfde8a4f46e33b0d6c52f"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acba9efe192d22b7781b4622103c7a944"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acd15d46ea5827a2a39898ccbb8352eb8"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acf7af2284269544064b68e807064bba4"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acf7cb9927bf09022088401923f2e1916"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad0125b6baba3065a87a174ec27aa9a61"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad03ef47e6cc7521bbfb45740dee20f88"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad0ae9e2b4874f991a2c853e1c1fe735d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1a559ab88dbbb4fd2c7509d2c94e55b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1e28448e35f4934075b397c34ba3d66"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1e7ef6f065695d4b1d017547b60ef62"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad2817d53fdd4b112babfb6f0b38c8f39"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad30726cc8b69fd300d33c2a46e123c28"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad3565cc6fd1e088d052b1108aa065851"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad6399ba2b8708899739b4cdbb44add8d"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad684bc2ae1a2a627cd3e4a4c641e2d77"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad6859b04680d0d26d75fd6c4dd74ee24"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad8f7b11669736fbd6ed2e28211d877d4"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad920df9579603f0b0ee2689eba330617"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ada5685d99c2d6708d1c4ef826d68e879"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adb465776d3868bda0525d632ffc4d129"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc268cdbc30500f3009f5de2b2f0f67a"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc8e82b8f593b12c6d405e2250ab0f62"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc9f32cc6f40768df4285fba2e4783c7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#add18cfe4c0d38e95c6dff6bab3e7a932"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adda64cae388baac1f138b06dc8595237"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ade65ebca11e38d56408c512df89b99f4"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adf0cfd9a608a6fb3d57933e32e7d81d2"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae0614b6b199d8a65ae95d4621b118b82"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae0c70198e236ffe1a98f79987c686419"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae46d75b8046d557452d74513f1106710"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae4acef3e7ae7dfe359422503f894e885"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae71f66d814a03f6377c9d86cf0a2b5d7"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae74bb0a3c12cd1a23f3d29ce307d6fb1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae753526b669fba27771089dc809abd66"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae86f5917847b1ec9f313996250f2e0be"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae88617c4a012c5dc12781a349a28c886"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae91686513e284bcc9635833744bbdda1"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae998d8f423a9fb73405cfbd4b836bc72"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeadc1f36c6bdc219294ce9341d80afa5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeb1efa47c5f22cc0b35d49ccce73c406"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aecf703522d9ce32dfeefe1e6e903db06"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aede0cc4179507b739849948f1a2fed4b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee1bdf0ab2e445293708b476e8cfde3b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee3ae0d0d1f941463b06eca0bf041b2b"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee64dc1890abb6d1035361cb8c751f96"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef62c7e3e494b6a511a7833c0d942a60"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef8e7e499ea9d432aa743d83c076f945"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef9fa600d107b509f2e3df7d6b080e01"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af1983edd26245e6e51c6e47354095e32"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af20874a61c6c3f4c3fd045a96e806644"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af2737d09c887ee8cd43fdeabceddbe82"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af4348ce3425dd99d069e8fdf06e25a3c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af5713afb3a62967a02c3c20661951ee4"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af69ef8f1d8ecae0e6f755bf1c46cf075"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af725f935bfa0405e5ff17ede3ac47283"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af802541c4c65ee4442acd495de4d27fe"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af80ff2020ec2c4b406c5fdae3fe55e63"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af87dfa2122e9c76042dc41fb7f338a87"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afb3cd302e0b78902c62111dce4494fe8"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afb9a0e18c0e40c77e6143fb7d84ebfba"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afba39221eb54e272aae79910b3cd7ef5"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afd7cdb8ed2a9820efe9cf322c06f188c"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afe5988aa8147be2bafda6a5b7792fe15"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aff100489cc40ad276c2d5d67a9df67db"/>
+<a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aff19193e1b2cee29a8737318e95cc74a"/>
+<a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html"/>
+<a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088"/>
+<a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a58e15a77da988b9104fee00cdf8b280e"/>
+<a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4"/>
 <a href="backend_2metal_2kernels_2reduction_2ops_8h.html"/>
 <a href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3"/>
 <a href="backend_2metal_2kernels_2reduction_2ops_8h.html#acacf99e0ba629ed062ccc3c2eba89b05"/>
+<a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html"/>
 <a href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html"/>
 <a href="backend_2metal_2kernels_2steel_2utils_8h.html"/>
 <a href="backend_2metal_2kernels_2steel_2utils_8h.html#a42bd57d203a40d3d7d429f2333590a3c"/>
 <a href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f"/>
 <a href="backend_2metal_2kernels_2utils_8h.html"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a069b682d7d21827461544817d722bfd3"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a0c1e4d782fcc56e1ab5565cef12430dd"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a1e520e23f58ca645dea1ac20998d987a"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a3501b665c8837eabf9789ea27a7d6946"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#a37e00d94751710e81c9632bca2f91e51"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a39e436e0a942912266aae7e0bd82d7c0"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a3bdbdfeb7a1dde40cd3ce1df8d9213b5"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a48ae83a8caf5c74810df60b6c6cdb062"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#a4b53fb0679f67f9063deba94753d4185"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a617f3857caf33c569afa6148135f8b7a"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a7bb56415c5412a6a26f70a990915f064"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a92b455bac6a23af51c35ea83de2383eb"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#a94e02a6ae8c39cbf4cb23aa44df9dbd5"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#aa6b041005351293e68e19b5abf1286cd"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#ab4cbcdb054f9165130da91a3334da0cf"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#ab8175b66bcc080fb89f738143568c30b"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#abedffa358e7ba7782cc78d6772064c7c"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#ad55bd473647f2c6c68e65e5312c132d1"/>
-<a href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#ad9a671a5f9aaa729ae7a77026f16bcb0"/>
 <a href="backend_2metal_2kernels_2utils_8h.html#ae0f5c42020275a588234e69f1eb7a485"/>
+<a href="backend_2metal_2kernels_2utils_8h.html#aec82f4bf0e22b8d1b89ad654ad8d8753"/>
 <a href="backend_2metal_2utils_8h.html"/>
 <a href="backend_2metal_2utils_8h.html#a0f0f59d3ffe2d16a684e5fc093302e15"/>
 <a href="backend_2metal_2utils_8h.html#a187b9a932c7b3d67ee42d9d12fcb1bb1"/>
 <a href="backend_2metal_2utils_8h.html#a489e45b3a5cd8b46e8ea56b9132eb230"/>
-<a href="backend_2metal_2utils_8h.html#a62340bbaa8b216539688a60adcb568bf"/>
+<a href="backend_2metal_2utils_8h.html#a76a2e310857f60f5ea6f1388d45b964d"/>
 <a href="backend_2metal_2utils_8h.html#a79817d2432e782e596c9c49a08b93be2"/>
 <a href="backend_2metal_2utils_8h.html#a8dc169474a51a1f4f761d5752819bd7c"/>
+<a href="backend_2metal_2utils_8h.html#aaf51544472fa87fa974686eacdd2a4a6"/>
 <a href="backend_2metal_2utils_8h.html#ad4be35b310a252edd80d9cf04f094a60"/>
-<a href="backend_2metal_2utils_8h.html#ae309cb543dfb0239cfccc53a8ad0408e"/>
 <a href="backend_2metal_2utils_8h.html#aed148d95e7b5221f1312473deded0d27"/>
+<a href="backend_2metal_2utils_8h.html#aef60e3a8d9c987c9c338b193673d2164"/>
 <a href="backend_2metal_2utils_8h.html#af1fdfdaa5644394362e6baba30701bae"/>
 <a href="bf16__math_8h.html"/>
 <a href="bf16__math_8h.html#a005510c8c0f964ce2b8aad3ba76a7a3f"/>
 <a href="bf16__math_8h.html#a00e687ea46f5affe26e6aef8fd62b89a"/>
 <a href="bf16__math_8h.html#a00f9c0ad66d969794614f56912eed9c9"/>
 <a href="bf16__math_8h.html#a020790f30c28a9982c4a83deaa258277"/>
-<a href="bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0"/>
 <a href="bf16__math_8h.html#a042b98827baa910e9d726227cec55a80"/>
 <a href="bf16__math_8h.html#a048cad0aca52cb737ebf103e76bd1c49"/>
 <a href="bf16__math_8h.html#a0558e56fdb94b456deea6a4eb53964ed"/>
@@ -1928,7 +1990,6 @@
 <a href="bf16__math_8h.html#a5017efc9605e069cfb507137cd1a1852"/>
 <a href="bf16__math_8h.html#a5138d5cdc18139e135707916a243cd8e"/>
 <a href="bf16__math_8h.html#a51688bc24fc9292aaec5f54a58eaa2d0"/>
-<a href="bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7"/>
 <a href="bf16__math_8h.html#a5295ab08055d12534cc3775da855ac12"/>
 <a href="bf16__math_8h.html#a567acb18199ac0107712eb8cb8aeb8e9"/>
 <a href="bf16__math_8h.html#a57116427997ba71dd3863bfb15de33bf"/>
@@ -2282,6 +2343,16 @@
 <a href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61"/>
 <a href="classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e"/>
 <a href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de"/>
+<a href="classmlx_1_1core_1_1_contiguous.html"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#abf488f02057fd5852f38b2e8a600ad2a"/>
+<a href="classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23"/>
 <a href="classmlx_1_1core_1_1_convolution.html"/>
 <a href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2"/>
 <a href="classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef"/>
@@ -3494,10 +3565,12 @@
 <a href="dir_23833761034051b43bb6c170b56e2cce.html"/>
 <a href="dir_4336740ec0075891704443b417fef6cb.html"/>
 <a href="dir_47795aa8999234f6f402f7e89d34d08e.html"/>
+<a href="dir_5aea41cce495e77a0857a0aecf063e33.html"/>
 <a href="dir_6379e541ea5051a09bc0e3fdd92fcd3b.html"/>
 <a href="dir_6768c99e6145fb9510ccdb40db8ede25.html"/>
 <a href="dir_70a37effa88bcbd6b791977fa1e64356.html"/>
 <a href="dir_76215a6c54e2b67053e723fc2395583c.html"/>
+<a href="dir_83367edb60e23ad59b1a493d8c883287.html"/>
 <a href="dir_86b95e7b1d0d6e25466bb9213752d32f.html"/>
 <a href="dir_8c751ccfa9f494753d976761a9d60a84.html"/>
 <a href="dir_938ab0ecf10b8b860ff766c820f665fd.html"/>
@@ -3505,9 +3578,12 @@
 <a href="dir_ad00dcd1517bfdbe01f68ec9b4eff877.html"/>
 <a href="dir_ba4426224ef60f409462a2a12fa18f06.html"/>
 <a href="dir_d0c977ea65824390717cdb7efc36c157.html"/>
+<a href="dir_d36f9e79442ec4bd53287b83bdefe7e5.html"/>
 <a href="dir_df9494e83ef22ae6150a0e080d9709ed.html"/>
+<a href="dir_e1756c7634b0c14aead026895ad71c6d.html"/>
 <a href="dir_f149b24a1b5be11cd70151abe517e3f8.html"/>
 <a href="dir_f60cd69d27fd3faa641c79056fff0e2d.html"/>
+<a href="dir_fb5e52e7ad5a84a63db2993d12f7610c.html"/>
 <a href="distributed_2ops_8h.html"/>
 <a href="distributed_2ops_8h.html#a10f9d39c02e6e5db600912c03de8b393"/>
 <a href="distributed_2ops_8h.html#a2822b78bce2c679e6ff940b2fca944f0"/>
@@ -3569,7 +3645,6 @@
 <a href="fast_8h.html#a12c7ef41409d6fb378008e67b6fab328"/>
 <a href="fast_8h.html#a3663b50265b0a9c0cca2b5376852e059"/>
 <a href="fast_8h.html#a534ef357eae24892684a6ecd866d3fab"/>
-<a href="fast_8h.html#a638c7e9b9ea8677f01786d8f9738baf8"/>
 <a href="fast_8h.html#a9390693ff7be931f3ef3428e2ea4c3f9"/>
 <a href="fast_8h.html#aa4b5f6886b2288cb6dfdd8598579f080"/>
 <a href="fast_8h.html#ab16436b465dc10ce472193d541d8426e"/>
@@ -3857,8 +3932,10 @@
 <a href="functions_x.html"/>
 <a href="functions_~.html"/>
 <a href="gather_8h.html"/>
-<a href="gather_8h.html#abdec470e1af0109563ddae3e85e6526c"/>
+<a href="gather_8h.html#a767d7c5be6f2f649101f581449af5599"/>
 <a href="gemm_2loader_8h.html"/>
+<a href="gemm_2mma_8h.html"/>
+<a href="gemm_2mma_8h.html#ad583e6038efc119542410f43b603d4ad"/>
 <a href="gemm_2params_8h.html"/>
 <a href="gemm_8h.html"/>
 <a href="gguf_8h.html"/>
@@ -3894,6 +3971,7 @@
 <a href="globals_g.html"/>
 <a href="globals_h.html"/>
 <a href="globals_i.html"/>
+<a href="globals_j.html"/>
 <a href="globals_l.html"/>
 <a href="globals_m.html"/>
 <a href="globals_n.html"/>
@@ -4095,6 +4173,7 @@
 <a href="group__ops.html#ga8a10a10b81c69996d0aca8ba401f8ff0"/>
 <a href="group__ops.html#ga8a2056f8c9bb30914c40bcf509386491"/>
 <a href="group__ops.html#ga8a3b04e23e347d99ecf411fd6f4e5125"/>
+<a href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5"/>
 <a href="group__ops.html#ga8af4f22c08c11c4ffab7e3d45e0f3cd6"/>
 <a href="group__ops.html#ga8d50480266d258cac40ff51bcb0fc6a7"/>
 <a href="group__ops.html#ga8d656904aa2690b60955ae745aecfc30"/>
@@ -4318,7 +4397,7 @@
 <a href="kernels_2gemv__masked_8h.html#a1480c8cdff1cae1462a5a71632969bca"/>
 <a href="kernels_2gemv__masked_8h.html#ab3070d14cdecb1dd7dc220a551da6b7b"/>
 <a href="kernels_2indexing_8h.html"/>
-<a href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df"/>
+<a href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8"/>
 <a href="kernels_2softmax_8h.html"/>
 <a href="kernels_2softmax_8h.html#a440d4031ee5e86159a4dd715e44a438b"/>
 <a href="kernels_2softmax_8h.html#a815fe70f879f318e5d6e99acf043f52b"/>
@@ -4328,14 +4407,13 @@
 <a href="kernels_8h.html"/>
 <a href="kernels_8h.html#a05a220cff45f12439fde775983c6df78"/>
 <a href="kernels_8h.html#a195b86cad5bb99aa1bcd23952305af6b"/>
+<a href="kernels_8h.html#a1be32ba7d67137dde7ac191dfe83ff49"/>
 <a href="kernels_8h.html#a1d4cffc3c78067b3d9a62d64f3fb686f"/>
 <a href="kernels_8h.html#a35a412f688d79eb47e42d20a7c8650ee"/>
-<a href="kernels_8h.html#a3bd386cb6db09f636963ce66ceaf8647"/>
 <a href="kernels_8h.html#a4decd4a07d91487e6903f6e3c8b7513a"/>
 <a href="kernels_8h.html#a4e809746f48e5dcf7fa63215d3f5e33e"/>
 <a href="kernels_8h.html#a54eb3b65375022428aab5f810e40624b"/>
 <a href="kernels_8h.html#a76f614e9956a6ca05a9be4db5a483446"/>
-<a href="kernels_8h.html#a7aa91fcfe8b9caa42d60a957f11bfe6b"/>
 <a href="kernels_8h.html#a84ebe6275218070f0ea320f126f64e22"/>
 <a href="kernels_8h.html#a84fa8e0aee321a9d614433a0b933103b"/>
 <a href="kernels_8h.html#a90c24e0d0b99b68fad9deefcf4d3e818"/>
@@ -4344,6 +4422,7 @@
 <a href="kernels_8h.html#ab5f60614e965144b451930fdf935e08d"/>
 <a href="kernels_8h.html#abce2b67044ee06a7bbe7a91ec7c8c48d"/>
 <a href="kernels_8h.html#adce79d220672f5f3c65cc31d145ca9c4"/>
+<a href="kernels_8h.html#ae0470605dc819efeb6510183619f0299"/>
 <a href="kernels_8h.html#aeefaff208444d3fa61ecc0946fe1de5f"/>
 <a href="kernels_8h.html#af48c6f2f72b61dbd6766e4f5fea85df5"/>
 <a href="kernels_8h.html#afb57825bb763050cc9a9d194aa41ac36"/>
@@ -4391,57 +4470,57 @@
 <a href="metal_2kernels_2arange_8h.html#a1e5126ee6ae0164c2343230c4d87c03e"/>
 <a href="metal_2kernels_2binary_8h.html"/>
 <a href="metal_2kernels_2binary_8h.html#a19dbbf8fea68b64bdd25dc8d36865171"/>
-<a href="metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b"/>
 <a href="metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5"/>
 <a href="metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141"/>
 <a href="metal_2kernels_2binary_8h.html#a48bd82eb10f9c623ce7d28daec4fa512"/>
 <a href="metal_2kernels_2binary_8h.html#a649851d133358dd5832a73b1061b3313"/>
 <a href="metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867"/>
-<a href="metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14"/>
+<a href="metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49"/>
 <a href="metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589"/>
-<a href="metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9"/>
+<a href="metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3"/>
+<a href="metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa"/>
 <a href="metal_2kernels_2binary_8h.html#add6a9aeee3cb0ba909574f27fa9ecd5b"/>
 <a href="metal_2kernels_2binary__two_8h.html"/>
-<a href="metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347"/>
 <a href="metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891"/>
 <a href="metal_2kernels_2binary__two_8h.html#a12dbda74fa460812177ccb9aeee6e1ca"/>
 <a href="metal_2kernels_2binary__two_8h.html#a12e80730e43dfaa4c79ce8d5f99edc50"/>
 <a href="metal_2kernels_2binary__two_8h.html#a273d2f31691f2c64623c2a97eab344be"/>
-<a href="metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6"/>
-<a href="metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8"/>
+<a href="metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95"/>
+<a href="metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e"/>
+<a href="metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd"/>
 <a href="metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c"/>
 <a href="metal_2kernels_2binary__two_8h.html#ab4324f594c007a6895540b77ad5d89d9"/>
 <a href="metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133"/>
 <a href="metal_2kernels_2binary__two_8h.html#af8a791ac7ca88d32cd8f4e9ac0f9ab4f"/>
 <a href="metal_2kernels_2copy_8h.html"/>
-<a href="metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5"/>
 <a href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1"/>
-<a href="metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950"/>
-<a href="metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd"/>
-<a href="metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36"/>
+<a href="metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260"/>
+<a href="metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13"/>
+<a href="metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf"/>
 <a href="metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3"/>
-<a href="metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff"/>
+<a href="metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc"/>
 <a href="metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77"/>
+<a href="metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa"/>
 <a href="metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659"/>
 <a href="metal_2kernels_2copy_8h.html#aee14a5326f53d9b30b0b38e27d180ef3"/>
-<a href="metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c"/>
 <a href="metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea"/>
+<a href="metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301"/>
 <a href="metal_2kernels_2hadamard_8h.html"/>
 <a href="metal_2kernels_2hadamard_8h.html#a590e5366adc78bab4fe44e37885d413f"/>
 <a href="metal_2kernels_2hadamard_8h.html#a63c0e8510e555cd065e1f0ddfb33ce18"/>
 <a href="metal_2kernels_2hadamard_8h.html#ab0bd478f2051af35aed1869005e3370a"/>
 <a href="metal_2kernels_2reduce_8h.html"/>
 <a href="metal_2kernels_2ternary_8h.html"/>
-<a href="metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047"/>
+<a href="metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4"/>
 <a href="metal_2kernels_2ternary_8h.html#a1bd5918559850f3f80e3adee2391fe6a"/>
 <a href="metal_2kernels_2ternary_8h.html#a3e610f3b01966bdbf23fdfebe5d2c508"/>
 <a href="metal_2kernels_2ternary_8h.html#a83f93644d21ee774e06e8190d0725ccb"/>
-<a href="metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb"/>
-<a href="metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8"/>
+<a href="metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72"/>
+<a href="metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d"/>
 <a href="metal_2kernels_2unary_8h.html"/>
 <a href="metal_2kernels_2unary_8h.html#a64e4f6737edddb72122e262977ee3014"/>
 <a href="metal_2kernels_2unary_8h.html#a7c7690f0df9d2acc60b63be58d9c7777"/>
-<a href="metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5"/>
+<a href="metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d"/>
 <a href="metal_2reduce_8h.html"/>
 <a href="metal_2reduce_8h.html#a3ab0fd997d9a35782106ff083a72e098"/>
 <a href="metal_2reduce_8h.html#aa0332c64ee9965f05026c30a0b778000"/>
@@ -4475,8 +4554,6 @@
 <a href="metal__impl_8h.html#a8b4188f9a090a1da42d62b8a369bf106"/>
 <a href="metal__impl_8h.html#ab31abdda3052162d59f6590a89e38337"/>
 <a href="mlx_8h.html"/>
-<a href="mma_8h.html"/>
-<a href="mma_8h.html#ad583e6038efc119542410f43b603d4ad"/>
 <a href="namespacemembers.html"/>
 <a href="namespacemembers_b.html"/>
 <a href="namespacemembers_c.html"/>
@@ -4784,6 +4861,7 @@
 <a href="namespacemlx_1_1core.html#a19805f505cb7ac72bfab66c339ea7900"/>
 <a href="namespacemlx_1_1core.html#a1983a2466bff3bae4d23cf34bd0946c9"/>
 <a href="namespacemlx_1_1core.html#a1b33e2c2e3471420490cf0be2de6de18"/>
+<a href="namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49"/>
 <a href="namespacemlx_1_1core.html#a1c482bb3d9f9d4c62dee5865892c1f96"/>
 <a href="namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f"/>
 <a href="namespacemlx_1_1core.html#a1e0cbcf109d32794ffc8efc7302ba9b0"/>
@@ -4854,7 +4932,6 @@
 <a href="namespacemlx_1_1core.html#a3ac798e65e59fe10b7fb5c522efce782"/>
 <a href="namespacemlx_1_1core.html#a3b900ab319948c5a01a3ecd30a709027"/>
 <a href="namespacemlx_1_1core.html#a3ba20a804c306067b7023259429e0e48"/>
-<a href="namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647"/>
 <a href="namespacemlx_1_1core.html#a3c41a304126bc225bdc68062d1eb6e7e"/>
 <a href="namespacemlx_1_1core.html#a3cc5c154e4ad9a83ad43da8513146fdc"/>
 <a href="namespacemlx_1_1core.html#a3d2b2929ed4636e9e2b86e125b2e57d9"/>
@@ -4935,7 +5012,6 @@
 <a href="namespacemlx_1_1core.html#a6111e94d51de12391e5d68b765f28fc3"/>
 <a href="namespacemlx_1_1core.html#a61da2851cb3beeef28049228346c28b5"/>
 <a href="namespacemlx_1_1core.html#a622ce842fe44e4b6a95e03242341b459"/>
-<a href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf"/>
 <a href="namespacemlx_1_1core.html#a6235dc5f4db517618bb3449b08c96e8b"/>
 <a href="namespacemlx_1_1core.html#a6262aeb513d27fc8313293b261e72abb"/>
 <a href="namespacemlx_1_1core.html#a6276bb9bad43ed4a27a1e2c3f5bfd990"/>
@@ -4981,6 +5057,7 @@
 <a href="namespacemlx_1_1core.html#a759191fb984e7737f0ef529c2053ad73"/>
 <a href="namespacemlx_1_1core.html#a7620f1ae298127cb6181db9162f012a7"/>
 <a href="namespacemlx_1_1core.html#a766157c5d5d00fdf3da95eb7cb2981b9"/>
+<a href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d"/>
 <a href="namespacemlx_1_1core.html#a76dcd1fa3c68b386bc1d1d899a68a120"/>
 <a href="namespacemlx_1_1core.html#a76f614e9956a6ca05a9be4db5a483446"/>
 <a href="namespacemlx_1_1core.html#a775aed5f49b530c57e71cbac81404d45"/>
@@ -4991,7 +5068,6 @@
 <a href="namespacemlx_1_1core.html#a7904b886d7b535a6af0a885d00597323"/>
 <a href="namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2"/>
 <a href="namespacemlx_1_1core.html#a7a4193f37b1de9c33c31d1da09c77edb"/>
-<a href="namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b"/>
 <a href="namespacemlx_1_1core.html#a7b763db8194e6fcb1b87eab143dfa47a"/>
 <a href="namespacemlx_1_1core.html#a7b987f404b8699de00f9e0099ab6b1b0"/>
 <a href="namespacemlx_1_1core.html#a7bae3ff296d9a60ff3c7e448f7fbc6bd"/>
@@ -5010,6 +5086,7 @@
 <a href="namespacemlx_1_1core.html#a81284b6ac737f91a8d1ffbbbbf938fe5"/>
 <a href="namespacemlx_1_1core.html#a81e1c727c3fc48910b030cb65a9e7afa"/>
 <a href="namespacemlx_1_1core.html#a827167f6a1ae55428fd218ddd51ec3b6"/>
+<a href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2"/>
 <a href="namespacemlx_1_1core.html#a839f94dbad44f0d37333006fc876b42e"/>
 <a href="namespacemlx_1_1core.html#a8481a3bb4c12c2b7dc6ba576c2be3d0d"/>
 <a href="namespacemlx_1_1core.html#a8494764f5c686743ede66dc76d85d955"/>
@@ -5094,6 +5171,8 @@
 <a href="namespacemlx_1_1core.html#aab9d96b0a168f4d05146000a6212b5d8"/>
 <a href="namespacemlx_1_1core.html#aad636e2d0b2f882cadd1b438f4daa9ed"/>
 <a href="namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032"/>
+<a href="namespacemlx_1_1core.html#aae1e770954edf1f9a35d19e0de4d857a"/>
+<a href="namespacemlx_1_1core.html#aaf51544472fa87fa974686eacdd2a4a6"/>
 <a href="namespacemlx_1_1core.html#aafa3bbeda78610c4285f3e57042268f3"/>
 <a href="namespacemlx_1_1core.html#aafaf24a28297428caf6d0c36c623489e"/>
 <a href="namespacemlx_1_1core.html#ab03949b1f60fa035ce454a894cd73ae9"/>
@@ -5215,13 +5294,13 @@
 <a href="namespacemlx_1_1core.html#ade2f9222fd433cd4d673c6182f256235"/>
 <a href="namespacemlx_1_1core.html#ade3791bc723b8f10fbab22eadb0f705a"/>
 <a href="namespacemlx_1_1core.html#ade5a175ff45347689ac4c798d04c8ffc"/>
+<a href="namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299"/>
 <a href="namespacemlx_1_1core.html#ae0540f16c4e7bd55d0e86a88495e4967"/>
 <a href="namespacemlx_1_1core.html#ae065fe5c42c1a333d7858d19f6434fa9"/>
 <a href="namespacemlx_1_1core.html#ae1e41ca94022e43a00cdfc5845102daa"/>
 <a href="namespacemlx_1_1core.html#ae24c337810c841ff23e327efde7045e1"/>
 <a href="namespacemlx_1_1core.html#ae25e0c01b46612f039313a4825ba6428"/>
 <a href="namespacemlx_1_1core.html#ae2a0bcdc171d7e9745d33e1d9aac4f8a"/>
-<a href="namespacemlx_1_1core.html#ae309cb543dfb0239cfccc53a8ad0408e"/>
 <a href="namespacemlx_1_1core.html#ae36badb78a17cd7d13663a69645fc328"/>
 <a href="namespacemlx_1_1core.html#ae36ea40b8477bfa12d41aae8245225c9"/>
 <a href="namespacemlx_1_1core.html#ae3e1e8b7a5410e0edf35f31f74295e2f"/>
@@ -5247,6 +5326,7 @@
 <a href="namespacemlx_1_1core.html#aed3d9cd32698ef0fe65b1280f103b3f5"/>
 <a href="namespacemlx_1_1core.html#aedc4e9df4bf71c0ac34fcfae60cdf550"/>
 <a href="namespacemlx_1_1core.html#aeefaff208444d3fa61ecc0946fe1de5f"/>
+<a href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164"/>
 <a href="namespacemlx_1_1core.html#aef85739d150b9d5609973da8a3f1086a"/>
 <a href="namespacemlx_1_1core.html#aef89566301cb133d98c8e7bdd2b7bec6"/>
 <a href="namespacemlx_1_1core.html#aefb9b05ce8864ada99a920ab32017b89"/>
@@ -5324,13 +5404,16 @@
 <a href="namespacemlx_1_1core_1_1distributed_1_1detail.html#abf33511660ac71df5fc92f2aad6c6e08"/>
 <a href="namespacemlx_1_1core_1_1distributed_1_1detail.html#ac3612edf0e0e18c1e4ba0ce7c6e35cd6"/>
 <a href="namespacemlx_1_1core_1_1distributed_1_1detail.html#aeb5a1726358213bc75756506f7b54d04"/>
+<a href="namespacemlx_1_1core_1_1env.html"/>
+<a href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3"/>
+<a href="namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2"/>
+<a href="namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa"/>
 <a href="namespacemlx_1_1core_1_1fast.html"/>
 <a href="namespacemlx_1_1core_1_1fast.html#a01bd533ebd0e2415c4ee30032d51d7bf"/>
 <a href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0"/>
 <a href="namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328"/>
 <a href="namespacemlx_1_1core_1_1fast.html#a3663b50265b0a9c0cca2b5376852e059"/>
 <a href="namespacemlx_1_1core_1_1fast.html#a534ef357eae24892684a6ecd866d3fab"/>
-<a href="namespacemlx_1_1core_1_1fast.html#a638c7e9b9ea8677f01786d8f9738baf8"/>
 <a href="namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9"/>
 <a href="namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080"/>
 <a href="namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e"/>
@@ -5729,6 +5812,7 @@
 <a href="ops_8h.html#ga8a10a10b81c69996d0aca8ba401f8ff0"/>
 <a href="ops_8h.html#ga8a2056f8c9bb30914c40bcf509386491"/>
 <a href="ops_8h.html#ga8a3b04e23e347d99ecf411fd6f4e5125"/>
+<a href="ops_8h.html#ga8ab10aa6c41416d739791164a52b25d5"/>
 <a href="ops_8h.html#ga8af4f22c08c11c4ffab7e3d45e0f3cd6"/>
 <a href="ops_8h.html#ga8d50480266d258cac40ff51bcb0fc6a7"/>
 <a href="ops_8h.html#ga8d656904aa2690b60955ae745aecfc30"/>
@@ -5943,7 +6027,6 @@
 <a href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99"/>
 <a href="quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd"/>
 <a href="quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494"/>
-<a href="quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c"/>
 <a href="quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f"/>
 <a href="quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad"/>
 <a href="quantized_8h.html#a803e4d5a1459844ba647aea5b004e133"/>
@@ -6015,26 +6098,25 @@
 <a href="readwrite_8h.html"/>
 <a href="readwrite_8h.html#a7b6e56afa21f022c5e754b000955735a"/>
 <a href="reduce__all_8h.html"/>
-<a href="reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d"/>
+<a href="reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8"/>
 <a href="reduce__col_8h.html"/>
-<a href="reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d"/>
-<a href="reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385"/>
-<a href="reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb"/>
-<a href="reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5"/>
+<a href="reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec"/>
+<a href="reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29"/>
+<a href="reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2"/>
+<a href="reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02"/>
 <a href="reduce__init_8h.html"/>
 <a href="reduce__init_8h.html#a0088604ac2eaa6940689ff12c4ba5fc2"/>
 <a href="reduce__row_8h.html"/>
 <a href="reduce__row_8h.html#a045ec34228e77c79ec67d11c39ff097a"/>
-<a href="reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da"/>
 <a href="reduce__row_8h.html#a4d00c44e5f4a13be529ff8b664a0a342"/>
 <a href="reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f"/>
 <a href="reduce__row_8h.html#aa146bb611069fd2892f03714fd1cc3cf"/>
-<a href="reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b"/>
-<a href="reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae"/>
+<a href="reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e"/>
+<a href="reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf"/>
+<a href="reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49"/>
 <a href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1"/>
 <a href="reduce__utils_8h.html"/>
 <a href="resident_8h.html"/>
-<a href="scaled__dot__product__attention__params_8h.html"/>
 <a href="scan_8h.html"/>
 <a href="scan_8h.html#a0d8d6a9b0f3a1263629380bda8eca7bc"/>
 <a href="scan_8h.html#a185f66aac8c5317587e6abd43f3013fc"/>
@@ -6045,7 +6127,7 @@
 <a href="scan_8h.html#ae86aef08e5ebc8790031eb51eefa754c"/>
 <a href="scan_8h.html#ae8eb101e538b85f8a4bcf451489ae0ac"/>
 <a href="scatter_8h.html"/>
-<a href="scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1"/>
+<a href="scatter_8h.html#a0df7206d4519defb48a6275afc12f87c"/>
 <a href="scheduler_8h.html"/>
 <a href="scheduler_8h.html#a1d06ffdbab36790b78deb6e34adc737f"/>
 <a href="scheduler_8h.html#a6b7289e33cef665178fe614aac75c1b2"/>
@@ -6054,7 +6136,9 @@
 <a href="scheduler_8h.html#aa2d4eacf5d5cbc778a51aafd4fd8e4d7"/>
 <a href="scheduler_8h.html#ae856e468c2f7c8f8ec672522cc13730b"/>
 <a href="sdpa__vector_8h.html"/>
+<a href="sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe"/>
 <a href="sdpa__vector_8h.html#a4bf36f16e16c1c62d9b243573568e5ae"/>
+<a href="sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74"/>
 <a href="sort_8h.html"/>
 <a href="sort_8h.html#a0386011c52d03e60885a31e6fbd903dd"/>
 <a href="sort_8h.html#a32cbe4163b8b0f5cb2c97b256119a4b2"/>
@@ -6068,6 +6152,10 @@
 <a href="steel_2defines_8h.html"/>
 <a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6"/>
 <a href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b"/>
+<a href="steel__attention_8h.html"/>
+<a href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982"/>
+<a href="steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33"/>
+<a href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416"/>
 <a href="steel__conv__general_8h.html"/>
 <a href="steel__conv__general_8h.html#ad620c0656ae92e36dcb4a285d9e790ee"/>
 <a href="steel__gemm_8h.html"/>
@@ -6198,6 +6286,8 @@
 <a href="struct_cum_sum.html"/>
 <a href="struct_div_mod.html"/>
 <a href="struct_div_mod.html#a8b5758f2ea18d4c903b462331b25abfe"/>
+<a href="struct_div_op.html"/>
+<a href="struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221"/>
 <a href="struct_divide.html"/>
 <a href="struct_divide.html#a0a16b9194abc2ab7c61129f81a9bbb3d"/>
 <a href="struct_equal.html"/>
@@ -6209,6 +6299,8 @@
 <a href="struct_exp.html"/>
 <a href="struct_exp.html#a2b341ac400c4d145397950eb60734336"/>
 <a href="struct_exp.html#a5ef395868e055348c0802fd5fe45669c"/>
+<a href="struct_exp_sub_op.html"/>
+<a href="struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334"/>
 <a href="struct_expm1.html"/>
 <a href="struct_expm1.html#a4b834d42cf0b84daf03fec62c222091a"/>
 <a href="struct_floor.html"/>
@@ -6367,6 +6459,29 @@
 <a href="struct_logical_not.html#a8a620bac957ab8c09ac85adfddd96708"/>
 <a href="struct_logical_or.html"/>
 <a href="struct_logical_or.html#ade6a931324a604a3119d2220d6f5460d"/>
+<a href="struct_looped_elem_to_loc.html"/>
+<a href="struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205"/>
+<a href="struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b"/>
+<a href="struct_looped_elem_to_loc.html#a7da7bd04e79ba86f71c535b5a6ec1a2d"/>
+<a href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40"/>
+<a href="struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e"/>
+<a href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333"/>
+<a href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791"/>
+<a href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af792b1fd4e8286f97b9b863c127a2d9a"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af8f2b29946324756c09951b69e170dd8"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a2cd3b616739b3d5b41e5b46ae335957d"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a3a18944c158e2747a6ddebb420299a3b"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a8fe55b3a2fa8cd35af568085faed785d"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86"/>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c"/>
 <a href="struct_m_l_x_conv_params.html"/>
 <a href="struct_m_l_x_conv_params.html#a0953063962ac3b5a027243289e72fbb2"/>
 <a href="struct_m_l_x_conv_params.html#a0c8b2cfc26859a2af9d39a2cfcc3aea6"/>
@@ -6383,35 +6498,10 @@
 <a href="struct_m_l_x_conv_params.html#ae84a9afb3a95b57e0b763bb0ebda0753"/>
 <a href="struct_m_l_x_conv_params.html#af7a5590ac0974c7841c7f8b9fda0cbed"/>
 <a href="struct_m_l_x_conv_params.html#af900fdb4c4d4ea35eed02940dee8d4d1"/>
-<a href="struct_m_l_x_fast_attention_params.html"/>
-<a href="struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad"/>
-<a href="struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21"/>
-<a href="struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b"/>
-<a href="struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5"/>
-<a href="struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a"/>
-<a href="struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c"/>
-<a href="struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7"/>
-<a href="struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8"/>
-<a href="struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029"/>
-<a href="struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2"/>
-<a href="struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3"/>
-<a href="struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477"/>
-<a href="struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1"/>
-<a href="struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c"/>
-<a href="struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167"/>
-<a href="struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803"/>
-<a href="struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23"/>
-<a href="struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2"/>
-<a href="struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b"/>
-<a href="struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params.html"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7"/>
-<a href="struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644"/>
 <a href="struct_max.html"/>
 <a href="struct_max.html#adfee65117dbf49404241861d374b9c4d"/>
+<a href="struct_max_op.html"/>
+<a href="struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e"/>
 <a href="struct_maximum.html"/>
 <a href="struct_maximum.html#a0bc8fadc87f2c49fc440d625bfc97ca6"/>
 <a href="struct_maximum.html#a3ea0f42bc4cd80b68a98f189f9fa859c"/>
@@ -6421,6 +6511,8 @@
 <a href="struct_minimum.html#a0c939921de87ab9c6959238aac81a059"/>
 <a href="struct_minimum.html#a800fba087280f79c2f7e9aff75bed093"/>
 <a href="struct_minimum.html#aa6113dfac3986c0f571fa53f65c5330e"/>
+<a href="struct_mul_op.html"/>
+<a href="struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756"/>
 <a href="struct_multiply.html"/>
 <a href="struct_multiply.html#a1327fc5a0713931afe997b0d4d2988e0"/>
 <a href="struct_na_n_equal.html"/>
@@ -6446,6 +6538,7 @@
 <a href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6"/>
 <a href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba"/>
 <a href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475"/>
+<a href="struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589"/>
 <a href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf"/>
 <a href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb"/>
 <a href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9"/>
@@ -6455,10 +6548,10 @@
 <a href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc"/>
 <a href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d"/>
 <a href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83"/>
+<a href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76"/>
 <a href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320"/>
-<a href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b"/>
+<a href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db"/>
 <a href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00"/>
-<a href="struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93"/>
 <a href="struct_read_writer.html"/>
 <a href="struct_read_writer.html#a08e10626fbc789b6dff9172fd6c36f7c"/>
 <a href="struct_read_writer.html#a0935b946b8bf2e769427fcbf2da2f7be"/>
@@ -6537,9 +6630,13 @@
 <a href="struct_sqrt.html#ab9b16d2b9b03a1c54190f4479a56a4ad"/>
 <a href="struct_square.html"/>
 <a href="struct_square.html#afde739fc544e45dd30964c02dca94310"/>
+<a href="struct_sub_op.html"/>
+<a href="struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143"/>
 <a href="struct_subtract.html"/>
 <a href="struct_subtract.html#ae0856cd8d449074ca287baa7e460f68a"/>
 <a href="struct_sum.html"/>
+<a href="struct_sum_op.html"/>
+<a href="struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d"/>
 <a href="struct_tan.html"/>
 <a href="struct_tan.html#a1e6fb8c691621c69cb9bd393de4f6e78"/>
 <a href="struct_tan.html#a2ef120c9f92b0d2e9cec8389eda05724"/>
@@ -6548,6 +6645,10 @@
 <a href="struct_tanh.html#adce11a7ad33226c6ecff34f46f5c45d7"/>
 <a href="struct_thread_sort.html"/>
 <a href="struct_thread_sort.html#ad9ab3e6b47f7e9b91c0f3b773596986d"/>
+<a href="struct_transform_scale.html"/>
+<a href="struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16"/>
+<a href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6"/>
+<a href="struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70"/>
 <a href="structcomplex64__t.html"/>
 <a href="structcomplex64__t.html#a0a27a41206400f1e62b60ceb56960c93"/>
 <a href="structcomplex64__t.html#a29782289bb90d6294099667b86509cd3"/>
@@ -6562,22 +6663,6 @@
 <a href="structcomplex64__t.html#ac33e2e5263fec76a4fb4418c6e1d8d14"/>
 <a href="structcomplex64__t.html#ac81b486f642fb3b26c5d659917bdbcd0"/>
 <a href="structcomplex64__t.html#adbd392a5e92d31997380ad0a38be4be8"/>
-<a href="structlooped__elem__to__loc.html"/>
-<a href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca"/>
-<a href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0"/>
-<a href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a"/>
-<a href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189"/>
-<a href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2"/>
-<a href="structlooped__elem__to__loc.html#add610f331ef8d7d2d1917050890f82b2"/>
-<a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html"/>
-<a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a1064cdfdcef779b5628ce5357a6fe4f0"/>
-<a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2"/>
-<a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0"/>
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html"/>
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90"/>
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a7aebc0b0656e3a55d0dbca27a57d600e"/>
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4"/>
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#af2984b35f7d7300d4812e7872b3c8851"/>
 <a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html"/>
 <a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b"/>
 <a href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a13829f8c7a7c0efdc8946eff5d3c9470"/>
@@ -6921,19 +7006,25 @@
 <a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a"/>
 <a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html"/>
-<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a2334774486f447213ee997e55c2e52a3"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522"/>
-<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a7375adf9ee5355bcf4b7f5f210efd115"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9b6dd221ccd2d939d544004cb6279198"/>
-<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#abc52d18ea87d213c47fd26062c829849"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ac68ca977b5bde5434284ce7979647f14"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f"/>
+<a href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#a28bafec56edec3091e8716d8ccfb6ee1"/>
 <a href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#aee044d7729739c96e845823f9ecc5174"/>
@@ -6964,26 +7055,45 @@
 <a href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#ac528109a11abcb82e6e221c5efa4493c"/>
 <a href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#adf608e22d0c0397217472408aab52631"/>
 <a href="structmlx_1_1steel_1_1_accum_helper.html"/>
-<a href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da"/>
+<a href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c"/>
+<a href="structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag.html"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4"/>
+<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a3c34dfdc944db110f4735f1b25307cf0"/>
-<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9"/>
+<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970"/>
+<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3"/>
+<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887"/>
+<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb"/>
+<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c"/>
 <a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d"/>
-<a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b"/>
-<a href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8"/>
@@ -6994,23 +7104,41 @@
 <a href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d"/>
+<a href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa"/>
 <a href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf"/>
-<a href="structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec"/>
+<a href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2"/>
 <a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html"/>
-<a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347"/>
+<a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321"/>
+<a href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562"/>
-<a href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26"/>
+<a href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142"/>
+<a href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0"/>
+<a href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3"/>
-<a href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88"/>
+<a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8231b0e3475077c1381eb8f5daf62e35"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330"/>
@@ -7022,13 +7150,14 @@
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a"/>
-<a href="structmlx_1_1steel_1_1_block_m_m_a.html#ae2c42cb6d0dde785859164c195f4d13c"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d"/>
-<a href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c"/>
 <a href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff"/>
 <a href="structmlx_1_1steel_1_1_block_swizzle.html"/>
 <a href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760"/>
+<a href="structmlx_1_1steel_1_1_c_shape.html"/>
+<a href="structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901"/>
+<a href="structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993"/>
 <a href="structmlx_1_1steel_1_1_channel_helper.html"/>
 <a href="structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925"/>
 <a href="structmlx_1_1steel_1_1_channel_helper.html#aa476bd0fcb38494c268547fc9820fc0a"/>
@@ -7229,15 +7358,15 @@
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298"/>
+<a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1a115d5af0fb6e260165adba2e377635"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64"/>
-<a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa8a04ed74d2259f99b337d4662c64d83"/>
-<a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa98f32278b5fd98c93ae5483c3596395"/>
+<a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a98b6ec692580510081e2aa887a61944b"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0"/>
-<a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#add8c6a31011a4895667c2a94a5af3782"/>
+<a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ae52eb09c9478cd4f199662346ac0c83e"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_params.html"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9"/>
 <a href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed"/>
@@ -7277,35 +7406,46 @@
 <a href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#acf168c72f4a86b72b8f5f386f07c9d8c"/>
 <a href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840"/>
 <a href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba"/>
+<a href="structmlx_1_1steel_1_1_layout2_d.html"/>
+<a href="structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd"/>
+<a href="structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1"/>
 <a href="structmlx_1_1steel_1_1_loop_alignment.html"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a"/>
-<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7"/>
-<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f"/>
-<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9"/>
-<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382"/>
-<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44"/>
+<a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3"/>
 <a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6"/>
+<a href="structmlx_1_1steel_1_1_shape2_d.html"/>
+<a href="structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c"/>
+<a href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe"/>
+<a href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e"/>
 <a href="structmlx_1_1steel_1_1_transform_add.html"/>
 <a href="structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19"/>
 <a href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae"/>
@@ -7633,6 +7773,7 @@
 <a href="utils_8h.html#a0023c267cf81345fad65e7a797954cd3"/>
 <a href="utils_8h.html#a0181b5d72bf3d34448dabc70f7ff858d"/>
 <a href="utils_8h.html#a075e07def338cd9d815182d0e6a656c0"/>
+<a href="utils_8h.html#a0efecbf9efe695adafad12b5a4945df3"/>
 <a href="utils_8h.html#a123331f01188bd76e37623b63b6b4340"/>
 <a href="utils_8h.html#a1e5c30e316afa30c14bc48b92afdb794"/>
 <a href="utils_8h.html#a1fd58658474fb842d648dcf8f7d9f078"/>
@@ -7649,9 +7790,11 @@
 <a href="utils_8h.html#a8b984eef832f757e28cd262d64a49ae7"/>
 <a href="utils_8h.html#a95fc1013cc48fbfee0c54310711a5e58"/>
 <a href="utils_8h.html#aafaf24a28297428caf6d0c36c623489e"/>
+<a href="utils_8h.html#ac3266e1259a64c8b56bdc6c7029179f2"/>
 <a href="utils_8h.html#ac457c232f956ba802acb69c5a621633d"/>
 <a href="utils_8h.html#ad4b664de4a4abd305827b30879b9da33"/>
 <a href="utils_8h.html#adacbc4526e8964b267a8ec3eb1bc1a32"/>
+<a href="utils_8h.html#aedbf4e739553024c33dd0094dd9107aa"/>
 <a href="utils_8h.html#af5a408a78cc934717dd711ddfda58ea6"/>
 </body>
 </html>
diff --git a/docs/build/html/erf_8h_source.html b/docs/build/html/erf_8h_source.html
index 720eda103..d1fa7b458 100644
--- a/docs/build/html/erf_8h_source.html
+++ b/docs/build/html/erf_8h_source.html
@@ -166,10 +166,10 @@ $(function(){ initResizable(false); });
 </div>
 <div class="ttc" id="aerf_8h_html_a1846e0d683c7aff826bb32addcc3b885"><div class="ttname"><a href="erf_8h.html#a1846e0d683c7aff826bb32addcc3b885">erfinv</a></div><div class="ttdeci">float erfinv(float a)</div><div class="ttdef"><b>Definition</b> erf.h:42</div></div>
 <div class="ttc" id="aerf_8h_html_a6ce199ee56105c67adbf8c48c019a8b2"><div class="ttname"><a href="erf_8h.html#a6ce199ee56105c67adbf8c48c019a8b2">erf</a></div><div class="ttdeci">float erf(float a)</div><div class="ttdef"><b>Definition</b> erf.h:11</div></div>
-<div class="ttc" id="anamespacemetal_html_a423a9f4f2fc7ef5ec7eda061277b51b6"><div class="ttname"><a href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a6301a78d69ff14a06194ca85a0c7d326"><div class="ttname"><a href="namespacemetal.html#a6301a78d69ff14a06194ca85a0c7d326">metal::fma</a></div><div class="ttdeci">METAL_FUNC bfloat16_t fma(bfloat16_t x, bfloat16_t y, bfloat16_t z)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a87c5122c60f9a12afceb9925a5b78ffb"><div class="ttname"><a href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a></div><div class="ttdeci">METAL_FUNC bfloat16_t abs(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_ac2a0b3618d922ac014baac8189d44650"><div class="ttname"><a href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="ttc" id="anamespacemetal_html_a423a9f4f2fc7ef5ec7eda061277b51b6"><div class="ttname"><a href="namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6">metal::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a6301a78d69ff14a06194ca85a0c7d326"><div class="ttname"><a href="namespacemetal.html#a6301a78d69ff14a06194ca85a0c7d326">metal::fma</a></div><div class="ttdeci">METAL_FUNC bfloat16_t fma(bfloat16_t x, bfloat16_t y, bfloat16_t z)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a87c5122c60f9a12afceb9925a5b78ffb"><div class="ttname"><a href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a></div><div class="ttdeci">METAL_FUNC bfloat16_t abs(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_ac2a0b3618d922ac014baac8189d44650"><div class="ttname"><a href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
 <div class="ttc" id="atypes_2bf16_8h_html_aa21e554721eddcf127b7fcfa7fdc56bd"><div class="ttname"><a href="types_2bf16_8h.html#aa21e554721eddcf127b7fcfa7fdc56bd">u</a></div><div class="ttdeci">uint32_t u</div><div class="ttdef"><b>Definition</b> bf16.h:17</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/examples/linear_regression.html b/docs/build/html/examples/linear_regression.html
index f5f704fa9..0cb7ddf61 100644
--- a/docs/build/html/examples/linear_regression.html
+++ b/docs/build/html/examples/linear_regression.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Linear Regression &#8212; MLX 0.20.0 documentation</title>
+    <title>Linear Regression &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Using Streams" href="../usage/using_streams.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/examples/llama-inference.html b/docs/build/html/examples/llama-inference.html
index fe42132f5..286a271a1 100644
--- a/docs/build/html/examples/llama-inference.html
+++ b/docs/build/html/examples/llama-inference.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>LLM inference &#8212; MLX 0.20.0 documentation</title>
+    <title>LLM inference &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Multi-Layer Perceptron" href="mlp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/examples/mlp.html b/docs/build/html/examples/mlp.html
index e30b0d5e5..254db7d81 100644
--- a/docs/build/html/examples/mlp.html
+++ b/docs/build/html/examples/mlp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Multi-Layer Perceptron &#8212; MLX 0.20.0 documentation</title>
+    <title>Multi-Layer Perceptron &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Linear Regression" href="linear_regression.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/fast_8h.html b/docs/build/html/fast_8h.html
index 990ee371b..f584102d8 100644
--- a/docs/build/html/fast_8h.html
+++ b/docs/build/html/fast_8h.html
@@ -129,8 +129,6 @@ Functions</h2></td></tr>
 <tr class="separator:a3663b50265b0a9c0cca2b5376852e059"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aa4b5f6886b2288cb6dfdd8598579f080" id="r_aa4b5f6886b2288cb6dfdd8598579f080"><td class="memItemLeft" align="right" valign="top">std::tuple&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a>, <a class="el" href="classmlx_1_1core_1_1array.html">array</a>, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080">mlx::core::fast::affine_quantize</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;w, int group_size=64, int bits=4, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
 <tr class="separator:aa4b5f6886b2288cb6dfdd8598579f080"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a638c7e9b9ea8677f01786d8f9738baf8" id="r_a638c7e9b9ea8677f01786d8f9738baf8"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1fast.html#a638c7e9b9ea8677f01786d8f9738baf8">mlx::core::fast::affine_quantize</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;w, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;scales, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;biases, int group_size=64, int bits=4, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
-<tr class="separator:a638c7e9b9ea8677f01786d8f9738baf8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a12c7ef41409d6fb378008e67b6fab328" id="r_a12c7ef41409d6fb378008e67b6fab328"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328">mlx::core::fast::affine_dequantize</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;w, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;scales, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;biases, int group_size=64, int bits=4, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
 <tr class="separator:a12c7ef41409d6fb378008e67b6fab328"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ab16436b465dc10ce472193d541d8426e" id="r_ab16436b465dc10ce472193d541d8426e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">MetalKernelFunction</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e">mlx::core::fast::metal_kernel</a> (const std::string &amp;name, const std::vector&lt; std::string &gt; &amp;input_names, const std::vector&lt; std::string &gt; &amp;output_names, const std::string &amp;source, const std::string &amp;header=&quot;&quot;, bool ensure_row_contiguous=true, bool atomic_outputs=false)</td></tr>
diff --git a/docs/build/html/fast_8h_source.html b/docs/build/html/fast_8h_source.html
index 90326ce03..3a1a368a8 100644
--- a/docs/build/html/fast_8h_source.html
+++ b/docs/build/html/fast_8h_source.html
@@ -140,7 +140,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>    <span class="keywordtype">int</span> bits = 4,</div>
 <div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
 <div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span> </div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#a638c7e9b9ea8677f01786d8f9738baf8">   50</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080">affine_quantize</a>(</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328">   50</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328">affine_dequantize</a>(</div>
 <div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; w,</div>
 <div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; scales,</div>
 <div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; biases,</div>
@@ -148,47 +148,39 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>    <span class="keywordtype">int</span> bits = 4,</div>
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
 <div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span> </div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328">   58</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328">affine_dequantize</a>(</div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; w,</div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; scales,</div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; biases,</div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>    <span class="keywordtype">int</span> group_size = 64,</div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    <span class="keywordtype">int</span> bits = 4,</div>
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span> </div>
-<div class="line"><a id="l00066" name="l00066"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9">   66</a></span><span class="keyword">typedef</span> std::variant&lt;int, bool, Dtype&gt; <a class="code hl_typedef" href="namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9">TemplateArg</a>;</div>
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span> </div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span><span class="keyword">typedef</span> std::function&lt;std::vector&lt;array&gt;(</div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>    <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>    <span class="keyword">const</span> std::vector&lt;std::vector&lt;int&gt;&gt;&amp;,</div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>    <span class="keyword">const</span> std::vector&lt;Dtype&gt;&amp;,</div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>    std::tuple&lt;int, int, int&gt;,</div>
-<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>    std::tuple&lt;int, int, int&gt;,</div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>    std::vector&lt;std::pair&lt;std::string, TemplateArg&gt;&gt;,</div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>    std::optional&lt;float&gt;,</div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    <span class="keywordtype">bool</span>,</div>
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a>)&gt;</div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">   78</a></span>    <a class="code hl_typedef" href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">MetalKernelFunction</a>;</div>
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span> </div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e">   80</a></span><a class="code hl_typedef" href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">MetalKernelFunction</a> <a class="code hl_function" href="namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e">metal_kernel</a>(</div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    <span class="keyword">const</span> std::string&amp; name,</div>
-<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    <span class="keyword">const</span> std::vector&lt;std::string&gt;&amp; input_names,</div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    <span class="keyword">const</span> std::vector&lt;std::string&gt;&amp; output_names,</div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    <span class="keyword">const</span> std::string&amp; source,</div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>    <span class="keyword">const</span> std::string&amp; header = <span class="stringliteral">&quot;&quot;</span>,</div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>    <span class="keywordtype">bool</span> ensure_row_contiguous = <span class="keyword">true</span>,</div>
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>    <span class="keywordtype">bool</span> atomic_outputs = <span class="keyword">false</span>);</div>
-<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span> </div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>} <span class="comment">// namespace mlx::core::fast</span></div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9">   58</a></span><span class="keyword">typedef</span> std::variant&lt;int, bool, Dtype&gt; <a class="code hl_typedef" href="namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9">TemplateArg</a>;</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span> </div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span><span class="keyword">typedef</span> std::function&lt;std::vector&lt;array&gt;(</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>    <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>    <span class="keyword">const</span> std::vector&lt;std::vector&lt;int&gt;&gt;&amp;,</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    <span class="keyword">const</span> std::vector&lt;Dtype&gt;&amp;,</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    std::tuple&lt;int, int, int&gt;,</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>    std::tuple&lt;int, int, int&gt;,</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>    std::vector&lt;std::pair&lt;std::string, TemplateArg&gt;&gt;,</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    std::optional&lt;float&gt;,</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    <span class="keywordtype">bool</span>,</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a>)&gt;</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">   70</a></span>    <a class="code hl_typedef" href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">MetalKernelFunction</a>;</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span> </div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e">   72</a></span><a class="code hl_typedef" href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">MetalKernelFunction</a> <a class="code hl_function" href="namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e">metal_kernel</a>(</div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>    <span class="keyword">const</span> std::string&amp; name,</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>    <span class="keyword">const</span> std::vector&lt;std::string&gt;&amp; input_names,</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>    <span class="keyword">const</span> std::vector&lt;std::string&gt;&amp; output_names,</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    <span class="keyword">const</span> std::string&amp; source,</div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    <span class="keyword">const</span> std::string&amp; header = <span class="stringliteral">&quot;&quot;</span>,</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>    <span class="keywordtype">bool</span> ensure_row_contiguous = <span class="keyword">true</span>,</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    <span class="keywordtype">bool</span> atomic_outputs = <span class="keyword">false</span>);</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span> </div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>} <span class="comment">// namespace mlx::core::fast</span></div>
 </div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a></div><div class="ttdef"><b>Definition</b> fast.h:9</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a01bd533ebd0e2415c4ee30032d51d7bf"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a01bd533ebd0e2415c4ee30032d51d7bf">mlx::core::fast::layer_norm</a></div><div class="ttdeci">array layer_norm(const array &amp;x, const std::optional&lt; array &gt; &amp;weight, const std::optional&lt; array &gt; &amp;bias, float eps, StreamOrDevice s={})</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a0e8c2c4ea7a946568c8fe5b4810417e0"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">mlx::core::fast::MetalKernelFunction</a></div><div class="ttdeci">std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; std::vector&lt; int &gt; &gt; &amp;, const std::vector&lt; Dtype &gt; &amp;, std::tuple&lt; int, int, int &gt;, std::tuple&lt; int, int, int &gt;, std::vector&lt; std::pair&lt; std::string, TemplateArg &gt; &gt;, std::optional&lt; float &gt;, bool, StreamOrDevice)&gt; MetalKernelFunction</div><div class="ttdef"><b>Definition</b> fast.h:78</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a0e8c2c4ea7a946568c8fe5b4810417e0"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0">mlx::core::fast::MetalKernelFunction</a></div><div class="ttdeci">std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; std::vector&lt; int &gt; &gt; &amp;, const std::vector&lt; Dtype &gt; &amp;, std::tuple&lt; int, int, int &gt;, std::tuple&lt; int, int, int &gt;, std::vector&lt; std::pair&lt; std::string, TemplateArg &gt; &gt;, std::optional&lt; float &gt;, bool, StreamOrDevice)&gt; MetalKernelFunction</div><div class="ttdef"><b>Definition</b> fast.h:70</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a12c7ef41409d6fb378008e67b6fab328"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328">mlx::core::fast::affine_dequantize</a></div><div class="ttdeci">array affine_dequantize(const array &amp;w, const array &amp;scales, const array &amp;biases, int group_size=64, int bits=4, StreamOrDevice s={})</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a3663b50265b0a9c0cca2b5376852e059"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a3663b50265b0a9c0cca2b5376852e059">mlx::core::fast::scaled_dot_product_attention</a></div><div class="ttdeci">array scaled_dot_product_attention(const array &amp;queries, const array &amp;keys, const array &amp;values, const float scale, const std::optional&lt; array &gt; &amp;mask=std::nullopt, const std::optional&lt; int &gt; memory_efficient_threshold=std::nullopt, StreamOrDevice s={})</div><div class="ttdoc">Computes: O = softmax(Q @ K.T) @ V.</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a534ef357eae24892684a6ecd866d3fab"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a534ef357eae24892684a6ecd866d3fab">mlx::core::fast::rope</a></div><div class="ttdeci">array rope(const array &amp;x, int dims, bool traditional, std::optional&lt; float &gt; base, float scale, int offset, const std::optional&lt; array &gt; &amp;freqs=std::nullopt, StreamOrDevice s={})</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a9390693ff7be931f3ef3428e2ea4c3f9"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9">mlx::core::fast::TemplateArg</a></div><div class="ttdeci">std::variant&lt; int, bool, Dtype &gt; TemplateArg</div><div class="ttdef"><b>Definition</b> fast.h:66</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_a9390693ff7be931f3ef3428e2ea4c3f9"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9">mlx::core::fast::TemplateArg</a></div><div class="ttdeci">std::variant&lt; int, bool, Dtype &gt; TemplateArg</div><div class="ttdef"><b>Definition</b> fast.h:58</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_aa4b5f6886b2288cb6dfdd8598579f080"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080">mlx::core::fast::affine_quantize</a></div><div class="ttdeci">std::tuple&lt; array, array, array &gt; affine_quantize(const array &amp;w, int group_size=64, int bits=4, StreamOrDevice s={})</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_ab16436b465dc10ce472193d541d8426e"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e">mlx::core::fast::metal_kernel</a></div><div class="ttdeci">MetalKernelFunction metal_kernel(const std::string &amp;name, const std::vector&lt; std::string &gt; &amp;input_names, const std::vector&lt; std::string &gt; &amp;output_names, const std::string &amp;source, const std::string &amp;header=&quot;&quot;, bool ensure_row_contiguous=true, bool atomic_outputs=false)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html_ac7b620275c6386f822b7aacc6b312e62"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html#ac7b620275c6386f822b7aacc6b312e62">mlx::core::fast::rms_norm</a></div><div class="ttdeci">array rms_norm(const array &amp;x, const array &amp;weight, float eps, StreamOrDevice s={})</div></div>
diff --git a/docs/build/html/fast__primitives_8h_source.html b/docs/build/html/fast__primitives_8h_source.html
index 5780e0cde..68274176a 100644
--- a/docs/build/html/fast__primitives_8h_source.html
+++ b/docs/build/html/fast__primitives_8h_source.html
@@ -365,87 +365,83 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>        dequantize_(<a class="code hl_function" href="group__ops.html#gabff758a5c1ce32ad7e8b78aba0164077">dequantize</a>) {}</div>
 </div>
 <div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span> </div>
-<div class="foldopen" id="foldopen00230" data-start="{" data-end="}">
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">  230</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)<span class="keyword"></span></div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span><span class="keyword">      override </span>{</div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>    <span class="keywordflow">throw</span> std::runtime_error(<span class="stringliteral">&quot;NYI&quot;</span>);</div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>  }</div>
-</div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span> </div>
-<div class="line"><a id="l00235" name="l00235"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">  235</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">  230</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span> </div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">  233</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span> </div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587">  236</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">AffineQuantize</a>);</div>
 <div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span> </div>
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587">  238</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">AffineQuantize</a>);</div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span> </div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>  std::function&lt;std::vector&lt;array&gt;(std::vector&lt;array&gt;)&gt; fallback_;</div>
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>  <span class="keywordtype">int</span> group_size_;</div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>  <span class="keywordtype">int</span> bits_;</div>
-<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>  <span class="keywordtype">bool</span> dequantize_;</div>
-<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>};</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>  std::function&lt;std::vector&lt;array&gt;(std::vector&lt;array&gt;)&gt; fallback_;</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>  <span class="keywordtype">int</span> group_size_;</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>  <span class="keywordtype">int</span> bits_;</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>  <span class="keywordtype">bool</span> dequantize_;</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>};</div>
 </div>
-<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span> </div>
-<div class="foldopen" id="foldopen00247" data-start="{" data-end="};">
-<div class="line"><a id="l00247" name="l00247"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">  247</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">CustomKernelShapeInfo</a> {</div>
-<div class="line"><a id="l00248" name="l00248"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">  248</a></span>  <span class="keywordtype">bool</span> <a class="code hl_variable" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">shape</a> = <span class="keyword">false</span>;</div>
-<div class="line"><a id="l00249" name="l00249"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2">  249</a></span>  <span class="keywordtype">bool</span> <a class="code hl_variable" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2">strides</a> = <span class="keyword">false</span>;</div>
-<div class="line"><a id="l00250" name="l00250"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051">  250</a></span>  <span class="keywordtype">bool</span> <a class="code hl_variable" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051">ndim</a> = <span class="keyword">false</span>;</div>
-<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>};</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span> </div>
+<div class="foldopen" id="foldopen00245" data-start="{" data-end="};">
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">  245</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">CustomKernelShapeInfo</a> {</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">  246</a></span>  <span class="keywordtype">bool</span> <a class="code hl_variable" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">shape</a> = <span class="keyword">false</span>;</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2">  247</a></span>  <span class="keywordtype">bool</span> <a class="code hl_variable" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2">strides</a> = <span class="keyword">false</span>;</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno"><a class="line" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051">  248</a></span>  <span class="keywordtype">bool</span> <a class="code hl_variable" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051">ndim</a> = <span class="keyword">false</span>;</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>};</div>
 </div>
-<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span> </div>
-<div class="foldopen" id="foldopen00253" data-start="{" data-end="};">
-<div class="line"><a id="l00253" name="l00253"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">  253</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">CustomKernel</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen00255" data-start="{" data-end="}">
-<div class="line"><a id="l00255" name="l00255"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153">  255</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153">CustomKernel</a>(</div>
-<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> <a class="code hl_function" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>,</div>
-<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>      std::string name,</div>
-<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>      std::string source,</div>
-<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>      std::tuple&lt;int, int, int&gt; grid,</div>
-<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>      std::tuple&lt;int, int, int&gt; threadgroup,</div>
-<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>      std::vector&lt;CustomKernelShapeInfo&gt; shape_infos,</div>
-<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>      <span class="keywordtype">bool</span> ensure_row_contiguous,</div>
-<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>      std::optional&lt;float&gt; init_value)</div>
-<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(<a class="code hl_function" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>),</div>
-<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>        source_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(source)),</div>
-<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>        name_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(name)),</div>
-<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>        grid_(grid),</div>
-<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>        threadgroup_(threadgroup),</div>
-<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>        shape_infos_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(shape_infos)),</div>
-<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>        ensure_row_contiguous_(ensure_row_contiguous),</div>
-<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>        init_value_(init_value) {}</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span> </div>
+<div class="foldopen" id="foldopen00251" data-start="{" data-end="};">
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">  251</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">CustomKernel</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen00253" data-start="{" data-end="}">
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153">  253</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153">CustomKernel</a>(</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> <a class="code hl_function" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>,</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>      std::string name,</div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>      std::string source,</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>      std::tuple&lt;int, int, int&gt; grid,</div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>      std::tuple&lt;int, int, int&gt; threadgroup,</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>      std::vector&lt;CustomKernelShapeInfo&gt; shape_infos,</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>      <span class="keywordtype">bool</span> ensure_row_contiguous,</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>      std::optional&lt;float&gt; init_value)</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(<a class="code hl_function" href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">stream</a>),</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>        source_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(source)),</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>        name_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(name)),</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>        grid_(grid),</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>        threadgroup_(threadgroup),</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>        shape_infos_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(shape_infos)),</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>        ensure_row_contiguous_(ensure_row_contiguous),</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>        init_value_(init_value) {}</div>
 </div>
-<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span> </div>
-<div class="foldopen" id="foldopen00273" data-start="{" data-end="}">
-<div class="line"><a id="l00273" name="l00273"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">  273</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)<span class="keyword"></span></div>
-<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span><span class="keyword">      override </span>{</div>
-<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>    <span class="keywordflow">throw</span> std::runtime_error(<span class="stringliteral">&quot;Custom Metal kernels only run on GPU.&quot;</span>);</div>
-<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>  }</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span> </div>
+<div class="foldopen" id="foldopen00271" data-start="{" data-end="}">
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">  271</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)<span class="keyword"></span></div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span><span class="keyword">      override </span>{</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>    <span class="keywordflow">throw</span> std::runtime_error(<span class="stringliteral">&quot;Custom Metal kernels only run on GPU.&quot;</span>);</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>  }</div>
 </div>
-<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span> </div>
-<div class="line"><a id="l00278" name="l00278"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">  278</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span> </div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">  276</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span> </div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a116ecf31c8672c94e5ea06c1d43e9534">  279</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a116ecf31c8672c94e5ea06c1d43e9534">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">CustomKernel</a>);</div>
 <div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span> </div>
-<div class="line"><a id="l00281" name="l00281"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a116ecf31c8672c94e5ea06c1d43e9534">  281</a></span>  <a class="code hl_function" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a116ecf31c8672c94e5ea06c1d43e9534">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">CustomKernel</a>);</div>
-<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span> </div>
-<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>  std::string source_;</div>
-<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>  std::string name_;</div>
-<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>  std::tuple&lt;int, int, int&gt; grid_;</div>
-<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>  std::tuple&lt;int, int, int&gt; threadgroup_;</div>
-<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>  std::vector&lt;CustomKernelShapeInfo&gt; shape_infos_;</div>
-<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>  <span class="keywordtype">bool</span> ensure_row_contiguous_;</div>
-<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>  std::optional&lt;float&gt; init_value_;</div>
-<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>};</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>  std::string source_;</div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>  std::string name_;</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>  std::tuple&lt;int, int, int&gt; grid_;</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>  std::tuple&lt;int, int, int&gt; threadgroup_;</div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>  std::vector&lt;CustomKernelShapeInfo&gt; shape_infos_;</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>  <span class="keywordtype">bool</span> ensure_row_contiguous_;</div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>  std::optional&lt;float&gt; init_value_;</div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>};</div>
 </div>
-<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span> </div>
-<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>} <span class="comment">// namespace mlx::core::fast</span></div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span> </div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>} <span class="comment">// namespace mlx::core::fast</span></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></div><div class="ttdef"><b>Definition</b> primitives.h:48</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html_a46e6257397a662528f9f831842ac456a"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a">mlx::core::Primitive::stream</a></div><div class="ttdeci">const Stream &amp; stream()</div><div class="ttdoc">The stream the primitive will run on.</div><div class="ttdef"><b>Definition</b> primitives.h:58</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html_a6140a502af4c2bbbc776ab26e9afebcd"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd">mlx::core::Primitive::is_equivalent</a></div><div class="ttdeci">virtual bool is_equivalent(const Primitive &amp;other) const</div><div class="ttdoc">Equivalence check defaults to false unless overridden by the primitive.</div><div class="ttdef"><b>Definition</b> primitives.h:107</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_affine_quantize_html"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html">mlx::core::fast::AffineQuantize</a></div><div class="ttdef"><b>Definition</b> fast_primitives.h:217</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_affine_quantize_html_a3b5d628628d245b38911118d4a0ff9fd"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">mlx::core::fast::AffineQuantize::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div><div class="ttdef"><b>Definition</b> fast_primitives.h:230</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_affine_quantize_html_a3b5d628628d245b38911118d4a0ff9fd"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">mlx::core::fast::AffineQuantize::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_affine_quantize_html_a4b8f1b1f633002c8ca6fa8f0ef4dd587"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587">mlx::core::fast::AffineQuantize::DEFINE_PRINT</a></div><div class="ttdeci">DEFINE_PRINT(AffineQuantize)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_affine_quantize_html_a63812b2abaf26ad7e7fa4c9e82db1628"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">mlx::core::fast::AffineQuantize::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_affine_quantize_html_a84d5fa9e8c3de407fbcc5f38d2ed1473"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a84d5fa9e8c3de407fbcc5f38d2ed1473">mlx::core::fast::AffineQuantize::AffineQuantize</a></div><div class="ttdeci">AffineQuantize(Stream stream, std::function&lt; std::vector&lt; array &gt;(std::vector&lt; array &gt;)&gt; fallback, int group_size, int bits, bool dequantize)</div><div class="ttdef"><b>Definition</b> fast_primitives.h:219</div></div>
@@ -454,11 +450,11 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_html_a74be4bcd0382f7f6400bf73fd5569c91"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91">mlx::core::fast::Custom::vjp</a></div><div class="ttdeci">virtual std::vector&lt; array &gt; vjp(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotangents, const std::vector&lt; int &gt; &amp;argnums, const std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">The vector-Jacobian product.</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_html_a7f4c3a4c48c6807faa36fb31e39dad8d"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d">mlx::core::fast::Custom::vmap</a></div><div class="ttdeci">virtual std::pair&lt; std::vector&lt; array &gt;, std::vector&lt; int &gt; &gt; vmap(const std::vector&lt; array &gt; &amp;inputs, const std::vector&lt; int &gt; &amp;axes) override</div><div class="ttdoc">The primitive must know how to vectorize itself across the given axes.</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_html_ac77b28702654df8e7d882a49357a9584"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">mlx::core::fast::Custom::jvp</a></div><div class="ttdeci">virtual std::vector&lt; array &gt; jvp(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;tangents, const std::vector&lt; int &gt; &amp;argnums) override</div><div class="ttdoc">The Jacobian-vector product.</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">mlx::core::fast::CustomKernel</a></div><div class="ttdef"><b>Definition</b> fast_primitives.h:253</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html">mlx::core::fast::CustomKernel</a></div><div class="ttdef"><b>Definition</b> fast_primitives.h:251</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html_a116ecf31c8672c94e5ea06c1d43e9534"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a116ecf31c8672c94e5ea06c1d43e9534">mlx::core::fast::CustomKernel::DEFINE_PRINT</a></div><div class="ttdeci">DEFINE_PRINT(CustomKernel)</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html_a2ed2a16b23053f8195068386a99fd6db"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">mlx::core::fast::CustomKernel::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html_a4ad1b7a9919753c759093f3e21a15bad"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">mlx::core::fast::CustomKernel::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div><div class="ttdef"><b>Definition</b> fast_primitives.h:273</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html_a954893e07f0d36715b4e1e414b6f2153"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153">mlx::core::fast::CustomKernel::CustomKernel</a></div><div class="ttdeci">CustomKernel(Stream stream, std::string name, std::string source, std::tuple&lt; int, int, int &gt; grid, std::tuple&lt; int, int, int &gt; threadgroup, std::vector&lt; CustomKernelShapeInfo &gt; shape_infos, bool ensure_row_contiguous, std::optional&lt; float &gt; init_value)</div><div class="ttdef"><b>Definition</b> fast_primitives.h:255</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html_a4ad1b7a9919753c759093f3e21a15bad"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">mlx::core::fast::CustomKernel::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div><div class="ttdef"><b>Definition</b> fast_primitives.h:271</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_custom_kernel_html_a954893e07f0d36715b4e1e414b6f2153"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153">mlx::core::fast::CustomKernel::CustomKernel</a></div><div class="ttdeci">CustomKernel(Stream stream, std::string name, std::string source, std::tuple&lt; int, int, int &gt; grid, std::tuple&lt; int, int, int &gt; threadgroup, std::vector&lt; CustomKernelShapeInfo &gt; shape_infos, bool ensure_row_contiguous, std::optional&lt; float &gt; init_value)</div><div class="ttdef"><b>Definition</b> fast_primitives.h:253</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_layer_norm_html"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_layer_norm.html">mlx::core::fast::LayerNorm</a></div><div class="ttdef"><b>Definition</b> fast_primitives.h:90</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_layer_norm_html_a467fcf02b3ddf1d8b6d476b244ae3568"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a467fcf02b3ddf1d8b6d476b244ae3568">mlx::core::fast::LayerNorm::DEFINE_PRINT</a></div><div class="ttdeci">DEFINE_PRINT(LayerNorm) bool is_equivalent(const Primitive &amp;other) const override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1fast_1_1_layer_norm_html_a5ac38d50e62850589bf51ee313303153"><div class="ttname"><a href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5ac38d50e62850589bf51ee313303153">mlx::core::fast::LayerNorm::LayerNorm</a></div><div class="ttdeci">LayerNorm(Stream stream, std::function&lt; std::vector&lt; array &gt;(std::vector&lt; array &gt;)&gt; fallback, float eps)</div><div class="ttdef"><b>Definition</b> fast_primitives.h:92</div></div>
@@ -499,10 +495,10 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="anamespacemlx_1_1core_1_1fast_html"><div class="ttname"><a href="namespacemlx_1_1core_1_1fast.html">mlx::core::fast</a></div><div class="ttdef"><b>Definition</b> fast.h:9</div></div>
 <div class="ttc" id="aprimitives_8h_html"><div class="ttname"><a href="primitives_8h.html">primitives.h</a></div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1_stream_html"><div class="ttname"><a href="structmlx_1_1core_1_1_stream.html">mlx::core::Stream</a></div><div class="ttdef"><b>Definition</b> stream.h:9</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">mlx::core::fast::CustomKernelShapeInfo</a></div><div class="ttdef"><b>Definition</b> fast_primitives.h:247</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html_a63954de7da62942ec69afcaaa19d46f2"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2">mlx::core::fast::CustomKernelShapeInfo::strides</a></div><div class="ttdeci">bool strides</div><div class="ttdef"><b>Definition</b> fast_primitives.h:249</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html_a63db720fe0c2abc4b71e22a58a015f8a"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">mlx::core::fast::CustomKernelShapeInfo::shape</a></div><div class="ttdeci">bool shape</div><div class="ttdef"><b>Definition</b> fast_primitives.h:248</div></div>
-<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html_ae605df33f449872e3da9777d97008051"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051">mlx::core::fast::CustomKernelShapeInfo::ndim</a></div><div class="ttdeci">bool ndim</div><div class="ttdef"><b>Definition</b> fast_primitives.h:250</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html">mlx::core::fast::CustomKernelShapeInfo</a></div><div class="ttdef"><b>Definition</b> fast_primitives.h:245</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html_a63954de7da62942ec69afcaaa19d46f2"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2">mlx::core::fast::CustomKernelShapeInfo::strides</a></div><div class="ttdeci">bool strides</div><div class="ttdef"><b>Definition</b> fast_primitives.h:247</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html_a63db720fe0c2abc4b71e22a58a015f8a"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">mlx::core::fast::CustomKernelShapeInfo::shape</a></div><div class="ttdeci">bool shape</div><div class="ttdef"><b>Definition</b> fast_primitives.h:246</div></div>
+<div class="ttc" id="astructmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info_html_ae605df33f449872e3da9777d97008051"><div class="ttname"><a href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051">mlx::core::fast::CustomKernelShapeInfo::ndim</a></div><div class="ttdeci">bool ndim</div><div class="ttdef"><b>Definition</b> fast_primitives.h:248</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/files.html b/docs/build/html/files.html
index 70a9ad5cc..a7b33eeea 100644
--- a/docs/build/html/files.html
+++ b/docs/build/html/files.html
@@ -125,70 +125,82 @@ $(function(){ initResizable(false); });
 <tr id="row_0_1_2_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_0_')">&#9658;</span><span id="img_0_1_2_1_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_0_')">&#160;</span><a class="el" href="dir_8c751ccfa9f494753d976761a9d60a84.html" target="_self">fft</a></td><td class="desc"></td></tr>
 <tr id="row_0_1_2_1_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="radix_8h_source.html"><span class="icondoc"></span></a><a class="el" href="radix_8h.html" target="_self">radix.h</a></td><td class="desc"></td></tr>
 <tr id="row_0_1_2_1_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="readwrite_8h_source.html"><span class="icondoc"></span></a><a class="el" href="readwrite_8h.html" target="_self">readwrite.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_1_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_1_')">&#9658;</span><span id="img_0_1_2_1_1_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_1_')">&#160;</span><a class="el" href="dir_f60cd69d27fd3faa641c79056fff0e2d.html" target="_self">reduction</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2reduction_2ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2reduction_2ops_8h.html" target="_self">ops.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__all_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__all_8h.html" target="_self">reduce_all.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__col_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__col_8h.html" target="_self">reduce_col.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_1_3_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__init_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__init_8h.html" target="_self">reduce_init.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_1_4_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__row_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__row_8h.html" target="_self">reduce_row.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_')">&#9658;</span><span id="img_0_1_2_1_2_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_')">&#160;</span><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html" target="_self">steel</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_0_')">&#9658;</span><span id="img_0_1_2_1_2_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_0_')">&#160;</span><a class="el" href="dir_df9494e83ef22ae6150a0e080d9709ed.html" target="_self">conv</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_0_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_0_0_')">&#9658;</span><span id="img_0_1_2_1_2_0_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_0_0_')">&#160;</span><a class="el" href="dir_6379e541ea5051a09bc0e3fdd92fcd3b.html" target="_self">kernels</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="kernels_2steel_2conv_2kernels_2steel__conv_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2steel_2conv_2kernels_2steel__conv_8h.html" target="_self">steel_conv.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__conv__general_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__conv__general_8h.html" target="_self">steel_conv_general.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_0_1_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_0_1_')">&#9658;</span><span id="img_0_1_2_1_2_0_1_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_0_1_')">&#160;</span><a class="el" href="dir_ba4426224ef60f409462a2a12fa18f06.html" target="_self">loaders</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="loader__channel__l_8h_source.html"><span class="icondoc"></span></a><a class="el" href="loader__channel__l_8h.html" target="_self">loader_channel_l.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="loader__channel__n_8h_source.html"><span class="icondoc"></span></a><a class="el" href="loader__channel__n_8h.html" target="_self">loader_channel_n.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="loader__general_8h_source.html"><span class="icondoc"></span></a><a class="el" href="loader__general_8h.html" target="_self">loader_general.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_2_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="conv_8h_source.html"><span class="icondoc"></span></a><a class="el" href="conv_8h.html" target="_self">conv.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_3_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="conv_2loader_8h_source.html"><span class="icondoc"></span></a><a class="el" href="conv_2loader_8h.html" target="_self">loader.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_0_4_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="conv_2params_8h_source.html"><span class="icondoc"></span></a><a class="el" href="conv_2params_8h.html" target="_self">params.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_1_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_1_')">&#9658;</span><span id="img_0_1_2_1_2_1_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_1_')">&#160;</span><a class="el" href="dir_6768c99e6145fb9510ccdb40db8ede25.html" target="_self">gemm</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_1_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_1_0_')">&#9658;</span><span id="img_0_1_2_1_2_1_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_1_0_')">&#160;</span><a class="el" href="dir_9c555e3d0f5b8c3fb3a7397c81fd5bf9.html" target="_self">kernels</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__gemm__fused_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__gemm__fused_8h.html" target="_self">steel_gemm_fused.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__gemm__masked_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__gemm__masked_8h.html" target="_self">steel_gemm_masked.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_0_2_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__gemm__splitk_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__gemm__splitk_8h.html" target="_self">steel_gemm_splitk.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="gemm_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gemm_8h.html" target="_self">gemm.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="gemm_2loader_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gemm_2loader_8h.html" target="_self">loader.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_3_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="mma_8h_source.html"><span class="icondoc"></span></a><a class="el" href="mma_8h.html" target="_self">mma.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_4_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="gemm_2params_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gemm_2params_8h.html" target="_self">params.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_1_5_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html" target="_self">transforms.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_2_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_2_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_2_')">&#9658;</span><span id="img_0_1_2_1_2_2_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_2_')">&#160;</span><a class="el" href="dir_1b634d20b746ceaa770a5379eca5f24a.html" target="_self">utils</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_2_0_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="integral__constant_8h_source.html"><span class="icondoc"></span></a><a class="el" href="integral__constant_8h.html" target="_self">integral_constant.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_2_1_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="type__traits_8h_source.html"><span class="icondoc"></span></a><a class="el" href="type__traits_8h.html" target="_self">type_traits.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_3_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="steel_2defines_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel_2defines_8h.html" target="_self">defines.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_2_4_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2steel_2utils_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2steel_2utils_8h.html" target="_self">utils.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_3_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2arange_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2arange_8h.html" target="_self">arange.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_4_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="atomic_8h_source.html"><span class="icondoc"></span></a><a class="el" href="atomic_8h.html" target="_self">atomic.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_5_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2bf16_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2bf16_8h.html" target="_self">bf16.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_6_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="bf16__math_8h_source.html"><span class="icondoc"></span></a><a class="el" href="bf16__math_8h.html" target="_self">bf16_math.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_7_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2binary_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2binary_8h.html" target="_self">binary.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_8_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="binary__ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="binary__ops_8h.html" target="_self">binary_ops.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_9_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2binary__two_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2binary__two_8h.html" target="_self">binary_two.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_10_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2complex_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2complex_8h.html" target="_self">complex.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_11_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2copy_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2copy_8h.html" target="_self">copy.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_12_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="defines_8h_source.html"><span class="icondoc"></span></a><a class="el" href="defines_8h.html" target="_self">defines.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_13_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="erf_8h_source.html"><span class="icondoc"></span></a><a class="el" href="erf_8h.html" target="_self">erf.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_14_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="expm1f_8h_source.html"><span class="icondoc"></span></a><a class="el" href="expm1f_8h.html" target="_self">expm1f.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_15_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2fft_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2fft_8h.html" target="_self">fft.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_16_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="gather_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gather_8h.html" target="_self">gather.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_17_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="kernels_2gemv__masked_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2gemv__masked_8h.html" target="_self">gemv_masked.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_18_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2hadamard_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2hadamard_8h.html" target="_self">hadamard.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_19_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="kernels_2indexing_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2indexing_8h.html" target="_self">indexing.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_20_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="quantized_8h_source.html"><span class="icondoc"></span></a><a class="el" href="quantized_8h.html" target="_self">quantized.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_21_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2reduce_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2reduce_8h.html" target="_self">reduce.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_22_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="reduce__utils_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__utils_8h.html" target="_self">reduce_utils.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_23_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="scaled__dot__product__attention__params_8h_source.html"><span class="icondoc"></span></a><a class="el" href="scaled__dot__product__attention__params_8h.html" target="_self">scaled_dot_product_attention_params.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_24_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="scan_8h_source.html"><span class="icondoc"></span></a><a class="el" href="scan_8h.html" target="_self">scan.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_25_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="scatter_8h_source.html"><span class="icondoc"></span></a><a class="el" href="scatter_8h.html" target="_self">scatter.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_26_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="sdpa__vector_8h_source.html"><span class="icondoc"></span></a><a class="el" href="sdpa__vector_8h.html" target="_self">sdpa_vector.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_27_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="kernels_2softmax_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2softmax_8h.html" target="_self">softmax.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_28_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="sort_8h_source.html"><span class="icondoc"></span></a><a class="el" href="sort_8h.html" target="_self">sort.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_29_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2ternary_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2ternary_8h.html" target="_self">ternary.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_30_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="ternary__ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="ternary__ops_8h.html" target="_self">ternary_ops.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_31_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2unary_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2unary_8h.html" target="_self">unary.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_32_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="unary__ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="unary__ops_8h.html" target="_self">unary_ops.h</a></td><td class="desc"></td></tr>
-<tr id="row_0_1_2_1_33_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2utils_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2utils_8h.html" target="_self">utils.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_1_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_1_')">&#9658;</span><span id="img_0_1_2_1_1_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_1_')">&#160;</span><a class="el" href="dir_fb5e52e7ad5a84a63db2993d12f7610c.html" target="_self">jit</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2jit_2bf16_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html" target="_self">bf16.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_2_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_2_')">&#9658;</span><span id="img_0_1_2_1_2_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_2_')">&#160;</span><a class="el" href="dir_d36f9e79442ec4bd53287b83bdefe7e5.html" target="_self">metal_3_0</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_2_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html" target="_self">bf16.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_3_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_3_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_3_')">&#9658;</span><span id="img_0_1_2_1_3_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_3_')">&#160;</span><a class="el" href="dir_83367edb60e23ad59b1a493d8c883287.html" target="_self">metal_3_1</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_3_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2metal__3__1_2bf16_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2metal__3__1_2bf16_8h.html" target="_self">bf16.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_4_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_4_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_4_')">&#9658;</span><span id="img_0_1_2_1_4_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_4_')">&#160;</span><a class="el" href="dir_f60cd69d27fd3faa641c79056fff0e2d.html" target="_self">reduction</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_4_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2reduction_2ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2reduction_2ops_8h.html" target="_self">ops.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_4_1_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__all_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__all_8h.html" target="_self">reduce_all.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_4_2_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__col_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__col_8h.html" target="_self">reduce_col.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_4_3_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__init_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__init_8h.html" target="_self">reduce_init.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_4_4_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="reduce__row_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__row_8h.html" target="_self">reduce_row.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_')">&#9658;</span><span id="img_0_1_2_1_5_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_')">&#160;</span><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html" target="_self">steel</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_0_')">&#9658;</span><span id="img_0_1_2_1_5_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_0_')">&#160;</span><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html" target="_self">attn</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_0_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_0_0_')">&#9658;</span><span id="img_0_1_2_1_5_0_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_0_0_')">&#160;</span><a class="el" href="dir_5aea41cce495e77a0857a0aecf063e33.html" target="_self">kernels</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__attention_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__attention_8h.html" target="_self">steel_attention.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="attn_8h_source.html"><span class="icondoc"></span></a><a class="el" href="attn_8h.html" target="_self">attn.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_2_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="attn_2loader_8h_source.html"><span class="icondoc"></span></a><a class="el" href="attn_2loader_8h.html" target="_self">loader.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_3_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="attn_2mma_8h_source.html"><span class="icondoc"></span></a><a class="el" href="attn_2mma_8h.html" target="_self">mma.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_4_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="attn_2params_8h_source.html"><span class="icondoc"></span></a><a class="el" href="attn_2params_8h.html" target="_self">params.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_0_5_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h.html" target="_self">transforms.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_1_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_1_')">&#9658;</span><span id="img_0_1_2_1_5_1_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_1_')">&#160;</span><a class="el" href="dir_df9494e83ef22ae6150a0e080d9709ed.html" target="_self">conv</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_1_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_1_0_')">&#9658;</span><span id="img_0_1_2_1_5_1_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_1_0_')">&#160;</span><a class="el" href="dir_6379e541ea5051a09bc0e3fdd92fcd3b.html" target="_self">kernels</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="kernels_2steel_2conv_2kernels_2steel__conv_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2steel_2conv_2kernels_2steel__conv_8h.html" target="_self">steel_conv.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__conv__general_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__conv__general_8h.html" target="_self">steel_conv_general.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_1_1_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_1_1_')">&#9658;</span><span id="img_0_1_2_1_5_1_1_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_1_1_')">&#160;</span><a class="el" href="dir_ba4426224ef60f409462a2a12fa18f06.html" target="_self">loaders</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="loader__channel__l_8h_source.html"><span class="icondoc"></span></a><a class="el" href="loader__channel__l_8h.html" target="_self">loader_channel_l.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_1_1_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="loader__channel__n_8h_source.html"><span class="icondoc"></span></a><a class="el" href="loader__channel__n_8h.html" target="_self">loader_channel_n.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="loader__general_8h_source.html"><span class="icondoc"></span></a><a class="el" href="loader__general_8h.html" target="_self">loader_general.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_2_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="conv_8h_source.html"><span class="icondoc"></span></a><a class="el" href="conv_8h.html" target="_self">conv.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_3_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="conv_2loader_8h_source.html"><span class="icondoc"></span></a><a class="el" href="conv_2loader_8h.html" target="_self">loader.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_1_4_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="conv_2params_8h_source.html"><span class="icondoc"></span></a><a class="el" href="conv_2params_8h.html" target="_self">params.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_2_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_2_')">&#9658;</span><span id="img_0_1_2_1_5_2_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_2_')">&#160;</span><a class="el" href="dir_6768c99e6145fb9510ccdb40db8ede25.html" target="_self">gemm</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_0_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_2_0_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_2_0_')">&#9658;</span><span id="img_0_1_2_1_5_2_0_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_2_0_')">&#160;</span><a class="el" href="dir_9c555e3d0f5b8c3fb3a7397c81fd5bf9.html" target="_self">kernels</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__gemm__fused_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__gemm__fused_8h.html" target="_self">steel_gemm_fused.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__gemm__masked_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__gemm__masked_8h.html" target="_self">steel_gemm_masked.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_0_2_" class="even" style="display:none;"><td class="entry"><span style="width:128px;display:inline-block;">&#160;</span><a href="steel__gemm__splitk_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel__gemm__splitk_8h.html" target="_self">steel_gemm_splitk.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_1_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="gemm_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gemm_8h.html" target="_self">gemm.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_2_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="gemm_2loader_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gemm_2loader_8h.html" target="_self">loader.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_3_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="gemm_2mma_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gemm_2mma_8h.html" target="_self">mma.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_4_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="gemm_2params_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gemm_2params_8h.html" target="_self">params.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_2_5_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html" target="_self">transforms.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_3_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><span id="arr_0_1_2_1_5_3_" class="arrow" onclick="dynsection.toggleFolder('0_1_2_1_5_3_')">&#9658;</span><span id="img_0_1_2_1_5_3_" class="iconfclosed" onclick="dynsection.toggleFolder('0_1_2_1_5_3_')">&#160;</span><a class="el" href="dir_1b634d20b746ceaa770a5379eca5f24a.html" target="_self">utils</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_3_0_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="integral__constant_8h_source.html"><span class="icondoc"></span></a><a class="el" href="integral__constant_8h.html" target="_self">integral_constant.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_3_1_" class="even" style="display:none;"><td class="entry"><span style="width:112px;display:inline-block;">&#160;</span><a href="type__traits_8h_source.html"><span class="icondoc"></span></a><a class="el" href="type__traits_8h.html" target="_self">type_traits.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_4_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="steel_2defines_8h_source.html"><span class="icondoc"></span></a><a class="el" href="steel_2defines_8h.html" target="_self">defines.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_5_5_" class="even" style="display:none;"><td class="entry"><span style="width:96px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2steel_2utils_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2steel_2utils_8h.html" target="_self">utils.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_6_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2arange_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2arange_8h.html" target="_self">arange.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_7_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="atomic_8h_source.html"><span class="icondoc"></span></a><a class="el" href="atomic_8h.html" target="_self">atomic.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_8_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="bf16__math_8h_source.html"><span class="icondoc"></span></a><a class="el" href="bf16__math_8h.html" target="_self">bf16_math.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_9_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2binary_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2binary_8h.html" target="_self">binary.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_10_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="binary__ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="binary__ops_8h.html" target="_self">binary_ops.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_11_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2binary__two_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2binary__two_8h.html" target="_self">binary_two.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_12_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2complex_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2complex_8h.html" target="_self">complex.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_13_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2copy_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2copy_8h.html" target="_self">copy.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_14_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="defines_8h_source.html"><span class="icondoc"></span></a><a class="el" href="defines_8h.html" target="_self">defines.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_15_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="erf_8h_source.html"><span class="icondoc"></span></a><a class="el" href="erf_8h.html" target="_self">erf.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_16_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="expm1f_8h_source.html"><span class="icondoc"></span></a><a class="el" href="expm1f_8h.html" target="_self">expm1f.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_17_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2fft_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2fft_8h.html" target="_self">fft.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_18_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="gather_8h_source.html"><span class="icondoc"></span></a><a class="el" href="gather_8h.html" target="_self">gather.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_19_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="kernels_2gemv__masked_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2gemv__masked_8h.html" target="_self">gemv_masked.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_20_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2hadamard_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2hadamard_8h.html" target="_self">hadamard.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_21_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="kernels_2indexing_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2indexing_8h.html" target="_self">indexing.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_22_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="quantized_8h_source.html"><span class="icondoc"></span></a><a class="el" href="quantized_8h.html" target="_self">quantized.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_23_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2reduce_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2reduce_8h.html" target="_self">reduce.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_24_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="reduce__utils_8h_source.html"><span class="icondoc"></span></a><a class="el" href="reduce__utils_8h.html" target="_self">reduce_utils.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_25_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="scan_8h_source.html"><span class="icondoc"></span></a><a class="el" href="scan_8h.html" target="_self">scan.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_26_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="scatter_8h_source.html"><span class="icondoc"></span></a><a class="el" href="scatter_8h.html" target="_self">scatter.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_27_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="sdpa__vector_8h_source.html"><span class="icondoc"></span></a><a class="el" href="sdpa__vector_8h.html" target="_self">sdpa_vector.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_28_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="kernels_2softmax_8h_source.html"><span class="icondoc"></span></a><a class="el" href="kernels_2softmax_8h.html" target="_self">softmax.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_29_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="sort_8h_source.html"><span class="icondoc"></span></a><a class="el" href="sort_8h.html" target="_self">sort.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_30_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2ternary_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2ternary_8h.html" target="_self">ternary.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_31_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="ternary__ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="ternary__ops_8h.html" target="_self">ternary_ops.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_32_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="metal_2kernels_2unary_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2kernels_2unary_8h.html" target="_self">unary.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_33_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="unary__ops_8h_source.html"><span class="icondoc"></span></a><a class="el" href="unary__ops_8h.html" target="_self">unary_ops.h</a></td><td class="desc"></td></tr>
+<tr id="row_0_1_2_1_34_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;">&#160;</span><a href="backend_2metal_2kernels_2utils_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2kernels_2utils_8h.html" target="_self">utils.h</a></td><td class="desc"></td></tr>
 <tr id="row_0_1_2_2_" class="even"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><a href="backend_2metal_2allocator_8h_source.html"><span class="icondoc"></span></a><a class="el" href="backend_2metal_2allocator_8h.html" target="_self">allocator.h</a></td><td class="desc"></td></tr>
 <tr id="row_0_1_2_3_" class="odd"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><a href="metal_2binary_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2binary_8h.html" target="_self">binary.h</a></td><td class="desc"></td></tr>
 <tr id="row_0_1_2_4_" class="even"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><a href="metal_2copy_8h_source.html"><span class="icondoc"></span></a><a class="el" href="metal_2copy_8h.html" target="_self">copy.h</a></td><td class="desc"></td></tr>
diff --git a/docs/build/html/functions_a.html b/docs/build/html/functions_a.html
index 32ebacc4d..c6c451642 100644
--- a/docs/build/html/functions_a.html
+++ b/docs/build/html/functions_a.html
@@ -91,7 +91,7 @@ $(function(){ initResizable(false); });
 <li>A_str_k&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>A_str_m&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>Abs()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a1247e72feb640fb562d036b2dd1ae4ad">mlx::core::Abs</a></li>
-<li>accum_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">mlx::steel::AccumHelper&lt; T &gt;</a></li>
+<li>accum_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">mlx::steel::AccumHelper&lt; T &gt;</a></li>
 <li>Add()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_add.html#ae3fd5483f3454eac3df256e3f5f3cdae">mlx::core::Add</a></li>
 <li>add_temporaries()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#a72ad17c96fc6ce825bc77f0bed657901">mlx::core::metal::Device</a></li>
 <li>add_temporary()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#acb90010af0cffe27fd8cc6c253d3a576">mlx::core::metal::Device</a></li>
@@ -108,12 +108,12 @@ $(function(){ initResizable(false); });
 <li>Allocator()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_allocator.html#a5803678a418fef687fc65fa9d5c37b65">mlx::core::allocator::Allocator</a></li>
 <li>allocator&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_common_allocator.html#abf84c726a37df68345589b897b2e35f0">mlx::core::allocator::CommonAllocator</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#afa1c5a725309caff163c492b5b84491e">mlx::core::metal::MetalAllocator</a></li>
 <li>AllReduce()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a2d1ea56cbf72a316680ea90aa6da1c2d">mlx::core::distributed::AllReduce</a></li>
-<li>alpha&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477">MLXFastAttentionParams</a></li>
+<li>alpha&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></li>
 <li>And&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23dab14e7d426f45ae7f029f4e00210fbae4">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924aba3b7fb927f6b6c8b198a9cdc3dd9e02">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93">mlx::core::Reduce</a></li>
-<li>apply()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">mlx::steel::TransformNone&lt; OutT, InT &gt;</a>, <a class="el" href="struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218">ScaleOp&lt; OutT, InT &gt;</a></li>
+<li>apply()&#160;:&#160;<a class="el" href="struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221">DivOp</a>, <a class="el" href="struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334">ExpSubOp</a>, <a class="el" href="struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e">MaxOp</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">mlx::steel::TransformNone&lt; OutT, InT &gt;</a>, <a class="el" href="struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756">MulOp</a>, <a class="el" href="struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218">ScaleOp&lt; OutT, InT &gt;</a>, <a class="el" href="struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143">SubOp</a>, <a class="el" href="struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d">SumOp</a>, <a class="el" href="struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16">TransformScale&lt; T &gt;</a></li>
 <li>apply_epilogue()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>apply_epilogue_safe()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>apply_inplace_op()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></li>
+<li>apply_inplace_op()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></li>
 <li>Arange()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_arange.html#a1a70c3b0b9c67d5a9446c141c5b7c574">mlx::core::Arange</a></li>
 <li>ArcCos()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a66f4ee841d17923d93241b71ea5103e9">mlx::core::ArcCos</a></li>
 <li>ArcCosh()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a34597054db467941a2a883c653ba4d71">mlx::core::ArcCosh</a></li>
@@ -135,7 +135,7 @@ $(function(){ initResizable(false); });
 <li>As_offset&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>AsStrided()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_as_strided.html#a80c0547f72ed53374eafc57d57b5d4af">mlx::core::AsStrided</a></li>
 <li>AsType()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_as_type.html#a8c3241d402a8977bb4db037e225f5b47">mlx::core::AsType</a></li>
-<li>Atile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>Atile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>atomic_update()&#160;:&#160;<a class="el" href="struct_none.html#aecbce7c97e8b1d5dc4afd2e788c24e06">None</a></li>
 <li>attach_event()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a000c3cfe13cb378bf0523b62816190da">mlx::core::array</a></li>
 <li>available&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078a308bd3e5bf976888b120dd36d0c2d2ae">mlx::core::array</a></li>
diff --git a/docs/build/html/functions_b.html b/docs/build/html/functions_b.html
index f1cfde395..6eeb537f2 100644
--- a/docs/build/html/functions_b.html
+++ b/docs/build/html/functions_b.html
@@ -87,35 +87,33 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_b" name="index_b"></a>- b -</h3><ul>
+<li>B&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f">mlx::steel::AttnParams</a></li>
 <li>b&#160;:&#160;<a class="el" href="unionbool4__or__uint.html#a47d77eac47598fe420f8f04a615f76ca">bool4_or_uint</a></li>
 <li>B_str_k&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>B_str_n&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>Base&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421">mlx::core::Log</a></li>
 <li>base_wh&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aca37adba6f148579eb1cd0a7800a5cfe">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6c46564bf1a96a02791dd432cc9c883e">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>base_ww&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32d020c6715d06f7de360877fcb7b6e4">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a230f0e581f9b8227b9ee68760b3b1503">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
-<li>batch_ndim&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f">mlx::steel::GEMMParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3">MLXFastAttentionParams</a></li>
+<li>batch_ndim&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f">mlx::steel::GEMMParams</a></li>
 <li>batch_size&#160;:&#160;<a class="el" href="struct_read_writer.html#a689f4890c1d2ce33fc6da7550beec735">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>batch_stride_a&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a76f55783a8e2ee203cf8507eee4b000c">mlx::steel::GEMMParams</a></li>
 <li>batch_stride_b&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a99b959b12d12da657648fa24d43e49e8">mlx::steel::GEMMParams</a></li>
 <li>batch_stride_c&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a77c065db228e9654a0a75a6ffe47c15a">mlx::steel::GEMMAddMMParams</a></li>
 <li>batch_stride_d&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#ad98006dd509a455864e6aa7c52743a41">mlx::steel::GEMMParams</a></li>
-<li>batch_stride_k&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b">MLXFastAttentionParams</a></li>
-<li>batch_stride_o&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7">MLXFastAttentionParams</a></li>
-<li>batch_stride_q&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1">MLXFastAttentionParams</a></li>
-<li>batch_stride_v&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21">MLXFastAttentionParams</a></li>
 <li>BCOLS&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3d6272d000f8ea79d9b3b5228bdca20f">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a29fbeeacdf5b6feeb74815ced255fa5a">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9229d22e0a02d96825eb5a57c8cb95ac">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac3b40db720055350bba59d614ea1dd79">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a86519729ef0561686bb86e474c95b93d">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a1843921cd67926002bb0dcccf3048eb6">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b6cf53a10514310d01f4d6459053a57">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 <li>BCOLS_PACKED&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>begin()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a76b258b169d7d73419ebbf85340fb914">mlx::core::array</a></li>
 <li>beta&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#ac0ce4d8a6014f8adb29fd0a0bb23139f">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></li>
-<li>bi&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>bi&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>biases&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>bits_&#160;:&#160;<a class="el" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">_MLX_BFloat16</a>, <a class="el" href="structmlx_1_1core_1_1___m_l_x___b_float16.html#aca48963f820065c3d8ecab24265ab3fc">mlx::core::_MLX_BFloat16</a>, <a class="el" href="structmlx_1_1core_1_1___m_l_x___float16.html#a5203fe52424fd32bce6eb7917dd9288b">mlx::core::_MLX_Float16</a></li>
 <li>bits_to_bfloat()&#160;:&#160;<a class="el" href="struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca">_MLX_BFloat16</a></li>
 <li>BitwiseBinary()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a0d8b3a94951621ffcdebc6fda748a172">mlx::core::BitwiseBinary</a></li>
-<li>bj&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>bj&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>block_merge_sort_t&#160;:&#160;<a class="el" href="struct_kernel_merge_sort.html#adae7850e057fc30d5328c7b3dcc998fa">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a>, <a class="el" href="struct_kernel_multi_block_merge_sort.html#af27e9af4b58640c0aa620bc4efc68dff">KernelMultiBlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></li>
 <li>block_sort()&#160;:&#160;<a class="el" href="struct_kernel_merge_sort.html#a56b644ec66f7fb5c01b280f124304be9">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a>, <a class="el" href="struct_kernel_multi_block_merge_sort.html#a322ed2eac315a561e0fd90af2fd577eb">KernelMultiBlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></li>
 <li>BlockLoader()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></li>
+<li>BlockLoaderT()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></li>
 <li>blockM&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a7281520100658811076400060663903c">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a2ae8ce535d59cccf453381b4485a77f0">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>BlockMaskedMM()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#ad26509deb5306d0c5eb72477e9a57477">mlx::core::BlockMaskedMM</a></li>
 <li>BlockMMA()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
@@ -123,13 +121,14 @@ $(function(){ initResizable(false); });
 <li>Broadcast()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_broadcast.html#accbab8433c93e281608a268d11afaefb">mlx::core::Broadcast</a></li>
 <li>BROWS&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aba1e1c8012e4e50f0e9bcfb9486c1781">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ac070c6bd5be85b1ae805e18890db4fd4">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a343984fb74ec579a4404278dbbc7e7b5">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a10591ea957605a9c662f93d59ff3410d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae9b86b05b23153ea1abaeead456c491c">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a015a0c56de74a0c4d51953a7e94fbba8">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acc8140aae84694f62e6324dbb6a614a4">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 <li>Bs_offset&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>Btile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>Btile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>buf&#160;:&#160;<a class="el" href="struct_read_writer.html#a23bac3c96dd0265ddbee1f256be45ff5">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>Buffer()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_buffer.html#ac4fc2cc6aa1368cfb74aff329d9a1300">mlx::core::allocator::Buffer</a></li>
 <li>buffer()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#ab3daf04c27c4593d9d73c397b8484a08">mlx::core::array</a>, <a class="el" href="structmlx_1_1core_1_1array_1_1_data.html#a9a51e2d12ba505027cc0fca86bdd39ad">mlx::core::array::Data</a>, <a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">mlx::core::metal::DeviceStream</a></li>
 <li>buffer_ops&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">mlx::core::metal::DeviceStream</a></li>
 <li>buffer_size()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a914577c63755b2e862d2da68bbf8e3dd">mlx::core::array</a></li>
 <li>buffers&#160;:&#160;<a class="el" href="struct_indices.html#ad705070a740579c07d109ae4f3d86e76">Indices&lt; IdxT, NIDX &gt;</a></li>
+<li>bytes_per_pack&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_c.html b/docs/build/html/functions_c.html
index 72eac8b79..1396ec6b3 100644
--- a/docs/build/html/functions_c.html
+++ b/docs/build/html/functions_c.html
@@ -88,6 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
 <li>C&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a0953063962ac3b5a027243289e72fbb2">MLXConvParams&lt; NDIM &gt;</a></li>
+<li>c&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></li>
 <li>capitalize_bool&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_print_formatter.html#adf49a949db36f0ba076842a6d675d79a">mlx::core::PrintFormatter</a></li>
 <li>Category&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2d">mlx::core::Dtype</a></li>
 <li>Ceil()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_ceil.html#aede38610ca25429f229301546bc9b682">mlx::core::Ceil</a></li>
@@ -98,6 +99,7 @@ $(function(){ initResizable(false); });
 <li>cmplx()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#a5b1ce506f1023f5254025ac81b831a2c">pocketfft::detail::cmplx&lt; T &gt;</a></li>
 <li>cndarr()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1cndarr.html#abf73f1b4ddcfb27d7f85cfa441607129">pocketfft::detail::cndarr&lt; T &gt;</a></li>
 <li>col_contiguous&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html#ae24709026598d635e6b5c24a15f8a802">mlx::core::array::Flags</a></li>
+<li>col_frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
 <li>CommandEncoder()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a2334774486f447213ee997e55c2e52a3">mlx::core::metal::CommandEncoder</a></li>
 <li>commit_command_buffer()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c">mlx::core::metal::Device</a></li>
 <li>Compiled()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_compiled.html#a2d8cefff835c419a48a077d306b8e051">mlx::core::Compiled</a></li>
@@ -108,6 +110,7 @@ $(function(){ initResizable(false); });
 <li>ConcurrentContext()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#aee044d7729739c96e845823f9ecc5174">mlx::core::metal::CommandEncoder::ConcurrentContext</a></li>
 <li>cond&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a4ffd524d6a5bedd1a303b63bdde6701c">mlx::core::scheduler::StreamThread</a></li>
 <li>Conjugate()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_conjugate.html#a627f9e6a8729fb3ffb3ca3228d007c87">mlx::core::Conjugate</a></li>
+<li>Contiguous()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0">mlx::core::Contiguous</a></li>
 <li>contiguous&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html#afd0ab11e7a486a2a8e50ee84b971ac8a">mlx::core::array::Flags</a></li>
 <li>ContiguousIterator()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html#a68794af4a442d3d8ac4647817af8e1f6">mlx::core::ContiguousIterator&lt; StrideT &gt;</a></li>
 <li>Conv2DInputBlockLoaderGeneral()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1d83af561a483432bf8dcb42e734b23b">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
@@ -126,7 +129,7 @@ $(function(){ initResizable(false); });
 <li>cost_guess()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1util.html#ad3d874bc3fb0048df2270779a15d4bd0">pocketfft::detail::util</a></li>
 <li>count_down()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html#a81d6597189b40410e35f3cd653fd1342">pocketfft::detail::threading::latch</a></li>
 <li>cpu&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_device.html#a69ee81924251dec96f1945c9d91506fd">mlx::core::Device</a></li>
-<li>Ctile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>Ctile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>Custom()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a4186fea23f7156c38960426821fca313">mlx::core::fast::Custom</a></li>
 <li>CustomKernel()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153">mlx::core::fast::CustomKernel</a></li>
 <li>CustomTransforms()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488">mlx::core::CustomTransforms</a></li>
diff --git a/docs/build/html/functions_d.html b/docs/build/html/functions_d.html
index cbd830086..674118ff1 100644
--- a/docs/build/html/functions_d.html
+++ b/docs/build/html/functions_d.html
@@ -87,6 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_d" name="index_d"></a>- d -</h3><ul>
+<li>D&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3">mlx::steel::AttnParams</a></li>
 <li>d&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_data.html#a25f52ac67912a49bb6e2b6715aa65311">mlx::core::array::Data</a>, <a class="el" href="classpocketfft_1_1detail_1_1cndarr.html#ac29c769aebb03f81fbcf16ba6e766af2">pocketfft::detail::cndarr&lt; T &gt;</a></li>
 <li>Data()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_data.html#a77e2ea35fac1d54e4062468a432e1482">mlx::core::array::Data</a></li>
 <li>data()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a72e3ce6c03fefe272cadf214bd127b95">mlx::core::array</a>, <a class="el" href="classpocketfft_1_1detail_1_1arr.html#aec0f2191b4663b4187aab92454c34de8">pocketfft::detail::arr&lt; T &gt;</a></li>
@@ -107,13 +108,14 @@ $(function(){ initResizable(false); });
 <li>difference_type&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html#adcee44c77980fc2370a2c31e203aead5">mlx::core::array::ArrayIterator</a></li>
 <li>digits&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 <li>digits10&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
-<li>dispatchThreadgroups()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e">mlx::core::metal::CommandEncoder</a></li>
-<li>dispatchThreads()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810">mlx::core::metal::CommandEncoder</a></li>
+<li>dim&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
+<li>dispatch_threadgroups()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d">mlx::core::metal::CommandEncoder</a></li>
+<li>dispatch_threads()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05">mlx::core::metal::CommandEncoder</a></li>
 <li>DistPrimitive()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_dist_primitive.html#a8c54166951522c2a52ef39fce8c87f8f">mlx::core::distributed::DistPrimitive</a></li>
 <li>Divide()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">mlx::core::Divide</a></li>
 <li>DivMod()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">mlx::core::DivMod</a></li>
 <li>do_read&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a13eb86acf6abe288c19645935a47d2ad">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a640155880483e1042ec5f647b9adaac6">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
-<li>dst&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>dst&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>dst_ld&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a07c85eab8cbf7b02c60df29cf32031ef">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a91192d512e7a18c2d16a139065000959">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ae71570942c7b0ad8e67c62662b336c4a">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a9e59da7e4436e61b2d3c3f982355910b">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a0ff5a6d503e0bbac4634030a75ab818d">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aae121ca6016fc6c7255027b3641f3a09">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ac18eeebea26cc6da434ead6eb4397350">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 <li>Dtype()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_dtype.html#aec17f0a4a51729e5ac40b62f0aa765d1">mlx::core::Dtype</a></li>
 <li>dtype()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#ae29e7d6fbfbea1e5e321a8d1ea3cfacd">mlx::core::array</a></li>
diff --git a/docs/build/html/functions_e.html b/docs/build/html/functions_e.html
index 60e04a1b0..0fd6e64bd 100644
--- a/docs/build/html/functions_e.html
+++ b/docs/build/html/functions_e.html
@@ -90,7 +90,7 @@ $(function(){ initResizable(false); });
 <li>e&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421a491d45f7af463017c1f8cae94cd05590">mlx::core::Log</a></li>
 <li>Eigh()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_eigh.html#ad8f5d012ebd5942abeffecca77fcddda">mlx::core::Eigh</a></li>
 <li>elem&#160;:&#160;<a class="el" href="struct_read_writer.html#a32b82adcd6ed324ce235e9f5ad780ede">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>elem_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>elem_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>elems()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>elems_per_thread&#160;:&#160;<a class="el" href="struct_read_writer.html#a444230a0182ce6ba1898c04ce6e669a7">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>empty()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#a1269e5da40c3f5145c895cee3641879a">pocketfft::detail::threading::concurrent_queue&lt; T &gt;</a></li>
@@ -104,8 +104,8 @@ $(function(){ initResizable(false); });
 <li>Erf()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">mlx::core::Erf</a></li>
 <li>ErfInv()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">mlx::core::ErfInv</a></li>
 <li>eval()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a2820c45188071a22175e9fa42e10a49a">mlx::core::array</a></li>
-<li>eval_cpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a">mlx::core::distributed::Recv</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439">mlx::core::fast::RMSNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e">mlx::core::fast::RoPE</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">mlx::core::Log</a>, <a class="el" 
href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4">mlx::core::Split</a>, <a class="el" 
href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">mlx::core::View</a></li>
-<li>eval_gpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e">mlx::core::distributed::Recv</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560">mlx::core::fast::RMSNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2">mlx::core::fast::RoPE</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">mlx::core::Log</a>, <a class="el" 
href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df">mlx::core::Split</a>, <a class="el" 
href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">mlx::core::View</a></li>
+<li>eval_cpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e">mlx::core::distributed::AllReduce</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a">mlx::core::distributed::Recv</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439">mlx::core::fast::RMSNormVJP</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">mlx::core::Log1p</a>, <a class="el" 
href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2">mlx::core::QRF</a>, <a class="el" 
href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">mlx::core::Softmax</a>, <a class="el" 
href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">mlx::core::View</a></li>
+<li>eval_gpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20">mlx::core::distributed::AllReduce</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e">mlx::core::distributed::Recv</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560">mlx::core::fast::RMSNormVJP</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">mlx::core::Log1p</a>, <a class="el" 
href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9">mlx::core::QRF</a>, <a class="el" 
href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">mlx::core::Softmax</a>, <a class="el" 
href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">mlx::core::View</a></li>
 <li>evaluated&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078a6fc3d7595445dd877584495f47535268">mlx::core::array</a></li>
 <li>Event()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#a833506419b2110ad1abd89b2dd238b4d">mlx::core::Event</a></li>
 <li>event()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a0a8e4d6e67e739a712876bb36f88f9bf">mlx::core::array</a></li>
diff --git a/docs/build/html/functions_f.html b/docs/build/html/functions_f.html
index fc1082a6a..87ef80e77 100644
--- a/docs/build/html/functions_f.html
+++ b/docs/build/html/functions_f.html
@@ -106,7 +106,7 @@ $(function(){ initResizable(false); });
 <li>Floor()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340">mlx::core::Floor</a></li>
 <li>forward&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1_exec_c2_c.html#a63e27292b327597674deede9debe1c43">pocketfft::detail::ExecC2C</a>, <a class="el" href="structpocketfft_1_1detail_1_1_exec_r2_r.html#a5ec66ebb2ccd079f62b068ddd1fc7bdf">pocketfft::detail::ExecR2R</a></li>
 <li>frag_at()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
-<li>frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>free()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_allocator.html#ae963d551be646ae0e13df2c16f2beefb">mlx::core::allocator::Allocator</a>, <a class="el" href="classmlx_1_1core_1_1allocator_1_1_common_allocator.html#a84b50d1a3cbffa12c1a6cf0ed8c71079">mlx::core::allocator::CommonAllocator</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a109a0a37fb0b3be381a62dc3b1a54bf0">mlx::core::metal::MetalAllocator</a></li>
 <li>Full()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6">mlx::core::Full</a></li>
 </ul>
diff --git a/docs/build/html/functions_func_a.html b/docs/build/html/functions_func_a.html
index 9071cd8eb..dadf4640c 100644
--- a/docs/build/html/functions_func_a.html
+++ b/docs/build/html/functions_func_a.html
@@ -99,10 +99,10 @@ $(function(){ initResizable(false); });
 <li>allocate()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a4f785747b898980756af9e5515363826">pocketfft::detail::threading::aligned_allocator&lt; T &gt;</a></li>
 <li>Allocator()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_allocator.html#a5803678a418fef687fc65fa9d5c37b65">mlx::core::allocator::Allocator</a></li>
 <li>AllReduce()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a2d1ea56cbf72a316680ea90aa6da1c2d">mlx::core::distributed::AllReduce</a></li>
-<li>apply()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">mlx::steel::TransformNone&lt; OutT, InT &gt;</a>, <a class="el" href="struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218">ScaleOp&lt; OutT, InT &gt;</a></li>
+<li>apply()&#160;:&#160;<a class="el" href="struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221">DivOp</a>, <a class="el" href="struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334">ExpSubOp</a>, <a class="el" href="struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e">MaxOp</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">mlx::steel::TransformNone&lt; OutT, InT &gt;</a>, <a class="el" href="struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756">MulOp</a>, <a class="el" href="struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218">ScaleOp&lt; OutT, InT &gt;</a>, <a class="el" href="struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143">SubOp</a>, <a class="el" href="struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d">SumOp</a>, <a class="el" href="struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16">TransformScale&lt; T &gt;</a></li>
 <li>apply_epilogue()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>apply_epilogue_safe()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>apply_inplace_op()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></li>
+<li>apply_inplace_op()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></li>
 <li>Arange()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_arange.html#a1a70c3b0b9c67d5a9446c141c5b7c574">mlx::core::Arange</a></li>
 <li>ArcCos()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a66f4ee841d17923d93241b71ea5103e9">mlx::core::ArcCos</a></li>
 <li>ArcCosh()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a34597054db467941a2a883c653ba4d71">mlx::core::ArcCosh</a></li>
diff --git a/docs/build/html/functions_func_b.html b/docs/build/html/functions_func_b.html
index 8a271c35d..13e97946a 100644
--- a/docs/build/html/functions_func_b.html
+++ b/docs/build/html/functions_func_b.html
@@ -92,6 +92,7 @@ $(function(){ initResizable(false); });
 <li>BitwiseBinary()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a0d8b3a94951621ffcdebc6fda748a172">mlx::core::BitwiseBinary</a></li>
 <li>block_sort()&#160;:&#160;<a class="el" href="struct_kernel_merge_sort.html#a56b644ec66f7fb5c01b280f124304be9">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a>, <a class="el" href="struct_kernel_multi_block_merge_sort.html#a322ed2eac315a561e0fd90af2fd577eb">KernelMultiBlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></li>
 <li>BlockLoader()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></li>
+<li>BlockLoaderT()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></li>
 <li>BlockMaskedMM()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#ad26509deb5306d0c5eb72477e9a57477">mlx::core::BlockMaskedMM</a></li>
 <li>BlockMMA()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>Broadcast()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_broadcast.html#accbab8433c93e281608a268d11afaefb">mlx::core::Broadcast</a></li>
diff --git a/docs/build/html/functions_func_c.html b/docs/build/html/functions_func_c.html
index 948b17f3b..ec7c04409 100644
--- a/docs/build/html/functions_func_c.html
+++ b/docs/build/html/functions_func_c.html
@@ -103,6 +103,7 @@ $(function(){ initResizable(false); });
 <li>Concatenate()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_concatenate.html#acff07853de2d31faeec7c4ca40ce0888">mlx::core::Concatenate</a></li>
 <li>ConcurrentContext()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#aee044d7729739c96e845823f9ecc5174">mlx::core::metal::CommandEncoder::ConcurrentContext</a></li>
 <li>Conjugate()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_conjugate.html#a627f9e6a8729fb3ffb3ca3228d007c87">mlx::core::Conjugate</a></li>
+<li>Contiguous()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0">mlx::core::Contiguous</a></li>
 <li>ContiguousIterator()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html#a68794af4a442d3d8ac4647817af8e1f6">mlx::core::ContiguousIterator&lt; StrideT &gt;</a></li>
 <li>Conv2DInputBlockLoaderGeneral()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1d83af561a483432bf8dcb42e734b23b">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>Conv2DInputBlockLoaderLargeFilter()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8755116a535539744e4947bc69f9c50f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
diff --git a/docs/build/html/functions_func_d.html b/docs/build/html/functions_func_d.html
index 37797b723..45c3acbf9 100644
--- a/docs/build/html/functions_func_d.html
+++ b/docs/build/html/functions_func_d.html
@@ -102,8 +102,8 @@ $(function(){ initResizable(false); });
 <li>Device()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_device.html#a481ccfb94d689994396bd353e966b489">mlx::core::Device</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#ae0db74570eb4b19d8cf19774db91bfd6">mlx::core::metal::Device</a></li>
 <li>device()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_primitive.html#a8ae61e3289c4134232a69295268f8261">mlx::core::Primitive</a></li>
 <li>DeviceStream()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7">mlx::core::metal::DeviceStream</a></li>
-<li>dispatchThreadgroups()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e">mlx::core::metal::CommandEncoder</a></li>
-<li>dispatchThreads()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810">mlx::core::metal::CommandEncoder</a></li>
+<li>dispatch_threadgroups()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d">mlx::core::metal::CommandEncoder</a></li>
+<li>dispatch_threads()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05">mlx::core::metal::CommandEncoder</a></li>
 <li>DistPrimitive()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_dist_primitive.html#a8c54166951522c2a52ef39fce8c87f8f">mlx::core::distributed::DistPrimitive</a></li>
 <li>Divide()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">mlx::core::Divide</a></li>
 <li>DivMod()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">mlx::core::DivMod</a></li>
diff --git a/docs/build/html/functions_func_e.html b/docs/build/html/functions_func_e.html
index 2b0aff299..64ed74664 100644
--- a/docs/build/html/functions_func_e.html
+++ b/docs/build/html/functions_func_e.html
@@ -99,8 +99,8 @@ $(function(){ initResizable(false); });
 <li>Erf()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">mlx::core::Erf</a></li>
 <li>ErfInv()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">mlx::core::ErfInv</a></li>
 <li>eval()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a2820c45188071a22175e9fa42e10a49a">mlx::core::array</a></li>
-<li>eval_cpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a">mlx::core::distributed::Recv</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439">mlx::core::fast::RMSNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e">mlx::core::fast::RoPE</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">mlx::core::Log</a>, <a class="el" 
href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4">mlx::core::Split</a>, <a class="el" 
href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">mlx::core::View</a></li>
-<li>eval_gpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e">mlx::core::distributed::Recv</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560">mlx::core::fast::RMSNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2">mlx::core::fast::RoPE</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">mlx::core::Log</a>, <a class="el" 
href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df">mlx::core::Split</a>, <a class="el" 
href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">mlx::core::View</a></li>
+<li>eval_cpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e">mlx::core::distributed::AllReduce</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a">mlx::core::distributed::Recv</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439">mlx::core::fast::RMSNormVJP</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">mlx::core::Log1p</a>, <a class="el" 
href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2">mlx::core::QRF</a>, <a class="el" 
href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">mlx::core::Softmax</a>, <a class="el" 
href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">mlx::core::View</a></li>
+<li>eval_gpu()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20">mlx::core::distributed::AllReduce</a>, <a class="el" 
href="classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e">mlx::core::distributed::Recv</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628">mlx::core::fast::AffineQuantize</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db">mlx::core::fast::CustomKernel</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3">mlx::core::fast::LayerNormVJP</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560">mlx::core::fast::RMSNormVJP</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">mlx::core::Log1p</a>, <a class="el" 
href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9">mlx::core::QRF</a>, <a class="el" 
href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">mlx::core::Softmax</a>, <a class="el" 
href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb">mlx::core::UnaryPrimitive</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">mlx::core::View</a></li>
 <li>Event()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#a833506419b2110ad1abd89b2dd238b4d">mlx::core::Event</a></li>
 <li>event()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a0a8e4d6e67e739a712876bb36f88f9bf">mlx::core::array</a></li>
 <li>exec()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1cfftp.html#a95211024bf007d27e700835db556fbd2">pocketfft::detail::cfftp&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1fftblue.html#a5fb03413a3d1a653842875adcf87ae8c">pocketfft::detail::fftblue&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1pocketfft__c.html#a436afd63e8e130f97aff103ae964a45d">pocketfft::detail::pocketfft_c&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1pocketfft__r.html#a2815bc8aa04fa986834b02e502f98b33">pocketfft::detail::pocketfft_r&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1rfftp.html#a073972f42bdd3617693be7be2cb5e0ac">pocketfft::detail::rfftp&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dcst23.html#a2a45b7b4612904c2be69c01f6d5029ac">pocketfft::detail::T_dcst23&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dcst4.html#af794ebf21009d5f918681188081df708">pocketfft::detail::T_dcst4&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dct1.html#a7736111ff9d220f983e41a6fecd5f058">pocketfft::detail::T_dct1&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dst1.html#a598a9511004263eb3610053d7efc9e26">pocketfft::detail::T_dst1&lt; T0 &gt;</a></li>
diff --git a/docs/build/html/functions_func_i.html b/docs/build/html/functions_func_i.html
index c8b24b4e1..5b51e0c92 100644
--- a/docs/build/html/functions_func_i.html
+++ b/docs/build/html/functions_func_i.html
@@ -99,7 +99,7 @@ $(function(){ initResizable(false); });
 <li>iofs()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1multi__iter.html#ad33360d4a8ab8e6d72efadc6f9cb5bfa">pocketfft::detail::multi_iter&lt; N &gt;</a></li>
 <li>is_available()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#aebed1f37c19197be76105161102a8a40">mlx::core::array</a></li>
 <li>is_donatable()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a4677a404b5d191af20b52649225de087">mlx::core::array</a></li>
-<li>is_equivalent()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e3a8fb38af">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">mlx::core::Exp</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99">mlx::core::LogicalNot</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">mlx::core::Remainder</a>, <a class="el" 
href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b">mlx::core::Subtract</a>, <a 
class="el" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64">mlx::core::View</a></li>
+<li>is_equivalent()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e3a8fb38af">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">mlx::core::ErfInv</a>, <a class="el" 
href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be">mlx::core::LogicalAnd</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">mlx::core::Reduce</a>, <a class="el" 
href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3">mlx::core::StopGradient</a>, <a 
class="el" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64">mlx::core::View</a></li>
 <li>is_open()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1io_1_1_file_writer.html#ad5d2ee671a81700cb1658c41309d6676">mlx::core::io::FileWriter</a>, <a class="el" href="classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a653009adbcbce8248bc666df502fdbde">mlx::core::io::ParallelFileReader</a>, <a class="el" href="classmlx_1_1core_1_1io_1_1_reader.html#a780f504058bd9c80cb3d105046a9f985">mlx::core::io::Reader</a>, <a class="el" href="classmlx_1_1core_1_1io_1_1_writer.html#a85aa36bdb0dbfb8c5b6cfd955b03417a">mlx::core::io::Writer</a></li>
 <li>is_ready()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html#ab41ecc5adb6187aa2682ca190fd920f3">pocketfft::detail::threading::latch</a></li>
 <li>is_signaled()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#a05a9a3de88185b4a89e154242b4e770a">mlx::core::Event</a></li>
diff --git a/docs/build/html/functions_func_j.html b/docs/build/html/functions_func_j.html
index 8bf8c5264..719ab3d07 100644
--- a/docs/build/html/functions_func_j.html
+++ b/docs/build/html/functions_func_j.html
@@ -87,7 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the classes they belong to:</div>
 
 <h3><a id="index_j" name="index_j"></a>- j -</h3><ul>
-<li>jvp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205">mlx::core::Compiled</a>, <a class="el" 
href="classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af">mlx::core::Floor</a>, <a class="el" 
href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038">mlx::core::Minimum</a>, <a class="el" 
href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b">mlx::core::Sign</a>, <a class="el" 
href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1">mlx::core::Transpose</a></li>
+<li>jvp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205">mlx::core::Compiled</a>, <a class="el" 
href="classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6">mlx::core::FFT</a>, <a class="el" 
href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39">mlx::core::Maximum</a>, <a class="el" 
href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db">mlx::core::Sigmoid</a>, <a class="el" 
href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1">mlx::core::Transpose</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_func_l.html b/docs/build/html/functions_func_l.html
index b787da34e..be7c06ba6 100644
--- a/docs/build/html/functions_func_l.html
+++ b/docs/build/html/functions_func_l.html
@@ -101,16 +101,17 @@ $(function(){ initResizable(false); });
 <li>Load()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a">mlx::core::Load</a></li>
 <li>load()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a>, <a class="el" href="struct_read_writer.html#a120eaf4b5f32e80972a18d14e82a2d75">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>load_padded()&#160;:&#160;<a class="el" href="struct_read_writer.html#add5bd3f647793a5a19d63197a19df73c">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>load_safe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>load_safe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>load_strided()&#160;:&#160;<a class="el" href="struct_read_writer.html#a998ef484bade81f726b9edfc6b878197">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>load_unsafe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" 
href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
-<li>location()&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2">looped_elem_to_loc&lt; 0, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90">looped_elem_to_loc&lt; 1, offset_t &gt;</a></li>
+<li>load_unsafe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>location()&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
 <li>Log()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9">mlx::core::Log</a></li>
 <li>Log1p()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a">mlx::core::Log1p</a></li>
 <li>LogAddExp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a">mlx::core::LogAddExp</a></li>
 <li>LogicalAnd()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3">mlx::core::LogicalAnd</a></li>
 <li>LogicalNot()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7">mlx::core::LogicalNot</a></li>
 <li>LogicalOr()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918">mlx::core::LogicalOr</a></li>
+<li>LoopedElemToLoc()&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
 <li>lowest()&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_func_n.html b/docs/build/html/functions_func_n.html
index df574bf22..70032b09e 100644
--- a/docs/build/html/functions_func_n.html
+++ b/docs/build/html/functions_func_n.html
@@ -94,7 +94,7 @@ $(function(){ initResizable(false); });
 <li>Negative()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70">mlx::core::Negative</a></li>
 <li>new_queue()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67">mlx::core::metal::Device</a></li>
 <li>new_stream()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a157c8da85fa1bddb8eacf8515a3cc879">mlx::core::scheduler::Scheduler</a></li>
-<li>next()&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0">looped_elem_to_loc&lt; 0, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4">looped_elem_to_loc&lt; 1, offset_t &gt;</a>, <a class="el" href="classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18">mlx::core::random::KeySequence</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>next()&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a>, <a class="el" href="classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18">mlx::core::random::KeySequence</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>NotEqual()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9">mlx::core::NotEqual</a></li>
 <li>notify_new_task()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ae8aa34a9be8bc73508dd500000421173">mlx::core::scheduler::Scheduler</a></li>
 <li>notify_task_completion()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#abbb2b1c2f8bae2b9c7cc51db65f18a3b">mlx::core::scheduler::Scheduler</a></li>
diff --git a/docs/build/html/functions_func_o.html b/docs/build/html/functions_func_o.html
index e605ad367..0df9ec530 100644
--- a/docs/build/html/functions_func_o.html
+++ b/docs/build/html/functions_func_o.html
@@ -103,11 +103,10 @@ $(function(){ initResizable(false); });
 <li>operator+=()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#ad4e69dcd89bdb7764c9c5807168f911e">pocketfft::detail::cmplx&lt; T &gt;</a></li>
 <li>operator-()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#a460da5db36d1c72fb1ed3496fd3abde4">pocketfft::detail::cmplx&lt; T &gt;</a></li>
 <li>operator-=()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#a12441ff423274bd1b54245933d69ad7e">pocketfft::detail::cmplx&lt; T &gt;</a></li>
-<li>operator-&gt;()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966">mlx::core::metal::CommandEncoder</a></li>
 <li>operator=()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1___m_l_x___b_float16.html#a0f65b0523b8ddd989f338da6cb2860e3">mlx::core::_MLX_BFloat16</a>, <a class="el" href="structmlx_1_1core_1_1___m_l_x___float16.html#a608a099bf7116ee608dcfd31ea3ade2c">mlx::core::_MLX_Float16</a>, <a class="el" href="classmlx_1_1core_1_1allocator_1_1_allocator.html#a027b84cddc8d476f736ac1f1a9991fe4">mlx::core::allocator::Allocator</a>, <a class="el" href="structmlx_1_1core_1_1array_1_1_data.html#a68e9417954fe811b5e41e6317a526748">mlx::core::array::Data</a>, <a class="el" href="classmlx_1_1core_1_1array.html#a8acf2b4c75f9b7f79da6675dbc36cf36">mlx::core::array</a>, <a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e">mlx::core::metal::CommandEncoder</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73">mlx::core::metal::Device</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_residency_set.html#aef97dbbc755940789f99a26164591c45">mlx::core::metal::ResidencySet</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a6b1be7ea92f3a7bb19875c70259dad6b">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ab170dbd2ce34c51e2eeebf5d08e7e2db">mlx::core::scheduler::Scheduler</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a0a859309a4f192f2679e07f2e4ff4d22">mlx::core::UnaryPrimitive</a></li>
 <li>operator[]()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1arr.html#aea0bd899b19e03f54dfd6c188727061a">pocketfft::detail::arr&lt; T &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1cndarr.html#ae4852d1fe936a5d61832b507816c7054">pocketfft::detail::cndarr&lt; T &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1ndarr.html#a2b2c4e205e8b5c32c9fe55dfd7b8c8d8">pocketfft::detail::ndarr&lt; T &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1sincos__2pibyn.html#a71b02f67c47b24adb296eafd2c7a3598">pocketfft::detail::sincos_2pibyn&lt; T &gt;</a></li>
 <li>out_of_bounds()&#160;:&#160;<a class="el" href="struct_read_writer.html#a08e10626fbc789b6dff9172fd6c36f7c">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>output_shapes()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f">mlx::core::Conjugate</a>, <a class="el" 
href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f">mlx::core::LessEqual</a>, <a class="el" 
href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65">mlx::core::Reduce</a>, <a class="el" 
href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325">mlx::core::Tanh</a></li>
+<li>output_shapes()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f">mlx::core::Conjugate</a>, <a class="el" 
href="classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5">mlx::core::Real</a>, <a class="el" 
href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325">mlx::core::Tanh</a></li>
 <li>outputs()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a2c186fd527f984f0589d4183b4976289">mlx::core::array</a>, <a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">mlx::core::metal::CommandEncoder</a></li>
 <li>overwrite_descriptor()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a95e6b156c8e05439f076b85c05079387">mlx::core::array</a></li>
 </ul>
diff --git a/docs/build/html/functions_func_p.html b/docs/build/html/functions_func_p.html
index fda8c19de..e3626ae4b 100644
--- a/docs/build/html/functions_func_p.html
+++ b/docs/build/html/functions_func_p.html
@@ -99,7 +99,7 @@ $(function(){ initResizable(false); });
 <li>primitive()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a790548666511d8c6d9f92ee79d2ce14c">mlx::core::array</a></li>
 <li>primitive_id()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#af5ad83605d4eea81561246873bee1d7c">mlx::core::array</a></li>
 <li>primitive_ptr()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a5119cd616ec3c05d65878944b8889469">mlx::core::array</a></li>
-<li>print()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84">mlx::core::Eigh</a>, <a class="el" 
href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950">mlx::core::LessEqual</a>, <a 
class="el" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60">mlx::core::Power</a>, <a class="el" 
href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">mlx::core::Primitive</a>, <a class="el" href="structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b">mlx::core::PrintFormatter</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77">mlx::core::Sinh</a>, <a class="el" 
href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c">mlx::core::View</a></li>
+<li>print()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">mlx::core::DivMod</a>, <a class="el" 
href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0">mlx::core::Partition</a>, <a class="el" 
href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">mlx::core::Primitive</a>, <a class="el" href="structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b">mlx::core::PrintFormatter</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4">mlx::core::Sin</a>, <a class="el" 
href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c">mlx::core::View</a></li>
 <li>prod()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1util.html#a33acae07b20b28fe4658bc338fce1b89">pocketfft::detail::util</a></li>
 <li>ptr()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_buffer.html#a990643feac06961c5599aac098c17b94">mlx::core::allocator::Buffer</a></li>
 <li>push()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#ae8ebd6bc1b4d18e75bd046005e7dde7b">pocketfft::detail::threading::concurrent_queue&lt; T &gt;</a></li>
diff --git a/docs/build/html/functions_func_q.html b/docs/build/html/functions_func_q.html
index f09c31456..a5868be0f 100644
--- a/docs/build/html/functions_func_q.html
+++ b/docs/build/html/functions_func_q.html
@@ -88,7 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_q" name="index_q"></a>- q -</h3><ul>
 <li>QRF()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983">mlx::core::QRF</a></li>
-<li>QuantizedBlockLoader()&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>QuantizedBlockLoader()&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>QuantizedMatmul()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c">mlx::core::QuantizedMatmul</a></li>
 <li>quiet_NaN()&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 </ul>
diff --git a/docs/build/html/functions_func_r.html b/docs/build/html/functions_func_r.html
index 56f88041b..d0276be4d 100644
--- a/docs/build/html/functions_func_r.html
+++ b/docs/build/html/functions_func_r.html
@@ -117,6 +117,8 @@ $(function(){ initResizable(false); });
 <li>RoPE()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a031fa27aeba94cfa5ccc633137a12163">mlx::core::fast::RoPE</a></li>
 <li>Round()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde">mlx::core::Round</a></li>
 <li>round_error()&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
+<li>row_bin_op()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>row_reduce()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>run()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#ac4a7b5011a0ea938ab1949bb1767fc1a">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a5d68656832de892f33db939005713927">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_func_s.html b/docs/build/html/functions_func_s.html
index e99403c15..fffb9cd7a 100644
--- a/docs/build/html/functions_func_s.html
+++ b/docs/build/html/functions_func_s.html
@@ -97,7 +97,9 @@ $(function(){ initResizable(false); });
 <li>Select()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">mlx::core::Select</a></li>
 <li>Send()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a2481dd876b14d4a13ac466cbca9c4eac">mlx::core::distributed::Send</a></li>
 <li>Set()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#a647fece372b64b13c4a7e5877d09a807">pocketfft::detail::cmplx&lt; T &gt;</a></li>
+<li>set_bytes()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5">mlx::core::metal::CommandEncoder</a></li>
 <li>set_cache_limit()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#af392bced29d9e4e3f1a7cc4725d83764">mlx::core::metal::MetalAllocator</a></li>
+<li>set_compute_pipeline_state()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef">mlx::core::metal::CommandEncoder</a></li>
 <li>set_data()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a631acd8e318189640b8338f9ae1a554d">mlx::core::array</a></li>
 <li>set_default_stream()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a6d15314ac9cf25efc9bd1278de9a66bb">mlx::core::scheduler::Scheduler</a></li>
 <li>set_input_array()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">mlx::core::metal::CommandEncoder</a></li>
@@ -108,8 +110,10 @@ $(function(){ initResizable(false); });
 <li>set_status()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a63598018999b49f1340b183cb303f05c">mlx::core::array</a></li>
 <li>set_tracer()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#af26e6be1a9e6239471a4c24310c0c7c8">mlx::core::array</a></li>
 <li>set_value()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#a0d077b11f4b28f882b42440b7ac6d40d">mlx::core::Event</a></li>
+<li>set_vector_bytes()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b">mlx::core::metal::CommandEncoder</a></li>
 <li>set_wired_limit()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a84fa0347da18055bc13ba0a5c4b57253">mlx::core::metal::MetalAllocator</a></li>
 <li>shape()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a4a2a2c8a4a5beafd723fc13f2055d55d">mlx::core::array</a>, <a class="el" href="classpocketfft_1_1detail_1_1arr__info.html#accada8146cb8d3ab7facb4c1e3413ec0">pocketfft::detail::arr_info</a></li>
+<li>Shape2D()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></li>
 <li>shutdown()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a0c7c29290fde806031c497f24c4ad411">pocketfft::detail::threading::thread_pool</a></li>
 <li>siblings()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#acf80fde8f743f65ad5b4be69fcb7a74d">mlx::core::array</a></li>
 <li>Sigmoid()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b">mlx::core::Sigmoid</a></li>
diff --git a/docs/build/html/functions_func_t.html b/docs/build/html/functions_func_t.html
index 6e8b1ebbe..7eece817f 100644
--- a/docs/build/html/functions_func_t.html
+++ b/docs/build/html/functions_func_t.html
@@ -100,6 +100,7 @@ $(function(){ initResizable(false); });
 <li>ThreadPool()&#160;:&#160;<a class="el" href="class_thread_pool.html#ac291710e33dbbed96ee20711080d506d">ThreadPool</a></li>
 <li>TransformAdd()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></li>
 <li>TransformAxpby()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></li>
+<li>TransformScale()&#160;:&#160;<a class="el" href="struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70">TransformScale&lt; T &gt;</a></li>
 <li>Transpose()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a">mlx::core::Transpose</a></li>
 <li>try_pop()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#aa3807d46a126d229f9054c779105ea43">pocketfft::detail::threading::concurrent_queue&lt; T &gt;</a></li>
 </ul>
diff --git a/docs/build/html/functions_func_u.html b/docs/build/html/functions_func_u.html
index 09d0138cd..d8106c7ec 100644
--- a/docs/build/html/functions_func_u.html
+++ b/docs/build/html/functions_func_u.html
@@ -89,6 +89,7 @@ $(function(){ initResizable(false); });
 <h3><a id="index_u" name="index_u"></a>- u -</h3><ul>
 <li>UnaryPrimitive()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a189f6d4ed369f82a4b724a29eb056d4e">mlx::core::UnaryPrimitive</a></li>
 <li>Uniform()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1">mlx::core::Uniform</a></li>
+<li>update_fence()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2">mlx::core::metal::CommandEncoder</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_func_v.html b/docs/build/html/functions_func_v.html
index 33d877a08..c51ad5fbf 100644
--- a/docs/build/html/functions_func_v.html
+++ b/docs/build/html/functions_func_v.html
@@ -91,8 +91,8 @@ $(function(){ initResizable(false); });
 <li>valid()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#aa77afd9669e2ef9d5e9ae1c2c6fd24fa">mlx::core::Event</a></li>
 <li>value()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#ab71c7baee3d1d02ad6a2001bbf90b970">mlx::core::Event</a></li>
 <li>View()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e">mlx::core::View</a></li>
-<li>vjp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18">mlx::core::Broadcast</a>, <a class="el" 
href="classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8">mlx::core::Exp</a>, <a class="el" 
href="classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028">mlx::core::LessEqual</a>, <a class="el" 
href="classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3">mlx::core::Sqrt</a>, <a class="el" 
href="classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80">mlx::core::Transpose</a></li>
-<li>vmap()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4">mlx::core::Ceil</a>, <a class="el" 
href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">mlx::core::ErfInv</a>, <a 
class="el" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49">mlx::core::Log</a>, <a class="el" 
href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415">mlx::core::RandomBits</a>, <a class="el" 
href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e">mlx::core::Sqrt</a>, <a class="el" 
href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121">mlx::core::View</a></li>
+<li>vjp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18">mlx::core::Broadcast</a>, <a class="el" 
href="classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#abf488f02057fd5852f38b2e8a600ad2a">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189">mlx::core::ErfInv</a>, <a class="el" 
href="classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42">mlx::core::Primitive</a>, <a class="el" 
href="classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674">mlx::core::Split</a>, <a 
class="el" href="classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80">mlx::core::Transpose</a></li>
+<li>vmap()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4">mlx::core::Ceil</a>, <a class="el" 
href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">mlx::core::Erf</a>, <a 
class="el" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71">mlx::core::Log1p</a>, <a class="el" 
href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6">mlx::core::Split</a>, <a class="el" 
href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121">mlx::core::View</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_func_w.html b/docs/build/html/functions_func_w.html
index f7fb19db3..f5ad13fdc 100644
--- a/docs/build/html/functions_func_w.html
+++ b/docs/build/html/functions_func_w.html
@@ -88,6 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_w" name="index_w"></a>- w -</h3><ul>
 <li>wait()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a648592006f1c92287734ba2428eaa45e">mlx::core::array</a>, <a class="el" href="classmlx_1_1core_1_1_event.html#a634afd918e6ed847f354531ba9f48252">mlx::core::Event</a>, <a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html#af503189cc9247047fbdfc3ebf1daacc1">pocketfft::detail::threading::latch</a></li>
+<li>wait_for_fence()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088">mlx::core::metal::CommandEncoder</a></li>
 <li>wait_for_one()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a01c574bb388f10d67aaaaa541894d807">mlx::core::scheduler::Scheduler</a></li>
 <li>write()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1io_1_1_file_writer.html#abca32838c9886f734d93430c34c07d7f">mlx::core::io::FileWriter</a>, <a class="el" href="classmlx_1_1core_1_1io_1_1_writer.html#ad9515b7f007338674de1e124cf77e125">mlx::core::io::Writer</a>, <a class="el" href="struct_read_writer.html#ac2ea71e41740ddc863890e3e8e6f09d0">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>write_padded()&#160;:&#160;<a class="el" href="struct_read_writer.html#a95367307acace2aa88226cf8956d2d88">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
diff --git a/docs/build/html/functions_g.html b/docs/build/html/functions_g.html
index f8758d443..34d40c6ee 100644
--- a/docs/build/html/functions_g.html
+++ b/docs/build/html/functions_g.html
@@ -91,11 +91,9 @@ $(function(){ initResizable(false); });
 <li>GatherMM()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#afd9bbc08138181b80e2fb86536ff3f2a">mlx::core::GatherMM</a></li>
 <li>GatherQMM()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360">mlx::core::GatherQMM</a></li>
 <li>gemm_k_iterations&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a8b50863e4e2d3481c154be6c3629bf51">mlx::steel::ImplicitGemmConv2DParams</a></li>
-<li>gemm_k_iterations_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2">MLXFastAttentionParams</a></li>
+<li>gemm_k_iterations_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998">mlx::steel::GEMMSpiltKParams</a></li>
 <li>gemm_loop()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
-<li>gemm_n_iterations_aligned&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803">MLXFastAttentionParams</a></li>
 <li>gemm_params&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ab0724eb3ef52ee773b6607f6433b9f2c">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af59f9d356c4c3ec5627dc5a263d239d4">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#acc778b3c0b7ec38a43e8ea943df8704c">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
-<li>gemm_sv_m_block_iterations&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c">MLXFastAttentionParams</a></li>
 <li>get_active_memory()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a7a3ad4e33d57a47474c98e2f88e775d7">mlx::core::metal::MetalAllocator</a></li>
 <li>get_architecture()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b">mlx::core::metal::Device</a></li>
 <li>get_cache_memory()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#ad3cabbe638917ca4114eb74dcabe381f">mlx::core::metal::MetalAllocator</a></li>
@@ -112,6 +110,7 @@ $(function(){ initResizable(false); });
 <li>good_size_cmplx()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1util.html#a758e00d242a1b7eda8f9f0c21f35c624">pocketfft::detail::util</a></li>
 <li>good_size_real()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1util.html#a173da7d5929ded86fffcebcfdc5086aa">pocketfft::detail::util</a></li>
 <li>gpu&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_device.html#a45ed081b56ae5d4ddd39c83a5d8a1616">mlx::core::Device</a></li>
+<li>gqa_factor&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841">mlx::steel::AttnParams</a></li>
 <li>Greater()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b">mlx::core::Greater</a></li>
 <li>GreaterEqual()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527">mlx::core::GreaterEqual</a></li>
 <li>grid&#160;:&#160;<a class="el" href="struct_read_writer.html#ac7a957f99873d3797081f5d620f3b2c8">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
diff --git a/docs/build/html/functions_h.html b/docs/build/html/functions_h.html
index 922da92d6..857a46f57 100644
--- a/docs/build/html/functions_h.html
+++ b/docs/build/html/functions_h.html
@@ -87,6 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_h" name="index_h"></a>- h -</h3><ul>
+<li>H&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7">mlx::steel::AttnParams</a></li>
 <li>Hadamard()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923">mlx::core::Hadamard</a></li>
 <li>has_mul_operand_mask&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#ad47223ee49b3cb7bf3746a2cec45f883">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a8db6f01f96a36b216acd801c34a96ef5">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>has_mul_output_mask&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a0edbf2dd6a6563e7afa6dab6b670615c">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a8eb06f6569e4042e24fee220b11fa10d">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
diff --git a/docs/build/html/functions_i.html b/docs/build/html/functions_i.html
index 57bdb5355..5c35ec29d 100644
--- a/docs/build/html/functions_i.html
+++ b/docs/build/html/functions_i.html
@@ -97,10 +97,10 @@ $(function(){ initResizable(false); });
 <li>in_strides&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#ab25eade6573784985dbea1216f9068cf">MLXConvParams&lt; NDIM &gt;</a></li>
 <li>in_tracing()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1detail_1_1_in_tracing.html#ac52b8e2c3f808d3076c4e1ebaf9dc63d">mlx::core::detail::InTracing</a></li>
 <li>increment_command_buffer_ops()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#a7a33d4d601423a3d3c23d5ad7072abb6">mlx::core::metal::Device</a></li>
-<li>index&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac">mlx::core::Device</a>, <a class="el" href="structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626">mlx::core::Stream</a></li>
+<li>index&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a2cd3b616739b3d5b41e5b46ae335957d">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a>, <a class="el" href="structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac">mlx::core::Device</a>, <a class="el" href="structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626">mlx::core::Stream</a></li>
 <li>infinity()&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61eb741e7af49046beb863abf023b206">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 <li>init&#160;:&#160;<a class="el" href="struct_cum_max.html#a16480052a2eeb4340e546838aab59cc4">CumMax&lt; U &gt;</a>, <a class="el" href="struct_cum_min.html#a8b67f739c620d0cc194b533190990ab9">CumMin&lt; U &gt;</a>, <a class="el" href="struct_cum_prod_3_01bool_01_4.html#ae7a8b0ba9e6898356b87b18766e76d2c">CumProd&lt; bool &gt;</a>, <a class="el" href="struct_less_than.html#abf97a6b0163048e4ba96460939dbd3a3">LessThan&lt; T &gt;</a></li>
-<li>inner_looper&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">looped_elem_to_loc&lt; dim, offset_t &gt;</a></li>
+<li>inner_looper&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></li>
 <li>inp_jump_c&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a78d30e843d65d1829623afb0b607f0a5">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>inp_jump_h&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a03685a4066cdb11ffb647408e2c5b122">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>inp_jump_w&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#acf168c72f4a86b72b8f5f386f07c9d8c">mlx::steel::ImplicitGemmConv2DParams</a></li>
@@ -108,13 +108,12 @@ $(function(){ initResizable(false); });
 <li>insert()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_residency_set.html#aaafe1a4305a107d4bcdd4f35d3df09b3">mlx::core::metal::ResidencySet</a></li>
 <li>InTracing()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1detail_1_1_in_tracing.html#a7a77f19391498afa5dcea3509d241a70">mlx::core::detail::InTracing</a></li>
 <li>inv&#160;:&#160;<a class="el" href="struct_read_writer.html#a773fa8524515bdc2ff8b0e2060741813">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>INV_ALPHA&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644">MLXScaledDotProductAttentionParams</a></li>
 <li>Inverse()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad">mlx::core::Inverse</a></li>
 <li>iofs()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1multi__iter.html#ad33360d4a8ab8e6d72efadc6f9cb5bfa">pocketfft::detail::multi_iter&lt; N &gt;</a></li>
 <li>iS&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a72e1c3b4da0f70622cf18036bbf97fe6">MLXConvParams&lt; NDIM &gt;</a></li>
 <li>is_available()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#aebed1f37c19197be76105161102a8a40">mlx::core::array</a></li>
 <li>is_donatable()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a4677a404b5d191af20b52649225de087">mlx::core::array</a></li>
-<li>is_equivalent()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e3a8fb38af">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">mlx::core::Exp</a>, <a class="el" 
href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99">mlx::core::LogicalNot</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">mlx::core::Remainder</a>, <a class="el" 
href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b">mlx::core::Subtract</a>, <a 
class="el" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64">mlx::core::View</a></li>
+<li>is_equivalent()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e3a8fb38af">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">mlx::core::ErfInv</a>, <a class="el" 
href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62">mlx::core::fast::ScaledDotProductAttention</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be">mlx::core::LogicalAnd</a>, <a class="el" 
href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">mlx::core::Reduce</a>, <a class="el" 
href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3">mlx::core::StopGradient</a>, <a 
class="el" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64">mlx::core::View</a></li>
 <li>is_open()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1io_1_1_file_writer.html#ad5d2ee671a81700cb1658c41309d6676">mlx::core::io::FileWriter</a>, <a class="el" href="classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a653009adbcbce8248bc666df502fdbde">mlx::core::io::ParallelFileReader</a>, <a class="el" href="classmlx_1_1core_1_1io_1_1_reader.html#a780f504058bd9c80cb3d105046a9f985">mlx::core::io::Reader</a>, <a class="el" href="classmlx_1_1core_1_1io_1_1_writer.html#a85aa36bdb0dbfb8c5b6cfd955b03417a">mlx::core::io::Writer</a></li>
 <li>is_ready()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html#ab41ecc5adb6187aa2682ca190fd920f3">pocketfft::detail::threading::latch</a></li>
 <li>is_signaled()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#a05a9a3de88185b4a89e154242b4e770a">mlx::core::Event</a></li>
diff --git a/docs/build/html/functions_j.html b/docs/build/html/functions_j.html
index 8b36f0dba..d289699d8 100644
--- a/docs/build/html/functions_j.html
+++ b/docs/build/html/functions_j.html
@@ -88,7 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_j" name="index_j"></a>- j -</h3><ul>
 <li>jump_params&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a21b9ee9168dad4af84a611f861519e77">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aa5611e9a84bebaee966d2b339c214ff5">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
-<li>jvp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205">mlx::core::Compiled</a>, <a class="el" 
href="classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af">mlx::core::Floor</a>, <a class="el" 
href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038">mlx::core::Minimum</a>, <a class="el" 
href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b">mlx::core::Sign</a>, <a class="el" 
href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1">mlx::core::Transpose</a></li>
+<li>jvp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205">mlx::core::Compiled</a>, <a class="el" 
href="classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6">mlx::core::FFT</a>, <a class="el" 
href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39">mlx::core::Maximum</a>, <a class="el" 
href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db">mlx::core::Sigmoid</a>, <a class="el" 
href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1">mlx::core::Transpose</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_k.html b/docs/build/html/functions_k.html
index b0d096968..de7253275 100644
--- a/docs/build/html/functions_k.html
+++ b/docs/build/html/functions_k.html
@@ -87,8 +87,10 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_k" name="index_k"></a>- k -</h3><ul>
-<li>K&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23">MLXFastAttentionParams</a></li>
-<li>kCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>K&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba">mlx::steel::ImplicitGemmConv2DParams</a></li>
+<li>K_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974">mlx::steel::AttnParams</a></li>
+<li>kCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901">mlx::steel::CShape&lt; R, C &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>kColsPerThread&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kdil&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a7611db8f1621c7e09fc685ed44073b14">MLXConvParams&lt; NDIM &gt;</a></li>
 <li>kElemCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
 <li>kElemRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
@@ -99,11 +101,12 @@ $(function(){ initResizable(false); });
 <li>kFragRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kFragSize&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>Kind&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715">mlx::core::Dtype</a></li>
+<li>kL&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63">mlx::steel::AttnParams</a></li>
 <li>kNumFrags&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
-<li>kRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>kRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993">mlx::steel::CShape&lt; R, C &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>kRowsPerThread&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kTileCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kTileRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
-<li>KV_TILES&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0">MLXScaledDotProductAttentionParams</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_l.html b/docs/build/html/functions_l.html
index 2f996b3ae..abe2b1929 100644
--- a/docs/build/html/functions_l.html
+++ b/docs/build/html/functions_l.html
@@ -92,15 +92,11 @@ $(function(){ initResizable(false); });
 <li>latch()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html#a9260c871bb927525f7f53caa23a19c15">pocketfft::detail::threading::latch</a></li>
 <li>LayerNorm()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5ac38d50e62850589bf51ee313303153">mlx::core::fast::LayerNorm</a></li>
 <li>LayerNormVJP()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a41bc1391dbc0cf63b2c85b67956c08d9">mlx::core::fast::LayerNormVJP</a></li>
+<li>layout&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a></li>
 <li>lda&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#afec42b532ffcad32bbffd494526bef03">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a6fac3c4a7c35af7b46b53f9662f882c6">mlx::steel::GEMMSpiltKParams</a></li>
 <li>ldb&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a6032a081ab707c14b5f28069faa7cf62">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a7f6f511854ccc98fa573bb560776ebed">mlx::steel::GEMMSpiltKParams</a></li>
 <li>ldc&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a801e2245a36632160975a784b762a4e6">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a888730efa5c5c8ae7ed771c3084d583c">mlx::steel::GEMMSpiltKParams</a></li>
 <li>ldd&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a6e8ae14e3f97c499ad9c39358a1855ab">mlx::steel::GEMMParams</a></li>
-<li>ldk&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">MLXFastAttentionParams</a></li>
-<li>ldo&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c">MLXFastAttentionParams</a></li>
-<li>ldq&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58">MLXFastAttentionParams</a></li>
-<li>lds&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a">MLXFastAttentionParams</a></li>
-<li>ldv&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b">MLXFastAttentionParams</a></li>
 <li>LeftShift&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23da986b39e75cbe29fcda1d7bf7942a65a0">mlx::core::BitwiseBinary</a></li>
 <li>length()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1pocketfft__c.html#a1fd1a2f9b3ae5ee9f00b9ca6946eb16d">pocketfft::detail::pocketfft_c&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1pocketfft__r.html#a83222fdbf81a7c6d560e0841cdfca8c6">pocketfft::detail::pocketfft_r&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dcst23.html#a6dab012b487ff98d11b8a9418653a478">pocketfft::detail::T_dcst23&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dcst4.html#af25bf28a7ccd4690ca9934e3aa79c12f">pocketfft::detail::T_dcst4&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dct1.html#ac7a04c91d507bd8f173d2266bb5bb168">pocketfft::detail::T_dct1&lt; T0 &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1_t__dst1.html#ab205d901650e38b592ff860b7978fa3e">pocketfft::detail::T_dst1&lt; T0 &gt;</a></li>
 <li>length_in()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1multi__iter.html#a5318b79d934cddf109dff7bf96a330c8">pocketfft::detail::multi_iter&lt; N &gt;</a></li>
@@ -111,19 +107,20 @@ $(function(){ initResizable(false); });
 <li>Load()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a">mlx::core::Load</a></li>
 <li>load()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a>, <a class="el" href="struct_read_writer.html#a120eaf4b5f32e80972a18d14e82a2d75">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>load_padded()&#160;:&#160;<a class="el" href="struct_read_writer.html#add5bd3f647793a5a19d63197a19df73c">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>load_safe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>load_safe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>load_strided()&#160;:&#160;<a class="el" href="struct_read_writer.html#a998ef484bade81f726b9edfc6b878197">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>load_unsafe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" 
href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
-<li>loader_a_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa8a04ed74d2259f99b337d4662c64d83">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
-<li>loader_b_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa98f32278b5fd98c93ae5483c3596395">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
+<li>load_unsafe()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>loader_a_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a98b6ec692580510081e2aa887a61944b">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
+<li>loader_b_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1a115d5af0fb6e260165adba2e377635">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
 <li>loc&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html#a027b29e06d5cb467d961c019699514b1">mlx::core::ContiguousIterator&lt; StrideT &gt;</a></li>
-<li>location()&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2">looped_elem_to_loc&lt; 0, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90">looped_elem_to_loc&lt; 1, offset_t &gt;</a></li>
+<li>location()&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
 <li>Log()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9">mlx::core::Log</a></li>
 <li>Log1p()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a">mlx::core::Log1p</a></li>
 <li>LogAddExp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a">mlx::core::LogAddExp</a></li>
 <li>LogicalAnd()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3">mlx::core::LogicalAnd</a></li>
 <li>LogicalNot()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7">mlx::core::LogicalNot</a></li>
 <li>LogicalOr()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918">mlx::core::LogicalOr</a></li>
+<li>LoopedElemToLoc()&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
 <li>lowest()&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_m.html b/docs/build/html/functions_m.html
index 3be734fe6..5d68a72da 100644
--- a/docs/build/html/functions_m.html
+++ b/docs/build/html/functions_m.html
@@ -87,14 +87,14 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_m" name="index_m"></a>- m -</h3><ul>
-<li>M&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8">MLXFastAttentionParams</a></li>
+<li>M&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>make_arrays()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a1173db4e23f5a8230911cb8fba45d5e6">mlx::core::array</a></li>
 <li>malloc()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_allocator.html#a9a17d2c7a97772bf4a15e6c74af34ca4">mlx::core::allocator::Allocator</a>, <a class="el" href="classmlx_1_1core_1_1allocator_1_1_common_allocator.html#a4f3d5de6b8c0eba22e9403b28a5ef3f0">mlx::core::allocator::CommonAllocator</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a6c0feb9b1ff9977f76c69745393944bc">mlx::core::metal::MetalAllocator</a></li>
 <li>mask_h&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0b892c1a7edb9ed20c076d8945855c19">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>mask_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a270ab3da7c98a12525a59952742cc97d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>mask_w&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a19ddba7259c3c2c02ed90f3f635557be">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>mat_at()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
-<li>mat_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>mat_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>Matmul()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7">mlx::core::Matmul</a></li>
 <li>Max&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924ac00cf69bbba24f7ab08d3ad618705988">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a3d11c500ea4f7f639e20dd0755d39260">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16">mlx::core::Scatter</a></li>
 <li>max&#160;:&#160;<a class="el" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">Limits&lt; U &gt;</a>, <a class="el" href="struct_limits_3_01bfloat16__t_01_4.html#a0ead3618da6718629ea9fa4670b5005f">Limits&lt; bfloat16_t &gt;</a>, <a class="el" href="struct_limits_3_01bool_01_4.html#acbd2132145888d51220558a101ffcff4">Limits&lt; bool &gt;</a>, <a class="el" href="struct_limits_3_01complex64__t_01_4.html#ac01c274b224b90f5210b675a484f4607">Limits&lt; complex64_t &gt;</a>, <a class="el" href="struct_limits_3_01float_01_4.html#aba172b22b388190aa3969ef16885d8a6">Limits&lt; float &gt;</a>, <a class="el" href="struct_limits_3_01half_01_4.html#a4f9515dbf2a622074f121bea39a7b175">Limits&lt; half &gt;</a>, <a class="el" href="struct_limits_3_01int16__t_01_4.html#a12d64c398ca7609b7c906f3cf1a6f678">Limits&lt; int16_t &gt;</a>, <a class="el" href="struct_limits_3_01int32__t_01_4.html#af756344b31e84222dd73d3445dcd5640">Limits&lt; int32_t &gt;</a>, <a class="el" href="struct_limits_3_01int64__t_01_4.html#ac9c420604c0f3d237ddfb2b8a2439224">Limits&lt; int64_t &gt;</a>, <a class="el" href="struct_limits_3_01int8__t_01_4.html#a96fed01fa9249226be69760652643289">Limits&lt; int8_t &gt;</a>, <a class="el" href="struct_limits_3_01uint16__t_01_4.html#a228b33556ba4cb7e6137ab6258628488">Limits&lt; uint16_t &gt;</a>, <a class="el" href="struct_limits_3_01uint32__t_01_4.html#a91fa8f7214ec936976a8324c7431c651">Limits&lt; uint32_t &gt;</a>, <a class="el" href="struct_limits_3_01uint64__t_01_4.html#aa8c2257881a4e1fa8596fa07dba5e107">Limits&lt; uint64_t &gt;</a>, <a class="el" href="struct_limits_3_01uint8__t_01_4.html#a1570fb640e2e41f96776db5ca08d500c">Limits&lt; uint8_t &gt;</a>, <a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
@@ -111,9 +111,9 @@ $(function(){ initResizable(false); });
 <li>min_exponent10&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aeaed172780720e06b8731cef3177e277">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 <li>Minimum()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5">mlx::core::Minimum</a></li>
 <li>mma()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>mma_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#add8c6a31011a4895667c2a94a5af3782">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
-<li>MMAFrag_acc_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae2c42cb6d0dde785859164c195f4d13c">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>MMAFrag_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>mma_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ae52eb09c9478cd4f199662346ac0c83e">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
+<li>MMAFrag_acc_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8231b0e3475077c1381eb8f5daf62e35">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>MMAFrag_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>MMATile()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>move_shared_buffer()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#acce00db63e0f3d80f797b02397ade836">mlx::core::array</a></li>
 <li>mtl_device()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653">mlx::core::metal::Device</a></li>
diff --git a/docs/build/html/functions_n.html b/docs/build/html/functions_n.html
index 543b19882..ff59dacbd 100644
--- a/docs/build/html/functions_n.html
+++ b/docs/build/html/functions_n.html
@@ -87,15 +87,13 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_n" name="index_n"></a>- n -</h3><ul>
-<li>N&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932">MLXConvParams&lt; NDIM &gt;</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167">MLXFastAttentionParams</a></li>
+<li>N&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932">MLXConvParams&lt; NDIM &gt;</a></li>
 <li>n&#160;:&#160;<a class="el" href="struct_read_writer.html#a655346c9ebfc33a69da3f1c1d4238dfb">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>n_active_tasks()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a3c9fa21442974acba3409d49bb033131">mlx::core::scheduler::Scheduler</a></li>
 <li>n_channels&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_channel_helper.html#aa476bd0fcb38494c268547fc9820fc0a">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a06c2fb9c93660e8f6916228cd77f9494">mlx::steel::ChannelHelper&lt; 1 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#ac66ff37bc2cf78d96667192a6cca73b5">mlx::steel::ChannelHelper&lt; 2 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a071c015713b7bab09930661165517eff">mlx::steel::ChannelHelper&lt; 3 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#a167b00a84adf93b60e3d7a943d5eb977">mlx::steel::ChannelHelper&lt; 4 &gt;</a></li>
-<li>N_KV_HEADS&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7">MLXScaledDotProductAttentionParams</a></li>
 <li>N_PER_BLOCK&#160;:&#160;<a class="el" href="struct_kernel_merge_sort.html#a959aaf5bfb70796a525fed318f7ae8ab">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a>, <a class="el" href="struct_kernel_multi_block_merge_sort.html#ae5113ca5852d11999ae932439af95a5c">KernelMultiBlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></li>
-<li>N_Q_HEADS&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177">MLXScaledDotProductAttentionParams</a></li>
 <li>n_reads&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
-<li>n_rows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
+<li>n_rows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding 
&gt;</a></li>
 <li>names&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_node_namer.html#a57823f9a2cdc60b2f06f857b36019277">mlx::core::NodeNamer</a></li>
 <li>nbytes()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a387b67cd3ef5cfc1e749c371766c4a05">mlx::core::array</a></li>
 <li>ndarr()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1ndarr.html#a8f0037a172d96cb1ad915a5069175fa2">pocketfft::detail::ndarr&lt; T &gt;</a></li>
@@ -104,11 +102,15 @@ $(function(){ initResizable(false); });
 <li>Negative()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70">mlx::core::Negative</a></li>
 <li>new_queue()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67">mlx::core::metal::Device</a></li>
 <li>new_stream()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a157c8da85fa1bddb8eacf8515a3cc879">mlx::core::scheduler::Scheduler</a></li>
-<li>next()&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0">looped_elem_to_loc&lt; 0, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4">looped_elem_to_loc&lt; 1, offset_t &gt;</a>, <a class="el" href="classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18">mlx::core::random::KeySequence</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>next()&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a>, <a class="el" href="classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18">mlx::core::random::KeySequence</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>NK&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e">mlx::steel::AttnParams</a></li>
+<li>NK_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58">mlx::steel::AttnParams</a></li>
 <li>None&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca87a6a1927de175b71d7d0b5c11b8665c">mlx::core::Scatter</a></li>
 <li>NotEqual()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9">mlx::core::NotEqual</a></li>
 <li>notify_new_task()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ae8aa34a9be8bc73508dd500000421173">mlx::core::scheduler::Scheduler</a></li>
 <li>notify_task_completion()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#abbb2b1c2f8bae2b9c7cc51db65f18a3b">mlx::core::scheduler::Scheduler</a></li>
+<li>NQ&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1">mlx::steel::AttnParams</a></li>
+<li>NQ_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">mlx::steel::AttnParams</a></li>
 <li>NumberOfElements()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06">mlx::core::NumberOfElements</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_o.html b/docs/build/html/functions_o.html
index cb2ffa938..3565ce21f 100644
--- a/docs/build/html/functions_o.html
+++ b/docs/build/html/functions_o.html
@@ -88,7 +88,8 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_o" name="index_o"></a>- o -</h3><ul>
 <li>O&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#ad55ff586d30072d8154865f9dfe92d97">MLXConvParams&lt; NDIM &gt;</a></li>
-<li>offset&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a7aebc0b0656e3a55d0dbca27a57d600e">looped_elem_to_loc&lt; 1, offset_t &gt;</a></li>
+<li>O_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4">mlx::steel::AttnParams</a></li>
+<li>offset&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af792b1fd4e8286f97b9b863c127a2d9a">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a3a18944c158e2747a6ddebb420299a3b">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
 <li>ofs()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1rev__iter.html#a78c3b4ad19edf9d20cab40ad109e9dd1">pocketfft::detail::rev_iter</a>, <a class="el" href="classpocketfft_1_1detail_1_1simple__iter.html#ab59481ad9c8f04addb907c3ebb89f8fa">pocketfft::detail::simple_iter</a></li>
 <li>oofs()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1multi__iter.html#aae63e67caac095d474ddd32daa5ffa34">pocketfft::detail::multi_iter&lt; N &gt;</a></li>
 <li>Op&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23d">mlx::core::BitwiseBinary</a></li>
@@ -108,7 +109,6 @@ $(function(){ initResizable(false); });
 <li>operator+=()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#ad4e69dcd89bdb7764c9c5807168f911e">pocketfft::detail::cmplx&lt; T &gt;</a></li>
 <li>operator-()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#a460da5db36d1c72fb1ed3496fd3abde4">pocketfft::detail::cmplx&lt; T &gt;</a></li>
 <li>operator-=()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#a12441ff423274bd1b54245933d69ad7e">pocketfft::detail::cmplx&lt; T &gt;</a></li>
-<li>operator-&gt;()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966">mlx::core::metal::CommandEncoder</a></li>
 <li>operator=()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1___m_l_x___b_float16.html#a0f65b0523b8ddd989f338da6cb2860e3">mlx::core::_MLX_BFloat16</a>, <a class="el" href="structmlx_1_1core_1_1___m_l_x___float16.html#a608a099bf7116ee608dcfd31ea3ade2c">mlx::core::_MLX_Float16</a>, <a class="el" href="classmlx_1_1core_1_1allocator_1_1_allocator.html#a027b84cddc8d476f736ac1f1a9991fe4">mlx::core::allocator::Allocator</a>, <a class="el" href="structmlx_1_1core_1_1array_1_1_data.html#a68e9417954fe811b5e41e6317a526748">mlx::core::array::Data</a>, <a class="el" href="classmlx_1_1core_1_1array.html#a8acf2b4c75f9b7f79da6675dbc36cf36">mlx::core::array</a>, <a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e">mlx::core::metal::CommandEncoder</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73">mlx::core::metal::Device</a>, <a class="el" href="classmlx_1_1core_1_1metal_1_1_residency_set.html#aef97dbbc755940789f99a26164591c45">mlx::core::metal::ResidencySet</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a6b1be7ea92f3a7bb19875c70259dad6b">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ab170dbd2ce34c51e2eeebf5d08e7e2db">mlx::core::scheduler::Scheduler</a>, <a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a0a859309a4f192f2679e07f2e4ff4d22">mlx::core::UnaryPrimitive</a></li>
 <li>operator==&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html#a1afd6d2a19a2b0d712063f221ab4eba7">mlx::core::array::ArrayIterator</a></li>
 <li>operator[]()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1arr.html#aea0bd899b19e03f54dfd6c188727061a">pocketfft::detail::arr&lt; T &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1cndarr.html#ae4852d1fe936a5d61832b507816c7054">pocketfft::detail::cndarr&lt; T &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1ndarr.html#a2b2c4e205e8b5c32c9fe55dfd7b8c8d8">pocketfft::detail::ndarr&lt; T &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1sincos__2pibyn.html#a71b02f67c47b24adb296eafd2c7a3598">pocketfft::detail::sincos_2pibyn&lt; T &gt;</a></li>
@@ -118,7 +118,7 @@ $(function(){ initResizable(false); });
 <li>out&#160;:&#160;<a class="el" href="struct_read_writer.html#abea3b913c952c505d0ca4e529c7316ef">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>out_of_bounds()&#160;:&#160;<a class="el" href="struct_read_writer.html#a08e10626fbc789b6dff9172fd6c36f7c">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>out_strides&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a0c8b2cfc26859a2af9d39a2cfcc3aea6">MLXConvParams&lt; NDIM &gt;</a></li>
-<li>output_shapes()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f">mlx::core::Conjugate</a>, <a class="el" 
href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f">mlx::core::LessEqual</a>, <a class="el" 
href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65">mlx::core::Reduce</a>, <a class="el" 
href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325">mlx::core::Tanh</a></li>
+<li>output_shapes()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f">mlx::core::Conjugate</a>, <a class="el" 
href="classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5">mlx::core::Real</a>, <a class="el" 
href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325">mlx::core::Tanh</a></li>
 <li>outputs()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a2c186fd527f984f0589d4183b4976289">mlx::core::array</a>, <a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">mlx::core::metal::CommandEncoder</a>, <a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9">mlx::core::metal::DeviceStream</a></li>
 <li>overwrite_descriptor()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a95e6b156c8e05439f076b85c05079387">mlx::core::array</a></li>
 </ul>
diff --git a/docs/build/html/functions_p.html b/docs/build/html/functions_p.html
index 744b1bc4d..d99c6b8ec 100644
--- a/docs/build/html/functions_p.html
+++ b/docs/build/html/functions_p.html
@@ -102,7 +102,7 @@ $(function(){ initResizable(false); });
 <li>primitive()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a790548666511d8c6d9f92ee79d2ce14c">mlx::core::array</a></li>
 <li>primitive_id()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#af5ad83605d4eea81561246873bee1d7c">mlx::core::array</a></li>
 <li>primitive_ptr()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a5119cd616ec3c05d65878944b8889469">mlx::core::array</a></li>
-<li>print()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84">mlx::core::Eigh</a>, <a class="el" 
href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950">mlx::core::LessEqual</a>, <a 
class="el" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60">mlx::core::Power</a>, <a class="el" 
href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">mlx::core::Primitive</a>, <a class="el" href="structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b">mlx::core::PrintFormatter</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77">mlx::core::Sinh</a>, <a class="el" 
href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c">mlx::core::View</a></li>
+<li>print()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e">mlx::core::Arange</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d">mlx::core::BitwiseBinary</a>, <a class="el" 
href="classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">mlx::core::DivMod</a>, <a class="el" 
href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa">mlx::core::Load</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0">mlx::core::Partition</a>, <a class="el" 
href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">mlx::core::Primitive</a>, <a class="el" href="structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b">mlx::core::PrintFormatter</a>, <a class="el" href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b">mlx::core::QRF</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4">mlx::core::Sin</a>, <a class="el" 
href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c">mlx::core::View</a></li>
 <li>Prod&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924acdd1ec09a2fd99c81c561b5c63a4b482">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9ac5b077bfec55fe2b141b197dfa00ecf7">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1a33edce755ed1a74632c302ad93a14789">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca5e43e8ffd1f5ba49826e2e7ac3450466">mlx::core::Scatter</a></li>
 <li>prod()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1util.html#a33acae07b20b28fe4658bc338fce1b89">pocketfft::detail::util</a></li>
 <li>ptr()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1allocator_1_1_buffer.html#a990643feac06961c5599aac098c17b94">mlx::core::allocator::Buffer</a></li>
diff --git a/docs/build/html/functions_q.html b/docs/build/html/functions_q.html
index 0d170e5ee..11a461a45 100644
--- a/docs/build/html/functions_q.html
+++ b/docs/build/html/functions_q.html
@@ -88,10 +88,11 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_q" name="index_q"></a>- q -</h3><ul>
 <li>q&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#adf608e22d0c0397217472408aab52631">mlx::core::scheduler::StreamThread</a></li>
+<li>Q_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563">mlx::steel::AttnParams</a></li>
+<li>qL&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f">mlx::steel::AttnParams</a></li>
 <li>QRF()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983">mlx::core::QRF</a></li>
-<li>QuantizedBlockLoader()&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>QuantizedBlockLoader()&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>QuantizedMatmul()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c">mlx::core::QuantizedMatmul</a></li>
-<li>QUERY_SEQUENCE_LENGTH&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c">MLXScaledDotProductAttentionParams</a></li>
 <li>queue&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">mlx::core::metal::DeviceStream</a></li>
 <li>quiet_NaN()&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 </ul>
diff --git a/docs/build/html/functions_r.html b/docs/build/html/functions_r.html
index 7c70413c7..9362b7743 100644
--- a/docs/build/html/functions_r.html
+++ b/docs/build/html/functions_r.html
@@ -87,7 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_r" name="index_r"></a>- r -</h3><ul>
-<li>r&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692">pocketfft::detail::cmplx&lt; T &gt;</a></li>
+<li>r&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a>, <a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692">pocketfft::detail::cmplx&lt; T &gt;</a></li>
 <li>r2h&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1_exec_r2_r.html#a925b398c8e1868614ce9eaf381d02b7e">pocketfft::detail::ExecR2R</a></li>
 <li>radix&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aaefa8c2cadd11ac7e22f7b2c5edbd1cd">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 <li>RandomBits()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_random_bits.html#a7dd5ed03f2a4ab45d1d5e8e2b587de6b">mlx::core::RandomBits</a></li>
@@ -127,7 +127,10 @@ $(function(){ initResizable(false); });
 <li>RoPE()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a031fa27aeba94cfa5ccc633137a12163">mlx::core::fast::RoPE</a></li>
 <li>Round()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde">mlx::core::Round</a></li>
 <li>round_error()&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
+<li>row_bin_op()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>row_contiguous&#160;:&#160;<a class="el" href="struct_indices.html#a255e340a39c6ac28ef2c232b106f85d1">Indices&lt; IdxT, NIDX &gt;</a>, <a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html#a3170fa381dc7a90f6eabcc029bdf9bfd">mlx::core::array::Flags</a></li>
+<li>row_frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
+<li>row_reduce()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>run()&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#ac4a7b5011a0ea938ab1949bb1767fc1a">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a5d68656832de892f33db939005713927">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_s.html b/docs/build/html/functions_s.html
index 062daa4dd..f3e41cc16 100644
--- a/docs/build/html/functions_s.html
+++ b/docs/build/html/functions_s.html
@@ -88,7 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_s" name="index_s"></a>- s -</h3><ul>
 <li>sanity_check()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1util.html#a3d2f5f00d4fed5c09bd1509ffa9a44cd">pocketfft::detail::util</a></li>
-<li>scale&#160;:&#160;<a class="el" href="struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b">ScaleOp&lt; OutT, InT &gt;</a></li>
+<li>scale&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826">mlx::steel::AttnParams</a>, <a class="el" href="struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b">ScaleOp&lt; OutT, InT &gt;</a>, <a class="el" href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">TransformScale&lt; T &gt;</a></li>
 <li>ScaledDotProductAttention()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a6a904c377fffc3c193102a3123f5e706">mlx::core::fast::ScaledDotProductAttention</a></li>
 <li>scales&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>Scan()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087">mlx::core::Scan</a></li>
@@ -100,7 +100,9 @@ $(function(){ initResizable(false); });
 <li>Select()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">mlx::core::Select</a></li>
 <li>Send()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a2481dd876b14d4a13ac466cbca9c4eac">mlx::core::distributed::Send</a></li>
 <li>Set()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#a647fece372b64b13c4a7e5877d09a807">pocketfft::detail::cmplx&lt; T &gt;</a></li>
+<li>set_bytes()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5">mlx::core::metal::CommandEncoder</a></li>
 <li>set_cache_limit()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#af392bced29d9e4e3f1a7cc4725d83764">mlx::core::metal::MetalAllocator</a></li>
+<li>set_compute_pipeline_state()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef">mlx::core::metal::CommandEncoder</a></li>
 <li>set_data()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a631acd8e318189640b8338f9ae1a554d">mlx::core::array</a></li>
 <li>set_default_stream()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a6d15314ac9cf25efc9bd1278de9a66bb">mlx::core::scheduler::Scheduler</a></li>
 <li>set_input_array()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">mlx::core::metal::CommandEncoder</a></li>
@@ -111,8 +113,10 @@ $(function(){ initResizable(false); });
 <li>set_status()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a63598018999b49f1340b183cb303f05c">mlx::core::array</a></li>
 <li>set_tracer()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#af26e6be1a9e6239471a4c24310c0c7c8">mlx::core::array</a></li>
 <li>set_value()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#a0d077b11f4b28f882b42440b7ac6d40d">mlx::core::Event</a></li>
+<li>set_vector_bytes()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b">mlx::core::metal::CommandEncoder</a></li>
 <li>set_wired_limit()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a84fa0347da18055bc13ba0a5c4b57253">mlx::core::metal::MetalAllocator</a></li>
-<li>shape()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a4a2a2c8a4a5beafd723fc13f2055d55d">mlx::core::array</a>, <a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">mlx::core::fast::CustomKernelShapeInfo</a>, <a class="el" href="structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae">mlx::core::ReductionPlan</a>, <a class="el" href="classpocketfft_1_1detail_1_1arr__info.html#accada8146cb8d3ab7facb4c1e3413ec0">pocketfft::detail::arr_info</a></li>
+<li>shape()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a4a2a2c8a4a5beafd723fc13f2055d55d">mlx::core::array</a>, <a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">mlx::core::fast::CustomKernelShapeInfo</a>, <a class="el" href="structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae">mlx::core::ReductionPlan</a>, <a class="el" href="structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1arr__info.html#accada8146cb8d3ab7facb4c1e3413ec0">pocketfft::detail::arr_info</a></li>
+<li>Shape2D()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></li>
 <li>shapes&#160;:&#160;<a class="el" href="struct_indices.html#a5ab170f1a77636180889ddfffd4f7d2f">Indices&lt; IdxT, NIDX &gt;</a></li>
 <li>shp&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1arr__info.html#a2467e9e01de1ba4d7cd28c1af783da8d">pocketfft::detail::arr_info</a></li>
 <li>shutdown()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a0c7c29290fde806031c497f24c4ad411">pocketfft::detail::threading::thread_pool</a></li>
@@ -143,8 +147,8 @@ $(function(){ initResizable(false); });
 <li>split_k_partitions&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#ae06c27116905d4ff3b9b436e588a93fd">mlx::steel::GEMMSpiltKParams</a></li>
 <li>Sqrt()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29">mlx::core::Sqrt</a></li>
 <li>Square()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4">mlx::core::Square</a></li>
-<li>src&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
-<li>src_ld&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>src&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>src_ld&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>start_concurrent()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">mlx::core::metal::CommandEncoder</a></li>
 <li>start_row&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a220e033b689c8d6a6f319dae02b38334">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>Status&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078">mlx::core::array</a></li>
@@ -172,7 +176,7 @@ $(function(){ initResizable(false); });
 <li>Sum&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924a1fc7c1f09c80650ab0497e2d6781d65f">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a8582875544f1d3d396a1a376473ef1dd">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ade23893033e4849f5596e7ce76a5fc36">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca14abe2d8818efa71726be4e156813d6f">mlx::core::Scatter</a></li>
 <li>SVD()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1">mlx::core::SVD</a></li>
 <li>swizzle()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760">mlx::steel::BlockSwizzle</a></li>
-<li>swizzle_log&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2">MLXFastAttentionParams</a></li>
+<li>swizzle_log&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840">mlx::steel::ImplicitGemmConv2DParams</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_t.html b/docs/build/html/functions_t.html
index 0dbfaf7c8..991d2fd31 100644
--- a/docs/build/html/functions_t.html
+++ b/docs/build/html/functions_t.html
@@ -106,24 +106,25 @@ $(function(){ initResizable(false); });
 <li>thread&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a449de02bf2ac80d8fe2f208fa7eac359">mlx::core::scheduler::StreamThread</a></li>
 <li>thread_count()&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1util.html#a3b012d5a19215bcd32cf6e228556fa87">pocketfft::detail::util</a></li>
 <li>thread_fn()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a06a62c21c1174e4eb4d242e50aad7adf">mlx::core::scheduler::StreamThread</a></li>
-<li>thread_idx&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>thread_idx&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding 
&gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>thread_pool()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a37a8121a99dd06a9d44b3e80ba0ea560">pocketfft::detail::threading::thread_pool</a></li>
 <li>thread_sort_t&#160;:&#160;<a class="el" href="struct_block_merge_sort.html#ad2474d16721f4ceb954125728a0e2ea2">BlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></li>
 <li>ThreadPool()&#160;:&#160;<a class="el" href="class_thread_pool.html#ac291710e33dbbed96ee20711080d506d">ThreadPool</a></li>
 <li>threads_per_tg&#160;:&#160;<a class="el" href="struct_read_writer.html#a64c58e358da22358df3075448ea23893">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>threadsM&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a1dd943fcbf5e7be435fc36bed589a641">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a4a53e73a581aa8881b1f86ce653519e6">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>threadsN&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a47bfab7d21dd18760d3e0937ad36b19d">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#ade6f15a9744616de9dd71498ad7e758d">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
-<li>tile_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>tile_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>tile_stride_a&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>tile_stride_b&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>tiles_m&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad">MLXFastAttentionParams</a></li>
-<li>tiles_n&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029">MLXFastAttentionParams</a></li>
+<li>tiles_m&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24">mlx::steel::ImplicitGemmConv2DParams</a></li>
+<li>tiles_n&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>TM&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>TM_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>TN&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>TN_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>TransformAdd()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></li>
 <li>TransformAxpby()&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></li>
+<li>TransformScale()&#160;:&#160;<a class="el" href="struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70">TransformScale&lt; T &gt;</a></li>
 <li>Transpose()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a">mlx::core::Transpose</a></li>
 <li>TROWS&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a2aff22af70f685f858adea73f5575cf7">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a2528ff5ed472e4ed35415ada42276b07">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a789683f9ac9d9309d07c05f3bdedd2fd">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3957fb263fe040fe70683fd1d7b06487">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a94f0ce5bb7d87bc1fb6a7c2ba2b892d4">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acbc28f364381166faaeec2783dc88e10">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a5cefb1285ed13ad3490198e9303453de">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 <li>try_pop()&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#aa3807d46a126d229f9054c779105ea43">pocketfft::detail::threading::concurrent_queue&lt; T &gt;</a></li>
diff --git a/docs/build/html/functions_type.html b/docs/build/html/functions_type.html
index a8136f01f..2f396bfa7 100644
--- a/docs/build/html/functions_type.html
+++ b/docs/build/html/functions_type.html
@@ -87,7 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all typedefs with links to the classes they belong to:</div>
 
 <h3><a id="index_a" name="index_a"></a>- a -</h3><ul>
-<li>accum_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">mlx::steel::AccumHelper&lt; T &gt;</a></li>
+<li>accum_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">mlx::steel::AccumHelper&lt; T &gt;</a></li>
 </ul>
 
 
@@ -96,18 +96,23 @@ $(function(){ initResizable(false); });
 </ul>
 
 
+<h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
+<li>col_frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
+</ul>
+
+
 <h3><a id="index_d" name="index_d"></a>- d -</h3><ul>
 <li>difference_type&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html#adcee44c77980fc2370a2c31e203aead5">mlx::core::array::ArrayIterator</a></li>
 </ul>
 
 
 <h3><a id="index_e" name="index_e"></a>- e -</h3><ul>
-<li>elem_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>elem_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 </ul>
 
 
 <h3><a id="index_f" name="index_f"></a>- f -</h3><ul>
-<li>frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 </ul>
 
 
@@ -118,22 +123,23 @@ $(function(){ initResizable(false); });
 
 
 <h3><a id="index_l" name="index_l"></a>- l -</h3><ul>
-<li>loader_a_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa8a04ed74d2259f99b337d4662c64d83">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
-<li>loader_b_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa98f32278b5fd98c93ae5483c3596395">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
+<li>loader_a_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a98b6ec692580510081e2aa887a61944b">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
+<li>loader_b_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1a115d5af0fb6e260165adba2e377635">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
 </ul>
 
 
 <h3><a id="index_m" name="index_m"></a>- m -</h3><ul>
 <li>mask_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a270ab3da7c98a12525a59952742cc97d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
-<li>mat_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
-<li>mma_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#add8c6a31011a4895667c2a94a5af3782">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
-<li>MMAFrag_acc_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae2c42cb6d0dde785859164c195f4d13c">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>MMAFrag_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>mat_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>mma_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ae52eb09c9478cd4f199662346ac0c83e">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
+<li>MMAFrag_acc_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8231b0e3475077c1381eb8f5daf62e35">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>MMAFrag_t&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 </ul>
 
 
 <h3><a id="index_r" name="index_r"></a>- r -</h3><ul>
 <li>reference&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html#a44e2e1f29191c20ec4390de4fa0bd59f">mlx::core::array::ArrayIterator</a></li>
+<li>row_frag_type&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
 </ul>
 
 
diff --git a/docs/build/html/functions_u.html b/docs/build/html/functions_u.html
index 420a59b25..aea0371da 100644
--- a/docs/build/html/functions_u.html
+++ b/docs/build/html/functions_u.html
@@ -90,6 +90,7 @@ $(function(){ initResizable(false); });
 <li>UnaryPrimitive()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_unary_primitive.html#a189f6d4ed369f82a4b724a29eb056d4e">mlx::core::UnaryPrimitive</a></li>
 <li>Uniform()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1">mlx::core::Uniform</a></li>
 <li>unscheduled&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078ae8a9988458b0355001674020a45656fb">mlx::core::array</a></li>
+<li>update_fence()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2">mlx::core::metal::CommandEncoder</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_v.html b/docs/build/html/functions_v.html
index 44ca8b803..724da0a5c 100644
--- a/docs/build/html/functions_v.html
+++ b/docs/build/html/functions_v.html
@@ -87,18 +87,19 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all class members with links to the classes they belong to:</div>
 
 <h3><a id="index_v" name="index_v"></a>- v -</h3><ul>
-<li>v&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></li>
+<li>v&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></li>
+<li>V_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c">mlx::steel::AttnParams</a></li>
 <li>Val&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1da">mlx::core::Dtype</a></li>
 <li>val()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_dtype.html#a7a99656f121c8922ab82e72c8e9bd7f1">mlx::core::Dtype</a>, <a class="el" href="structmlx__atomic.html#a6f6651b8dd8149917c50cd99b13c6747">mlx_atomic&lt; T, typename &gt;</a>, <a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html#a8dbf729fcd8c4a16e41b546c7405543d">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a>, <a class="el" href="structpocketfft_1_1detail_1_1_v_l_e_n.html#ab1fdc340dedde723e636746c828a4534">pocketfft::detail::VLEN&lt; T &gt;</a></li>
-<li>val_frags&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>val_frags&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>val_t&#160;:&#160;<a class="el" href="struct_kernel_merge_sort.html#a4e3f09896275956fc4c23e1f157dca3b">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></li>
 <li>valid()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#aa77afd9669e2ef9d5e9ae1c2c6fd24fa">mlx::core::Event</a></li>
 <li>value()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_event.html#ab71c7baee3d1d02ad6a2001bbf90b970">mlx::core::Event</a>, <a class="el" href="structmlx_1_1steel_1_1integral__constant.html#a4efa69cb3fd42ac0dcad46578600d637">mlx::steel::integral_constant&lt; T, v &gt;</a></li>
 <li>value_type&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html#ae24fe304397e961687d0d4c7012b8ae4">mlx::core::array::ArrayIterator</a>, <a class="el" href="structmlx_1_1steel_1_1integral__constant.html#a0569cc1334e0bc4f474304b33d365759">mlx::steel::integral_constant&lt; T, v &gt;</a>, <a class="el" href="structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#ab86a2740ed9ce3199135372ff1d88c76">pocketfft::detail::threading::aligned_allocator&lt; T &gt;</a></li>
-<li>vec_size&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75">mlx::steel::ChannelHelper&lt; 1 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af">mlx::steel::ChannelHelper&lt; 2 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f">mlx::steel::ChannelHelper&lt; 3 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca">mlx::steel::ChannelHelper&lt; 4 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
+<li>vec_size&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75">mlx::steel::ChannelHelper&lt; 1 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af">mlx::steel::ChannelHelper&lt; 2 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f">mlx::steel::ChannelHelper&lt; 3 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca">mlx::steel::ChannelHelper&lt; 4 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 <li>View()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e">mlx::core::View</a></li>
-<li>vjp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18">mlx::core::Broadcast</a>, <a class="el" 
href="classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8">mlx::core::Exp</a>, <a class="el" 
href="classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028">mlx::core::LessEqual</a>, <a class="el" 
href="classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3">mlx::core::Sqrt</a>, <a class="el" 
href="classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80">mlx::core::Transpose</a></li>
-<li>vmap()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4">mlx::core::Ceil</a>, <a class="el" 
href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">mlx::core::ErfInv</a>, <a 
class="el" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49">mlx::core::Log</a>, <a class="el" 
href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415">mlx::core::RandomBits</a>, <a class="el" 
href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6">mlx::core::Split</a>, <a class="el" href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e">mlx::core::Sqrt</a>, <a class="el" 
href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121">mlx::core::View</a></li>
+<li>vjp()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062">mlx::core::AsStrided</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120">mlx::core::BlockMaskedMM</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18">mlx::core::Broadcast</a>, <a class="el" 
href="classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb">mlx::core::Ceil</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#abf488f02057fd5852f38b2e8a600ad2a">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">mlx::core::Convolution</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">mlx::core::Depends</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909">mlx::core::Erf</a>, <a class="el" href="classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189">mlx::core::ErfInv</a>, <a class="el" 
href="classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6">mlx::core::fast::LayerNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb">mlx::core::fast::RMSNorm</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533">mlx::core::fast::RoPE</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda">mlx::core::GatherMM</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50">mlx::core::Less</a>, <a class="el" 
href="classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880">mlx::core::Log1p</a>, <a class="el" href="classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42">mlx::core::Primitive</a>, <a class="el" 
href="classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26">mlx::core::QuantizedMatmul</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674">mlx::core::Split</a>, <a 
class="el" href="classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80">mlx::core::Transpose</a></li>
+<li>vmap()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f">mlx::core::Abs</a>, <a class="el" href="classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646">mlx::core::Add</a>, <a class="el" href="classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81">mlx::core::AddMM</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83">mlx::core::ArcCos</a>, <a class="el" href="classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461">mlx::core::ArcCosh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82">mlx::core::ArcSin</a>, <a class="el" href="classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d">mlx::core::ArcSinh</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634">mlx::core::ArcTan2</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556">mlx::core::ArcTan</a>, <a class="el" href="classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040">mlx::core::ArcTanh</a>, <a class="el" href="classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a">mlx::core::ArgPartition</a>, <a class="el" href="classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba">mlx::core::ArgReduce</a>, <a class="el" href="classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e">mlx::core::ArgSort</a>, <a class="el" href="classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc">mlx::core::AsType</a>, <a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965">mlx::core::BitwiseBinary</a>, <a class="el" href="classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f">mlx::core::Broadcast</a>, <a class="el" href="classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4">mlx::core::Ceil</a>, <a class="el" 
href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5">mlx::core::Cholesky</a>, <a class="el" href="classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d">mlx::core::Compiled</a>, <a class="el" href="classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1">mlx::core::Concatenate</a>, <a class="el" href="classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60">mlx::core::Conjugate</a>, <a class="el" href="classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec">mlx::core::Contiguous</a>, <a class="el" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">mlx::core::Copy</a>, <a class="el" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">mlx::core::Cos</a>, <a class="el" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">mlx::core::Cosh</a>, <a class="el" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">mlx::core::CustomTransforms</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031">mlx::core::distributed::AllGather</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a">mlx::core::distributed::AllReduce</a>, <a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93">mlx::core::distributed::Send</a>, <a class="el" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">mlx::core::Divide</a>, <a class="el" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">mlx::core::DivMod</a>, <a class="el" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f">mlx::core::Eigh</a>, <a class="el" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">mlx::core::Equal</a>, <a class="el" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">mlx::core::Erf</a>, <a 
class="el" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">mlx::core::ErfInv</a>, <a class="el" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">mlx::core::Exp</a>, <a class="el" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">mlx::core::Expm1</a>, <a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d">mlx::core::fast::Custom</a>, <a class="el" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1">mlx::core::FFT</a>, <a class="el" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10">mlx::core::Floor</a>, <a class="el" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95">mlx::core::Full</a>, <a class="el" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275">mlx::core::Gather</a>, <a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f">mlx::core::GatherQMM</a>, <a class="el" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0">mlx::core::Greater</a>, <a class="el" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d">mlx::core::GreaterEqual</a>, <a class="el" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c">mlx::core::Hadamard</a>, <a class="el" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3">mlx::core::Imag</a>, <a class="el" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2">mlx::core::Inverse</a>, <a class="el" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e">mlx::core::Less</a>, <a class="el" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480">mlx::core::LessEqual</a>, <a class="el" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71">mlx::core::Log1p</a>, <a class="el" 
href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49">mlx::core::Log</a>, <a class="el" href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78">mlx::core::LogAddExp</a>, <a class="el" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5">mlx::core::LogicalAnd</a>, <a class="el" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d">mlx::core::LogicalNot</a>, <a class="el" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3">mlx::core::LogicalOr</a>, <a class="el" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2">mlx::core::Matmul</a>, <a class="el" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3">mlx::core::Maximum</a>, <a class="el" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980">mlx::core::Minimum</a>, <a class="el" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf">mlx::core::Multiply</a>, <a class="el" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0">mlx::core::Negative</a>, <a class="el" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5">mlx::core::NotEqual</a>, <a class="el" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2">mlx::core::NumberOfElements</a>, <a class="el" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf">mlx::core::Pad</a>, <a class="el" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c">mlx::core::Partition</a>, <a class="el" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f">mlx::core::Power</a>, <a class="el" href="classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103">mlx::core::Primitive</a>, <a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763">mlx::core::QuantizedMatmul</a>, <a class="el" 
href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415">mlx::core::RandomBits</a>, <a class="el" href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6">mlx::core::Real</a>, <a class="el" href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38">mlx::core::Reduce</a>, <a class="el" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">mlx::core::Remainder</a>, <a class="el" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d">mlx::core::Reshape</a>, <a class="el" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd">mlx::core::Round</a>, <a class="el" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804">mlx::core::Scan</a>, <a class="el" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322">mlx::core::Scatter</a>, <a class="el" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">mlx::core::Select</a>, <a class="el" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85">mlx::core::Sigmoid</a>, <a class="el" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295">mlx::core::Sign</a>, <a class="el" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba">mlx::core::Sin</a>, <a class="el" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788">mlx::core::Sinh</a>, <a class="el" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2">mlx::core::Slice</a>, <a class="el" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3">mlx::core::SliceUpdate</a>, <a class="el" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19">mlx::core::Softmax</a>, <a class="el" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c">mlx::core::Sort</a>, <a class="el" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6">mlx::core::Split</a>, <a class="el" 
href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e">mlx::core::Sqrt</a>, <a class="el" href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5">mlx::core::Square</a>, <a class="el" href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0">mlx::core::StopGradient</a>, <a class="el" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098">mlx::core::Subtract</a>, <a class="el" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8">mlx::core::SVD</a>, <a class="el" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7">mlx::core::Tan</a>, <a class="el" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f">mlx::core::Tanh</a>, <a class="el" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe">mlx::core::Transpose</a>, <a class="el" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926">mlx::core::Uniform</a>, <a class="el" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121">mlx::core::View</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_vars.html b/docs/build/html/functions_vars.html
index 9be45cb9f..e0e70e22f 100644
--- a/docs/build/html/functions_vars.html
+++ b/docs/build/html/functions_vars.html
@@ -94,9 +94,9 @@ $(function(){ initResizable(false); });
 <li>adj_out_h&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html#a879cc9757f59605a87d936ec4156040d">mlx::steel::Conv2DGeneralJumpParams</a></li>
 <li>adj_out_hw&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html#aed0ffd63fbc85fd5d5c4cc7b43f68363">mlx::steel::Conv2DGeneralJumpParams</a></li>
 <li>adj_out_w&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html#ab971bf879079895189331fbeaf33c211">mlx::steel::Conv2DGeneralJumpParams</a></li>
-<li>alpha&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477">MLXFastAttentionParams</a></li>
+<li>alpha&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></li>
 <li>As_offset&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>Atile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>Atile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_vars_b.html b/docs/build/html/functions_vars_b.html
index 6880a2f55..cdb0253b6 100644
--- a/docs/build/html/functions_vars_b.html
+++ b/docs/build/html/functions_vars_b.html
@@ -87,37 +87,35 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_b" name="index_b"></a>- b -</h3><ul>
+<li>B&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f">mlx::steel::AttnParams</a></li>
 <li>b&#160;:&#160;<a class="el" href="unionbool4__or__uint.html#a47d77eac47598fe420f8f04a615f76ca">bool4_or_uint</a></li>
 <li>B_str_k&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>B_str_n&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>base_wh&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aca37adba6f148579eb1cd0a7800a5cfe">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6c46564bf1a96a02791dd432cc9c883e">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>base_ww&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32d020c6715d06f7de360877fcb7b6e4">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a230f0e581f9b8227b9ee68760b3b1503">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
-<li>batch_ndim&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f">mlx::steel::GEMMParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3">MLXFastAttentionParams</a></li>
+<li>batch_ndim&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f">mlx::steel::GEMMParams</a></li>
 <li>batch_size&#160;:&#160;<a class="el" href="struct_read_writer.html#a689f4890c1d2ce33fc6da7550beec735">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>batch_stride_a&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a76f55783a8e2ee203cf8507eee4b000c">mlx::steel::GEMMParams</a></li>
 <li>batch_stride_b&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a99b959b12d12da657648fa24d43e49e8">mlx::steel::GEMMParams</a></li>
 <li>batch_stride_c&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a77c065db228e9654a0a75a6ffe47c15a">mlx::steel::GEMMAddMMParams</a></li>
 <li>batch_stride_d&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#ad98006dd509a455864e6aa7c52743a41">mlx::steel::GEMMParams</a></li>
-<li>batch_stride_k&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b">MLXFastAttentionParams</a></li>
-<li>batch_stride_o&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7">MLXFastAttentionParams</a></li>
-<li>batch_stride_q&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1">MLXFastAttentionParams</a></li>
-<li>batch_stride_v&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21">MLXFastAttentionParams</a></li>
 <li>BCOLS&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3d6272d000f8ea79d9b3b5228bdca20f">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a29fbeeacdf5b6feeb74815ced255fa5a">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9229d22e0a02d96825eb5a57c8cb95ac">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac3b40db720055350bba59d614ea1dd79">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a86519729ef0561686bb86e474c95b93d">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a1843921cd67926002bb0dcccf3048eb6">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b6cf53a10514310d01f4d6459053a57">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 <li>BCOLS_PACKED&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>beta&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#ac0ce4d8a6014f8adb29fd0a0bb23139f">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></li>
-<li>bi&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>bi&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>biases&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>bits_&#160;:&#160;<a class="el" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">_MLX_BFloat16</a>, <a class="el" href="structmlx_1_1core_1_1___m_l_x___b_float16.html#aca48963f820065c3d8ecab24265ab3fc">mlx::core::_MLX_BFloat16</a>, <a class="el" href="structmlx_1_1core_1_1___m_l_x___float16.html#a5203fe52424fd32bce6eb7917dd9288b">mlx::core::_MLX_Float16</a></li>
-<li>bj&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>bj&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>blockM&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a7281520100658811076400060663903c">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a2ae8ce535d59cccf453381b4485a77f0">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>blockN&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a2fef17f9c9aa0bdf530ad3554fb0988b">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a60be87666006ba0bf88bc8e6902da42a">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>BROWS&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aba1e1c8012e4e50f0e9bcfb9486c1781">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ac070c6bd5be85b1ae805e18890db4fd4">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a343984fb74ec579a4404278dbbc7e7b5">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a10591ea957605a9c662f93d59ff3410d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae9b86b05b23153ea1abaeead456c491c">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a015a0c56de74a0c4d51953a7e94fbba8">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acc8140aae84694f62e6324dbb6a614a4">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 <li>Bs_offset&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>Btile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>Btile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>buf&#160;:&#160;<a class="el" href="struct_read_writer.html#a23bac3c96dd0265ddbee1f256be45ff5">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>buffer&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_data.html#a9a51e2d12ba505027cc0fca86bdd39ad">mlx::core::array::Data</a>, <a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb">mlx::core::metal::DeviceStream</a></li>
 <li>buffer_ops&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782">mlx::core::metal::DeviceStream</a></li>
 <li>buffers&#160;:&#160;<a class="el" href="struct_indices.html#ad705070a740579c07d109ae4f3d86e76">Indices&lt; IdxT, NIDX &gt;</a></li>
+<li>bytes_per_pack&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_vars_c.html b/docs/build/html/functions_vars_c.html
index 054191205..b39005e39 100644
--- a/docs/build/html/functions_vars_c.html
+++ b/docs/build/html/functions_vars_c.html
@@ -88,13 +88,14 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
 <li>C&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a0953063962ac3b5a027243289e72fbb2">MLXConvParams&lt; NDIM &gt;</a></li>
+<li>c&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></li>
 <li>capitalize_bool&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_print_formatter.html#adf49a949db36f0ba076842a6d675d79a">mlx::core::PrintFormatter</a></li>
 <li>col_contiguous&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html#ae24709026598d635e6b5c24a15f8a802">mlx::core::array::Flags</a></li>
 <li>cond&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a4ffd524d6a5bedd1a303b63bdde6701c">mlx::core::scheduler::StreamThread</a></li>
 <li>contiguous&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html#afd0ab11e7a486a2a8e50ee84b971ac8a">mlx::core::array::Flags</a></li>
 <li>cosine&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1_exec_dcst.html#a185023fc1e386cc8f233b79c49c1fd8a">pocketfft::detail::ExecDcst</a></li>
 <li>cpu&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_device.html#a69ee81924251dec96f1945c9d91506fd">mlx::core::Device</a></li>
-<li>Ctile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>Ctile&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_vars_d.html b/docs/build/html/functions_vars_d.html
index bb3dad61c..2dae62855 100644
--- a/docs/build/html/functions_vars_d.html
+++ b/docs/build/html/functions_vars_d.html
@@ -87,12 +87,14 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_d" name="index_d"></a>- d -</h3><ul>
+<li>D&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3">mlx::steel::AttnParams</a></li>
 <li>d&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1array_1_1_data.html#a25f52ac67912a49bb6e2b6715aa65311">mlx::core::array::Data</a>, <a class="el" href="classpocketfft_1_1detail_1_1cndarr.html#ac29c769aebb03f81fbcf16ba6e766af2">pocketfft::detail::cndarr&lt; T &gt;</a></li>
 <li>device&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_stream.html#a406b1b0162287a4162fab1f70e2ff3bb">mlx::core::Stream</a></li>
 <li>digits&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 <li>digits10&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
+<li>dim&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
 <li>do_read&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a13eb86acf6abe288c19645935a47d2ad">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a640155880483e1042ec5f647b9adaac6">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
-<li>dst&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>dst&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>dst_ld&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a07c85eab8cbf7b02c60df29cf32031ef">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a91192d512e7a18c2d16a139065000959">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ae71570942c7b0ad8e67c62662b336c4a">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a9e59da7e4436e61b2d3c3f982355910b">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a0ff5a6d503e0bbac4634030a75ab818d">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aae121ca6016fc6c7255027b3641f3a09">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ac18eeebea26cc6da434ead6eb4397350">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_vars_g.html b/docs/build/html/functions_vars_g.html
index fc9d6c8b2..d50509576 100644
--- a/docs/build/html/functions_vars_g.html
+++ b/docs/build/html/functions_vars_g.html
@@ -88,11 +88,10 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_g" name="index_g"></a>- g -</h3><ul>
 <li>gemm_k_iterations&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a8b50863e4e2d3481c154be6c3629bf51">mlx::steel::ImplicitGemmConv2DParams</a></li>
-<li>gemm_k_iterations_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2">MLXFastAttentionParams</a></li>
-<li>gemm_n_iterations_aligned&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803">MLXFastAttentionParams</a></li>
+<li>gemm_k_iterations_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998">mlx::steel::GEMMSpiltKParams</a></li>
 <li>gemm_params&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ab0724eb3ef52ee773b6607f6433b9f2c">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af59f9d356c4c3ec5627dc5a263d239d4">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#acc778b3c0b7ec38a43e8ea943df8704c">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
-<li>gemm_sv_m_block_iterations&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c">MLXFastAttentionParams</a></li>
 <li>gpu&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_device.html#a45ed081b56ae5d4ddd39c83a5d8a1616">mlx::core::Device</a></li>
+<li>gqa_factor&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841">mlx::steel::AttnParams</a></li>
 <li>grid&#160;:&#160;<a class="el" href="struct_read_writer.html#ac7a957f99873d3797081f5d620f3b2c8">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>group_step_cnt&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>group_steps&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
diff --git a/docs/build/html/functions_vars_h.html b/docs/build/html/functions_vars_h.html
index 11e09c959..5fbf70601 100644
--- a/docs/build/html/functions_vars_h.html
+++ b/docs/build/html/functions_vars_h.html
@@ -87,6 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_h" name="index_h"></a>- h -</h3><ul>
+<li>H&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7">mlx::steel::AttnParams</a></li>
 <li>has_mul_operand_mask&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#ad47223ee49b3cb7bf3746a2cec45f883">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a8db6f01f96a36b216acd801c34a96ef5">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>has_mul_output_mask&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a0edbf2dd6a6563e7afa6dab6b670615c">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a8eb06f6569e4042e24fee220b11fa10d">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>has_operand_mask&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#ab00784dff1512a7b0919fcb4cfa5d50e">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a6729d6e63e76a1e9c7c8e78d9aac4869">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
diff --git a/docs/build/html/functions_vars_i.html b/docs/build/html/functions_vars_i.html
index 919b61876..769a86d42 100644
--- a/docs/build/html/functions_vars_i.html
+++ b/docs/build/html/functions_vars_i.html
@@ -92,14 +92,13 @@ $(function(){ initResizable(false); });
 <li>imag&#160;:&#160;<a class="el" href="structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de">complex64_t</a></li>
 <li>in&#160;:&#160;<a class="el" href="struct_read_writer.html#ab6057215920138f28fd00f0e7ea8afa4">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>in_strides&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#ab25eade6573784985dbea1216f9068cf">MLXConvParams&lt; NDIM &gt;</a></li>
-<li>index&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac">mlx::core::Device</a>, <a class="el" href="structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626">mlx::core::Stream</a></li>
+<li>index&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a2cd3b616739b3d5b41e5b46ae335957d">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a>, <a class="el" href="structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac">mlx::core::Device</a>, <a class="el" href="structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626">mlx::core::Stream</a></li>
 <li>init&#160;:&#160;<a class="el" href="struct_cum_max.html#a16480052a2eeb4340e546838aab59cc4">CumMax&lt; U &gt;</a>, <a class="el" href="struct_cum_min.html#a8b67f739c620d0cc194b533190990ab9">CumMin&lt; U &gt;</a>, <a class="el" href="struct_cum_prod_3_01bool_01_4.html#ae7a8b0ba9e6898356b87b18766e76d2c">CumProd&lt; bool &gt;</a>, <a class="el" href="struct_less_than.html#abf97a6b0163048e4ba96460939dbd3a3">LessThan&lt; T &gt;</a></li>
-<li>inner_looper&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">looped_elem_to_loc&lt; dim, offset_t &gt;</a></li>
+<li>inner_looper&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></li>
 <li>inp_jump_c&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a78d30e843d65d1829623afb0b607f0a5">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>inp_jump_h&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a03685a4066cdb11ffb647408e2c5b122">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>inp_jump_w&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#acf168c72f4a86b72b8f5f386f07c9d8c">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>inv&#160;:&#160;<a class="el" href="struct_read_writer.html#a773fa8524515bdc2ff8b0e2060741813">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
-<li>INV_ALPHA&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644">MLXScaledDotProductAttentionParams</a></li>
 <li>iS&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a72e1c3b4da0f70622cf18036bbf97fe6">MLXConvParams&lt; NDIM &gt;</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_vars_k.html b/docs/build/html/functions_vars_k.html
index 4b708b4da..2da64e6b0 100644
--- a/docs/build/html/functions_vars_k.html
+++ b/docs/build/html/functions_vars_k.html
@@ -87,8 +87,10 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_k" name="index_k"></a>- k -</h3><ul>
-<li>K&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23">MLXFastAttentionParams</a></li>
-<li>kCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>K&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba">mlx::steel::ImplicitGemmConv2DParams</a></li>
+<li>K_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974">mlx::steel::AttnParams</a></li>
+<li>kCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901">mlx::steel::CShape&lt; R, C &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>kColsPerThread&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kdil&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a7611db8f1621c7e09fc685ed44073b14">MLXConvParams&lt; NDIM &gt;</a></li>
 <li>kElemCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
 <li>kElemRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></li>
@@ -97,11 +99,12 @@ $(function(){ initResizable(false); });
 <li>kFragCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kFragRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kFragSize&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
+<li>kL&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63">mlx::steel::AttnParams</a></li>
 <li>kNumFrags&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
-<li>kRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>kRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993">mlx::steel::CShape&lt; R, C &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>kRowsPerThread&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kTileCols&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>kTileRows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
-<li>KV_TILES&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0">MLXScaledDotProductAttentionParams</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_vars_l.html b/docs/build/html/functions_vars_l.html
index d129cbe1a..84f6d68d0 100644
--- a/docs/build/html/functions_vars_l.html
+++ b/docs/build/html/functions_vars_l.html
@@ -87,15 +87,11 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_l" name="index_l"></a>- l -</h3><ul>
+<li>layout&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a></li>
 <li>lda&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#afec42b532ffcad32bbffd494526bef03">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a6fac3c4a7c35af7b46b53f9662f882c6">mlx::steel::GEMMSpiltKParams</a></li>
 <li>ldb&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a6032a081ab707c14b5f28069faa7cf62">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a7f6f511854ccc98fa573bb560776ebed">mlx::steel::GEMMSpiltKParams</a></li>
 <li>ldc&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a801e2245a36632160975a784b762a4e6">mlx::steel::GEMMAddMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a888730efa5c5c8ae7ed771c3084d583c">mlx::steel::GEMMSpiltKParams</a></li>
 <li>ldd&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a6e8ae14e3f97c499ad9c39358a1855ab">mlx::steel::GEMMParams</a></li>
-<li>ldk&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">MLXFastAttentionParams</a></li>
-<li>ldo&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c">MLXFastAttentionParams</a></li>
-<li>ldq&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58">MLXFastAttentionParams</a></li>
-<li>lds&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a">MLXFastAttentionParams</a></li>
-<li>ldv&#160;:&#160;<a class="el" href="struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b">MLXFastAttentionParams</a></li>
 <li>loc&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html#a027b29e06d5cb467d961c019699514b1">mlx::core::ContiguousIterator&lt; StrideT &gt;</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_vars_m.html b/docs/build/html/functions_vars_m.html
index a0c651c54..bef8e35ce 100644
--- a/docs/build/html/functions_vars_m.html
+++ b/docs/build/html/functions_vars_m.html
@@ -87,7 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_m" name="index_m"></a>- m -</h3><ul>
-<li>M&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8">MLXFastAttentionParams</a></li>
+<li>M&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>mask_h&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0b892c1a7edb9ed20c076d8945855c19">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>mask_w&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a19ddba7259c3c2c02ed90f3f635557be">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>max&#160;:&#160;<a class="el" href="struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8">Limits&lt; U &gt;</a>, <a class="el" href="struct_limits_3_01bfloat16__t_01_4.html#a0ead3618da6718629ea9fa4670b5005f">Limits&lt; bfloat16_t &gt;</a>, <a class="el" href="struct_limits_3_01bool_01_4.html#acbd2132145888d51220558a101ffcff4">Limits&lt; bool &gt;</a>, <a class="el" href="struct_limits_3_01complex64__t_01_4.html#ac01c274b224b90f5210b675a484f4607">Limits&lt; complex64_t &gt;</a>, <a class="el" href="struct_limits_3_01float_01_4.html#aba172b22b388190aa3969ef16885d8a6">Limits&lt; float &gt;</a>, <a class="el" href="struct_limits_3_01half_01_4.html#a4f9515dbf2a622074f121bea39a7b175">Limits&lt; half &gt;</a>, <a class="el" href="struct_limits_3_01int16__t_01_4.html#a12d64c398ca7609b7c906f3cf1a6f678">Limits&lt; int16_t &gt;</a>, <a class="el" href="struct_limits_3_01int32__t_01_4.html#af756344b31e84222dd73d3445dcd5640">Limits&lt; int32_t &gt;</a>, <a class="el" href="struct_limits_3_01int64__t_01_4.html#ac9c420604c0f3d237ddfb2b8a2439224">Limits&lt; int64_t &gt;</a>, <a class="el" href="struct_limits_3_01int8__t_01_4.html#a96fed01fa9249226be69760652643289">Limits&lt; int8_t &gt;</a>, <a class="el" href="struct_limits_3_01uint16__t_01_4.html#a228b33556ba4cb7e6137ab6258628488">Limits&lt; uint16_t &gt;</a>, <a class="el" href="struct_limits_3_01uint32__t_01_4.html#a91fa8f7214ec936976a8324c7431c651">Limits&lt; uint32_t &gt;</a>, <a class="el" href="struct_limits_3_01uint64__t_01_4.html#aa8c2257881a4e1fa8596fa07dba5e107">Limits&lt; uint64_t &gt;</a>, <a class="el" href="struct_limits_3_01uint8__t_01_4.html#a1570fb640e2e41f96776db5ca08d500c">Limits&lt; uint8_t &gt;</a></li>
diff --git a/docs/build/html/functions_vars_n.html b/docs/build/html/functions_vars_n.html
index 06ab3cb3d..4900d217a 100644
--- a/docs/build/html/functions_vars_n.html
+++ b/docs/build/html/functions_vars_n.html
@@ -87,17 +87,19 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_n" name="index_n"></a>- n -</h3><ul>
-<li>N&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932">MLXConvParams&lt; NDIM &gt;</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167">MLXFastAttentionParams</a></li>
+<li>N&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932">MLXConvParams&lt; NDIM &gt;</a></li>
 <li>n&#160;:&#160;<a class="el" href="struct_read_writer.html#a655346c9ebfc33a69da3f1c1d4238dfb">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>n_channels&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_channel_helper.html#aa476bd0fcb38494c268547fc9820fc0a">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a06c2fb9c93660e8f6916228cd77f9494">mlx::steel::ChannelHelper&lt; 1 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#ac66ff37bc2cf78d96667192a6cca73b5">mlx::steel::ChannelHelper&lt; 2 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a071c015713b7bab09930661165517eff">mlx::steel::ChannelHelper&lt; 3 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#a167b00a84adf93b60e3d7a943d5eb977">mlx::steel::ChannelHelper&lt; 4 &gt;</a></li>
-<li>N_KV_HEADS&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7">MLXScaledDotProductAttentionParams</a></li>
 <li>N_PER_BLOCK&#160;:&#160;<a class="el" href="struct_kernel_merge_sort.html#a959aaf5bfb70796a525fed318f7ae8ab">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a>, <a class="el" href="struct_kernel_multi_block_merge_sort.html#ae5113ca5852d11999ae932439af95a5c">KernelMultiBlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></li>
-<li>N_Q_HEADS&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177">MLXScaledDotProductAttentionParams</a></li>
 <li>n_reads&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
-<li>n_rows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
+<li>n_rows&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding 
&gt;</a></li>
 <li>names&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_node_namer.html#a57823f9a2cdc60b2f06f857b36019277">mlx::core::NodeNamer</a></li>
 <li>ndim&#160;:&#160;<a class="el" href="struct_indices.html#a7dec359e91d0eb2b64e5461b54308313">Indices&lt; IdxT, NIDX &gt;</a>, <a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051">mlx::core::fast::CustomKernelShapeInfo</a></li>
 <li>needs_tgp_reduction&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#ae8113fddf6fb637acfd12efd978b704c">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a67be7ec69c3791f02e97ccdb00ae0e03">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
+<li>NK&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e">mlx::steel::AttnParams</a></li>
+<li>NK_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58">mlx::steel::AttnParams</a></li>
+<li>NQ&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1">mlx::steel::AttnParams</a></li>
+<li>NQ_aligned&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">mlx::steel::AttnParams</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_vars_o.html b/docs/build/html/functions_vars_o.html
index ad9ea0ff5..ef80a368d 100644
--- a/docs/build/html/functions_vars_o.html
+++ b/docs/build/html/functions_vars_o.html
@@ -88,7 +88,8 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_o" name="index_o"></a>- o -</h3><ul>
 <li>O&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#ad55ff586d30072d8154865f9dfe92d97">MLXConvParams&lt; NDIM &gt;</a></li>
-<li>offset&#160;:&#160;<a class="el" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, <a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a7aebc0b0656e3a55d0dbca27a57d600e">looped_elem_to_loc&lt; 1, offset_t &gt;</a></li>
+<li>O_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4">mlx::steel::AttnParams</a></li>
+<li>offset&#160;:&#160;<a class="el" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af792b1fd4e8286f97b9b863c127a2d9a">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a3a18944c158e2747a6ddebb420299a3b">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></li>
 <li>op&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1_default_contiguous_reduce.html#a1928f07db988715cc177999e386f4830">mlx::core::DefaultContiguousReduce&lt; T, U, Op &gt;</a>, <a class="el" href="structmlx_1_1core_1_1_default_strided_reduce.html#ac871f55a7ddd205574974cb4492a240b">mlx::core::DefaultStridedReduce&lt; T, U, Op &gt;</a></li>
 <li>ortho&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1_exec_dcst.html#aea17551a49acaca5e7808dc181d38b7f">pocketfft::detail::ExecDcst</a></li>
 <li>oS&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a19ccb9fecfccdc18b6a7f0cc43adbc6e">MLXConvParams&lt; NDIM &gt;</a></li>
diff --git a/docs/build/html/functions_vars_q.html b/docs/build/html/functions_vars_q.html
index a996ba9d3..e456a1315 100644
--- a/docs/build/html/functions_vars_q.html
+++ b/docs/build/html/functions_vars_q.html
@@ -88,7 +88,8 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_q" name="index_q"></a>- q -</h3><ul>
 <li>q&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#adf608e22d0c0397217472408aab52631">mlx::core::scheduler::StreamThread</a></li>
-<li>QUERY_SEQUENCE_LENGTH&#160;:&#160;<a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c">MLXScaledDotProductAttentionParams</a></li>
+<li>Q_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563">mlx::steel::AttnParams</a></li>
+<li>qL&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f">mlx::steel::AttnParams</a></li>
 <li>queue&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d">mlx::core::metal::DeviceStream</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/functions_vars_r.html b/docs/build/html/functions_vars_r.html
index 436c03512..fab2dc13d 100644
--- a/docs/build/html/functions_vars_r.html
+++ b/docs/build/html/functions_vars_r.html
@@ -87,7 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_r" name="index_r"></a>- r -</h3><ul>
-<li>r&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692">pocketfft::detail::cmplx&lt; T &gt;</a></li>
+<li>r&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a>, <a class="el" href="structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692">pocketfft::detail::cmplx&lt; T &gt;</a></li>
 <li>r2h&#160;:&#160;<a class="el" href="structpocketfft_1_1detail_1_1_exec_r2_r.html#a925b398c8e1868614ce9eaf381d02b7e">pocketfft::detail::ExecR2R</a></li>
 <li>radix&#160;:&#160;<a class="el" href="structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aaefa8c2cadd11ac7e22f7b2c5edbd1cd">metal::_numeric_limits_impl&lt; bfloat16_t &gt;</a></li>
 <li>read_ih&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a35a010c3819df6667339d37a5e8f5b43">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a82dd8230e1f37500f1a562177c3ad692">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6623e33d946b41d01c69ec793706d789">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
diff --git a/docs/build/html/functions_vars_s.html b/docs/build/html/functions_vars_s.html
index 7a93664b4..b6a7ad008 100644
--- a/docs/build/html/functions_vars_s.html
+++ b/docs/build/html/functions_vars_s.html
@@ -87,9 +87,9 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_s" name="index_s"></a>- s -</h3><ul>
-<li>scale&#160;:&#160;<a class="el" href="struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b">ScaleOp&lt; OutT, InT &gt;</a></li>
+<li>scale&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826">mlx::steel::AttnParams</a>, <a class="el" href="struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b">ScaleOp&lt; OutT, InT &gt;</a>, <a class="el" href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">TransformScale&lt; T &gt;</a></li>
 <li>scales&#160;:&#160;<a class="el" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
-<li>shape&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">mlx::core::fast::CustomKernelShapeInfo</a>, <a class="el" href="structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae">mlx::core::ReductionPlan</a></li>
+<li>shape&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a">mlx::core::fast::CustomKernelShapeInfo</a>, <a class="el" href="structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae">mlx::core::ReductionPlan</a>, <a class="el" href="structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a></li>
 <li>shapes&#160;:&#160;<a class="el" href="struct_indices.html#a5ab170f1a77636180889ddfffd4f7d2f">Indices&lt; IdxT, NIDX &gt;</a></li>
 <li>shp&#160;:&#160;<a class="el" href="classpocketfft_1_1detail_1_1arr__info.html#a2467e9e01de1ba4d7cd28c1af783da8d">pocketfft::detail::arr_info</a></li>
 <li>sm&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
@@ -97,8 +97,8 @@ $(function(){ initResizable(false); });
 <li>split_k_partition_size&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a9f5a67b2343645b570e109c3837d4042">mlx::steel::GEMMSpiltKParams</a></li>
 <li>split_k_partition_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a12144ce89d404812cd862611d770b9fb">mlx::steel::GEMMSpiltKParams</a></li>
 <li>split_k_partitions&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#ae06c27116905d4ff3b9b436e588a93fd">mlx::steel::GEMMSpiltKParams</a></li>
-<li>src&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
-<li>src_ld&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>src&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, 
<a class="el" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>src_ld&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>start_row&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a220e033b689c8d6a6f319dae02b38334">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
 <li>stop&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a456ad1c0c9e731833a2f8411c4ed51aa">mlx::core::scheduler::StreamThread</a></li>
 <li>str&#160;:&#160;<a class="el" href="struct_m_l_x_conv_params.html#a862191e8ab1bc8a47aa1396b36d46058">MLXConvParams&lt; NDIM &gt;</a>, <a class="el" href="classpocketfft_1_1detail_1_1arr__info.html#abe1f7b92501b4e0e5a38fd26294ac5a4">pocketfft::detail::arr_info</a></li>
@@ -106,7 +106,7 @@ $(function(){ initResizable(false); });
 <li>strided_device_idx&#160;:&#160;<a class="el" href="struct_read_writer.html#a4c0b12484aac4fd6759d67c190391989">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>strided_shared_idx&#160;:&#160;<a class="el" href="struct_read_writer.html#ace40adb02cfb33d89c98353327c251fc">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>strides&#160;:&#160;<a class="el" href="struct_indices.html#a7f73d7652f0f751e6a06c2663e329a4a">Indices&lt; IdxT, NIDX &gt;</a>, <a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2">mlx::core::fast::CustomKernelShapeInfo</a>, <a class="el" href="structmlx_1_1core_1_1_reduction_plan.html#a9bf7cae845ab633247c1811613ece8bd">mlx::core::ReductionPlan</a></li>
-<li>swizzle_log&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2">MLXFastAttentionParams</a></li>
+<li>swizzle_log&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840">mlx::steel::ImplicitGemmConv2DParams</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_vars_t.html b/docs/build/html/functions_vars_t.html
index 27dcdda3d..159559b04 100644
--- a/docs/build/html/functions_vars_t.html
+++ b/docs/build/html/functions_vars_t.html
@@ -96,15 +96,15 @@ $(function(){ initResizable(false); });
 <li>tgp_padding_b&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
 <li>tgp_size&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></li>
 <li>thread&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a449de02bf2ac80d8fe2f208fa7eac359">mlx::core::scheduler::StreamThread</a></li>
-<li>thread_idx&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>thread_idx&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding 
&gt;</a>, <a class="el" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>threads_per_tg&#160;:&#160;<a class="el" href="struct_read_writer.html#a64c58e358da22358df3075448ea23893">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></li>
 <li>threadsM&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a1dd943fcbf5e7be435fc36bed589a641">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#a4a53e73a581aa8881b1f86ce653519e6">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
 <li>threadsN&#160;:&#160;<a class="el" href="struct_g_e_m_v_kernel.html#a47bfab7d21dd18760d3e0937ad36b19d">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a>, <a class="el" href="struct_g_e_m_v_t_kernel.html#ade6f15a9744616de9dd71498ad7e758d">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></li>
-<li>tile_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
+<li>tile_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></li>
 <li>tile_stride_a&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>tile_stride_b&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
-<li>tiles_m&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad">MLXFastAttentionParams</a></li>
-<li>tiles_n&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d">mlx::steel::ImplicitGemmConv2DParams</a>, <a class="el" href="struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029">MLXFastAttentionParams</a></li>
+<li>tiles_m&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24">mlx::steel::ImplicitGemmConv2DParams</a></li>
+<li>tiles_n&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed">mlx::steel::GEMMParams</a>, <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6">mlx::steel::GEMMSpiltKParams</a>, <a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d">mlx::steel::ImplicitGemmConv2DParams</a></li>
 <li>TM&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>TM_stride&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
 <li>TN&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></li>
diff --git a/docs/build/html/functions_vars_v.html b/docs/build/html/functions_vars_v.html
index d5f537663..b73d12a66 100644
--- a/docs/build/html/functions_vars_v.html
+++ b/docs/build/html/functions_vars_v.html
@@ -87,11 +87,12 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the classes they belong to:</div>
 
 <h3><a id="index_v" name="index_v"></a>- v -</h3><ul>
-<li>v&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></li>
+<li>v&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></li>
+<li>V_strides&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c">mlx::steel::AttnParams</a></li>
 <li>val&#160;:&#160;<a class="el" href="structmlx__atomic.html#a6f6651b8dd8149917c50cd99b13c6747">mlx_atomic&lt; T, typename &gt;</a>, <a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html#a8dbf729fcd8c4a16e41b546c7405543d">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a>, <a class="el" href="structpocketfft_1_1detail_1_1_v_l_e_n.html#ab1fdc340dedde723e636746c828a4534">pocketfft::detail::VLEN&lt; T &gt;</a></li>
-<li>val_frags&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
+<li>val_frags&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></li>
 <li>value&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1integral__constant.html#a4efa69cb3fd42ac0dcad46578600d637">mlx::steel::integral_constant&lt; T, v &gt;</a></li>
-<li>vec_size&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75">mlx::steel::ChannelHelper&lt; 1 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af">mlx::steel::ChannelHelper&lt; 2 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f">mlx::steel::ChannelHelper&lt; 3 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca">mlx::steel::ChannelHelper&lt; 4 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
+<li>vec_size&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75">mlx::steel::ChannelHelper&lt; 1 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af">mlx::steel::ChannelHelper&lt; 2 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f">mlx::steel::ChannelHelper&lt; 3 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca">mlx::steel::ChannelHelper&lt; 4 &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a>, <a class="el" 
href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/functions_w.html b/docs/build/html/functions_w.html
index ca1390cb0..9e11326bf 100644
--- a/docs/build/html/functions_w.html
+++ b/docs/build/html/functions_w.html
@@ -88,6 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_w" name="index_w"></a>- w -</h3><ul>
 <li>wait()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1array.html#a648592006f1c92287734ba2428eaa45e">mlx::core::array</a>, <a class="el" href="classmlx_1_1core_1_1_event.html#a634afd918e6ed847f354531ba9f48252">mlx::core::Event</a>, <a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html#af503189cc9247047fbdfc3ebf1daacc1">pocketfft::detail::threading::latch</a></li>
+<li>wait_for_fence()&#160;:&#160;<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088">mlx::core::metal::CommandEncoder</a></li>
 <li>wait_for_one()&#160;:&#160;<a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a01c574bb388f10d67aaaaa541894d807">mlx::core::scheduler::Scheduler</a></li>
 <li>weight_base&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_base_info.html#a1d88677c4617f4bdae157e40a64a407b">mlx::steel::Conv2DGeneralBaseInfo</a></li>
 <li>weight_h&#160;:&#160;<a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a397412909eb955babc935a35d97c3fd4">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3be4815d4090cb27ebe2f9bad1a39e95">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a366c3cee4ed1165545287c8d5ce49445">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a>, <a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a5997fd8ef249e4cd3df7dad7b251d8d5">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></li>
diff --git a/docs/build/html/gather_8h.html b/docs/build/html/gather_8h.html
index 3782444c0..c895d0983 100644
--- a/docs/build/html/gather_8h.html
+++ b/docs/build/html/gather_8h.html
@@ -99,18 +99,18 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:abdec470e1af0109563ddae3e85e6526c" id="r_abdec470e1af0109563ddae3e85e6526c"><td class="memTemplParams" colspan="2">template&lt;typename T , typename IdxT , int NIDX, int IDX_NDIM&gt; </td></tr>
-<tr class="memitem:abdec470e1af0109563ddae3e85e6526c"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#abdec470e1af0109563ddae3e85e6526c">gather_impl</a> (const device T *src, device T *out, const constant int *src_shape, const constant size_t *src_strides, const constant size_t &amp;src_ndim, const constant int *slice_sizes, const constant int *axes, const thread <a class="el" href="struct_indices.html">Indices</a>&lt; IdxT, NIDX &gt; &amp;indices, uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:abdec470e1af0109563ddae3e85e6526c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a767d7c5be6f2f649101f581449af5599" id="r_a767d7c5be6f2f649101f581449af5599"><td class="memTemplParams" colspan="2">template&lt;typename T , typename IdxT , int NIDX, int IDX_NDIM, typename LocT &gt; </td></tr>
+<tr class="memitem:a767d7c5be6f2f649101f581449af5599"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a767d7c5be6f2f649101f581449af5599">gather_impl</a> (const device T *src, device T *out, const constant int *src_shape, const constant size_t *src_strides, const constant size_t &amp;src_ndim, const constant int *slice_sizes, const constant int *axes, const thread <a class="el" href="struct_indices.html">Indices</a>&lt; IdxT, NIDX &gt; &amp;indices, uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:a767d7c5be6f2f649101f581449af5599"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="abdec470e1af0109563ddae3e85e6526c" name="abdec470e1af0109563ddae3e85e6526c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#abdec470e1af0109563ddae3e85e6526c">&#9670;&#160;</a></span>gather_impl()</h2>
+<a id="a767d7c5be6f2f649101f581449af5599" name="a767d7c5be6f2f649101f581449af5599"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a767d7c5be6f2f649101f581449af5599">&#9670;&#160;</a></span>gather_impl()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename IdxT , int NIDX, int IDX_NDIM&gt; </div>
+template&lt;typename T , typename IdxT , int NIDX, int IDX_NDIM, typename LocT &gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">METAL_FUNC void gather_impl </td>
diff --git a/docs/build/html/gather_8h_source.html b/docs/build/html/gather_8h_source.html
index b172497cd..a677c1f52 100644
--- a/docs/build/html/gather_8h_source.html
+++ b/docs/build/html/gather_8h_source.html
@@ -97,9 +97,9 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
 <div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &quot;<a class="code" href="kernels_2indexing_8h.html">mlx/backend/metal/kernels/indexing.h</a>&quot;</span></div>
 <div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
-<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> IdxT, <span class="keywordtype">int</span> NIDX, <span class="keywordtype">int</span> IDX_NDIM&gt;</div>
+<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> IdxT, <span class="keywordtype">int</span> NIDX, <span class="keywordtype">int</span> IDX_NDIM, <span class="keyword">typename</span> LocT&gt;</div>
 <div class="foldopen" id="foldopen00008" data-start="{" data-end="}">
-<div class="line"><a id="l00008" name="l00008"></a><span class="lineno"><a class="line" href="gather_8h.html#abdec470e1af0109563ddae3e85e6526c">    8</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="gather_8h.html#abdec470e1af0109563ddae3e85e6526c">gather_impl</a>(</div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno"><a class="line" href="gather_8h.html#a767d7c5be6f2f649101f581449af5599">    8</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="gather_8h.html#a767d7c5be6f2f649101f581449af5599">gather_impl</a>(</div>
 <div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span>    <span class="keyword">const</span> device T* src [[buffer(0)]],</div>
 <div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span>    device T* out [[buffer(1)]],</div>
 <div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* src_shape [[buffer(2)]],</div>
@@ -110,44 +110,44 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>    <span class="keyword">const</span> thread <a class="code hl_struct" href="struct_indices.html">Indices&lt;IdxT, NIDX&gt;</a>&amp; indices,</div>
 <div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    uint3 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>  <span class="keywordtype">size_t</span> src_idx = 0;</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>  LocT src_idx = 0;</div>
 <div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; NIDX; ++i) {</div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>    <span class="keywordtype">size_t</span> idx_loc;</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>    LocT idx_loc;</div>
 <div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>    <span class="keywordflow">if</span> (IDX_NDIM == 0) {</div>
 <div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>      idx_loc = 0;</div>
 <div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>    } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (IDX_NDIM == 1) {</div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>      idx_loc = index.x * indices.strides[indices.ndim * i];</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>      idx_loc = index.x * <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(indices.strides[indices.ndim * i]);</div>
 <div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>      idx_loc = index.x * indices.strides[indices.ndim * i];</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>      idx_loc = index.x * <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(indices.strides[indices.ndim * i]);</div>
 <div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>      idx_loc += indices.row_contiguous[i]</div>
 <div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>          ? index.y</div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>          : <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>          : <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, LocT&gt;</a>(</div>
 <div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>                index.y,</div>
 <div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>                &amp;indices.shapes[indices.ndim * i + 1],</div>
 <div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>                &amp;indices.strides[indices.ndim * i + 1],</div>
 <div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>                indices.ndim - 1);</div>
 <div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>    }</div>
 <div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>    <span class="keyword">auto</span> ax = axes[i];</div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>    <span class="keyword">auto</span> idx_val = <a class="code hl_function" href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">offset_neg_idx</a>(indices.buffers[i][idx_loc], src_shape[ax]);</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>    src_idx += idx_val * src_strides[ax];</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>    <span class="keyword">auto</span> idx_val = <a class="code hl_function" href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">offset_neg_idx</a>(indices.buffers[i][idx_loc], src_shape[ax]);</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>    src_idx += <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(idx_val) * <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(src_strides[ax]);</div>
 <div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  }</div>
 <div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span> </div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  <span class="keyword">auto</span> src_offset = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(index.z, slice_sizes, src_strides, src_ndim);</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span> </div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>  <span class="keywordtype">size_t</span> out_idx = index.z;</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  <span class="keywordflow">if</span> (IDX_NDIM == 1) {</div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>    out_idx += <span class="keyword">static_cast&lt;</span><span class="keywordtype">size_t</span><span class="keyword">&gt;</span>(grid_dim.z) * index.x;</div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (IDX_NDIM &gt;= 2) {</div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>    out_idx +=</div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>        grid_dim.z * (index.x * <span class="keyword">static_cast&lt;</span><span class="keywordtype">size_t</span><span class="keyword">&gt;</span>(grid_dim.y) + index.y);</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  <span class="keyword">auto</span> src_offset =</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>      <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, LocT&gt;</a>(index.z, slice_sizes, src_strides, src_ndim);</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span> </div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  LocT out_idx = index.z;</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>  <span class="keywordflow">if</span> (IDX_NDIM == 1) {</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>    out_idx += <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(grid_dim.z) * index.x;</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>  } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (IDX_NDIM &gt;= 2) {</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>    out_idx += grid_dim.z * (index.x * <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(grid_dim.y) + index.y);</div>
 <div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>  }</div>
 <div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>  out[out_idx] = src[src_offset + src_idx];</div>
 <div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
-<div class="ttc" id="agather_8h_html_abdec470e1af0109563ddae3e85e6526c"><div class="ttname"><a href="gather_8h.html#abdec470e1af0109563ddae3e85e6526c">gather_impl</a></div><div class="ttdeci">METAL_FUNC void gather_impl(const device T *src, device T *out, const constant int *src_shape, const constant size_t *src_strides, const constant size_t &amp;src_ndim, const constant int *slice_sizes, const constant int *axes, const thread Indices&lt; IdxT, NIDX &gt; &amp;indices, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> gather.h:8</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
+<div class="ttc" id="agather_8h_html_a767d7c5be6f2f649101f581449af5599"><div class="ttname"><a href="gather_8h.html#a767d7c5be6f2f649101f581449af5599">gather_impl</a></div><div class="ttdeci">METAL_FUNC void gather_impl(const device T *src, device T *out, const constant int *src_shape, const constant size_t *src_strides, const constant size_t &amp;src_ndim, const constant int *slice_sizes, const constant int *axes, const thread Indices&lt; IdxT, NIDX &gt; &amp;indices, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> gather.h:8</div></div>
 <div class="ttc" id="akernels_2indexing_8h_html"><div class="ttname"><a href="kernels_2indexing_8h.html">indexing.h</a></div></div>
-<div class="ttc" id="akernels_2indexing_8h_html_ab41167dc537c06fbdb4df100972393df"><div class="ttname"><a href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">offset_neg_idx</a></div><div class="ttdeci">METAL_FUNC size_t offset_neg_idx(IdxT idx, size_t size)</div><div class="ttdef"><b>Definition</b> indexing.h:17</div></div>
+<div class="ttc" id="akernels_2indexing_8h_html_a58a65ea6215999cd4ccb4fe757cc2dc8"><div class="ttname"><a href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">offset_neg_idx</a></div><div class="ttdeci">METAL_FUNC size_t offset_neg_idx(IdxT idx, int size)</div><div class="ttdef"><b>Definition</b> indexing.h:17</div></div>
 <div class="ttc" id="astruct_indices_html"><div class="ttname"><a href="struct_indices.html">Indices</a></div><div class="ttdef"><b>Definition</b> indexing.h:8</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/gemm_2loader_8h_source.html b/docs/build/html/gemm_2loader_8h_source.html
index fda1087bf..2887eb0d5 100644
--- a/docs/build/html/gemm_2loader_8h_source.html
+++ b/docs/build/html/gemm_2loader_8h_source.html
@@ -115,25 +115,25 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>    <span class="keywordtype">short</span> TROWS = tgp_size / TCOLS&gt;</div>
 <div class="foldopen" id="foldopen00025" data-start="{" data-end="};">
 <div class="line"><a id="l00025" name="l00025"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html">   25</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a> {</div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">   26</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">n_rows</a> = (BROWS + TROWS - 1) / TROWS;</div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">   27</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a> = n_reads;</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">n_rows</a> = (BROWS + TROWS - 1) / TROWS;</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a> = n_reads;</div>
 <div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
 <div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  <span class="comment">// Leading dimension for src</span></div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">   30</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>;</div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">   31</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a>;</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>;</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a>;</div>
 <div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span> </div>
 <div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  <span class="comment">// Thread location indices</span></div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">   34</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a>;</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">   35</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a>;</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">   36</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>;</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a>;</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a>;</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>;</div>
 <div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span> </div>
 <div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <span class="comment">// threadgroup and device memory</span></div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec">   39</a></span>  threadgroup T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec">dst</a>;</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">   40</a></span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">src</a>;</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  threadgroup T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">dst</a>;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>;</div>
 <div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span> </div>
 <div class="foldopen" id="foldopen00042" data-start="{" data-end="};">
 <div class="line"><a id="l00042" name="l00042"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">   42</a></span>  <span class="keyword">struct </span><span class="keyword">alignas</span>(alignment * sizeof(T)) <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">ReadVector</a> {</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347">   43</a></span>    uint8_t <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347">v</a>[<span class="keyword">sizeof</span>(T) * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>];</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>    uint8_t <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">v</a>[<span class="keyword">sizeof</span>(T) * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>];</div>
 <div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  };</div>
 </div>
 <div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
@@ -151,7 +151,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a> / TCOLS),</div>
 <div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a> * (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a> % TCOLS)),</div>
 <div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>        dst(dst_ + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a> * dst_ld + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>),</div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">src</a>(src_ + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a> + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>) {}</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>        <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>(src_ + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a> + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a>) {}</div>
 </div>
 <div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span> </div>
 <div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>  <span class="comment">/* Apply operation to threadgroup without bound checking */</span></div>
@@ -174,7 +174,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; BROWS; i += TROWS) {</div>
 <div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>      *((threadgroup <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">ReadVector</a>*)(&amp;dst[i * dst_ld])) =</div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>          *((<span class="keyword">const</span> device <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">ReadVector</a>*)(&amp;<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">src</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>]));</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>          *((<span class="keyword">const</span> device <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">ReadVector</a>*)(&amp;<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a>]));</div>
 <div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    }</div>
 <div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>  }</div>
 </div>
@@ -211,7 +211,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>      <span class="comment">// Read valid indices into tmp_val</span></div>
 <div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>; j++) {</div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>        tmp_val[j] = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">src</a>[(tmp_idx[j] ? i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a> + j : 0)];</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>        tmp_val[j] = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a>[(tmp_idx[j] ? i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a> + j : 0)];</div>
 <div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>      }</div>
 <div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span> </div>
 <div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>      <span class="comment">// Zero out uneeded values</span></div>
@@ -232,7 +232,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  <span class="comment">/* Iteration helper */</span></div>
 <div class="foldopen" id="foldopen00131" data-start="{" data-end="}">
 <div class="line"><a id="l00131" name="l00131"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">  131</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">next</a>() {</div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">src</a> += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a>;</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a> += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a>;</div>
 <div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  }</div>
 </div>
 <div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>};</div>
@@ -246,10 +246,9 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6"><div class="ttname"><a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define STEEL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> defines.h:4</div></div>
 <div class="ttc" id="asteel_2defines_8h_html_a90b91c866313ffa46eff6d9cc944ad2b"><div class="ttname"><a href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a></div><div class="ttdeci">#define STEEL_CONST</div><div class="ttdef"><b>Definition</b> defines.h:3</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_1_1_read_vector_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">mlx::steel::BlockLoader::ReadVector</a></div><div class="ttdef"><b>Definition</b> loader.h:42</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_1_1_read_vector_html_afbef88bfb901a71e8423de911b7c7347"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347">mlx::steel::BlockLoader::ReadVector::v</a></div><div class="ttdeci">uint8_t v[sizeof(T) *vec_size]</div><div class="ttdef"><b>Definition</b> loader.h:43</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_1_1_read_vector_html_a20963f7191251defca48bf8a843d019d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">mlx::steel::BlockLoader::ReadVector::v</a></div><div class="ttdeci">uint8_t v[sizeof(T) *vec_size]</div><div class="ttdef"><b>Definition</b> loader.h:43</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a></div><div class="ttdef"><b>Definition</b> loader.h:25</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a064e2cc77e0b1cf0f8027929e031775b"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">mlx::steel::BlockLoader::thread_idx</a></div><div class="ttdeci">const short thread_idx</div><div class="ttdef"><b>Definition</b> loader.h:34</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a13004952d0bf2030b95acb621a3779dd"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">mlx::steel::BlockLoader::src</a></div><div class="ttdeci">const device T * src</div><div class="ttdef"><b>Definition</b> loader.h:40</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a37aca066e63dff238865b5923a2d4335"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">mlx::steel::BlockLoader::BlockLoader</a></div><div class="ttdeci">METAL_FUNC BlockLoader(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> loader.h:47</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a58bdf9b9c81962733e22ecdeae28c092"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">mlx::steel::BlockLoader::vec_size</a></div><div class="ttdeci">STEEL_CONST short vec_size</div><div class="ttdef"><b>Definition</b> loader.h:27</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_a6af21428f0e7c17b48ddedf4dd20a1e8"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">mlx::steel::BlockLoader::next</a></div><div class="ttdeci">METAL_FUNC void next()</div><div class="ttdef"><b>Definition</b> loader.h:131</div></div>
@@ -260,8 +259,9 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_aadafc50f7f06af434149d7469df4714d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">mlx::steel::BlockLoader::src_ld</a></div><div class="ttdeci">const int src_ld</div><div class="ttdef"><b>Definition</b> loader.h:30</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_ab87876699d55473620c7ea99f9da911d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">mlx::steel::BlockLoader::tile_stride</a></div><div class="ttdeci">const int tile_stride</div><div class="ttdef"><b>Definition</b> loader.h:31</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_abb0f4f66ec8b123627beb8eb4fbb609d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">mlx::steel::BlockLoader::load_safe</a></div><div class="ttdeci">METAL_FUNC void load_safe(short2 src_tile_dim) const</div><div class="ttdef"><b>Definition</b> loader.h:83</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_ad1db14517568ae9eddfb6986ef31c7aa"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">mlx::steel::BlockLoader::src</a></div><div class="ttdeci">const device T * src</div><div class="ttdef"><b>Definition</b> loader.h:40</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_adb4ca2cc193630a779de552fa8847ddf"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">mlx::steel::BlockLoader::apply_inplace_op</a></div><div class="ttdeci">METAL_FUNC void apply_inplace_op(thread const UnaryOp &amp;op) const</div><div class="ttdef"><b>Definition</b> loader.h:63</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_af34c184a19846e4b40ba54b2946589ec"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec">mlx::steel::BlockLoader::dst</a></div><div class="ttdeci">threadgroup T * dst</div><div class="ttdef"><b>Definition</b> loader.h:39</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html_af1c6c35a42e9da4408c1013ff1741bc2"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">mlx::steel::BlockLoader::dst</a></div><div class="ttdeci">threadgroup T * dst</div><div class="ttdef"><b>Definition</b> loader.h:39</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/mma_8h.html b/docs/build/html/gemm_2mma_8h.html
similarity index 99%
rename from docs/build/html/mma_8h.html
rename to docs/build/html/gemm_2mma_8h.html
index d0a24d364..8c11f8a96 100644
--- a/docs/build/html/mma_8h.html
+++ b/docs/build/html/gemm_2mma_8h.html
@@ -102,7 +102,7 @@ $(function(){ initResizable(false); });
 <code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">mlx/backend/metal/kernels/steel/gemm/transforms.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="integral__constant_8h_source.html">mlx/backend/metal/kernels/steel/utils/integral_constant.h</a>&quot;</code><br />
 </div>
-<p><a href="mma_8h_source.html">Go to the source code of this file.</a></p>
+<p><a href="gemm_2mma_8h_source.html">Go to the source code of this file.</a></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
 Classes</h2></td></tr>
diff --git a/docs/build/html/mma_8h_source.html b/docs/build/html/gemm_2mma_8h_source.html
similarity index 84%
rename from docs/build/html/mma_8h_source.html
rename to docs/build/html/gemm_2mma_8h_source.html
index 75f6ea40b..1b8fc26da 100644
--- a/docs/build/html/mma_8h_source.html
+++ b/docs/build/html/gemm_2mma_8h_source.html
@@ -91,7 +91,7 @@ $(function(){ initResizable(false); });
   <div class="headertitle"><div class="title">mma.h</div></div>
 </div><!--header-->
 <div class="contents">
-<a href="mma_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<a href="gemm_2mma_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
 <div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
 <div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
 <div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
@@ -125,20 +125,20 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
 <div class="foldopen" id="foldopen00033" data-start="{" data-end="};">
 <div class="line"><a id="l00033" name="l00033"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">   33</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a>&lt;T, 8, 8&gt; {</div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4">   34</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kFragRows = 8;</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f">   35</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kFragCols = 8;</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kFragRows = 8;</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kFragCols = 8;</div>
 <div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span> </div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a3c34dfdc944db110f4735f1b25307cf0">   37</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemsPerFrag = (kFragRows * kFragCols) / 32;</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemsPerFrag = (kFragRows * kFragCols) / 32;</div>
 <div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span> </div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f">   39</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemRows = 1;</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd">   40</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemCols = 2;</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemRows = 1;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> kElemCols = 2;</div>
 <div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span> </div>
 <div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>  <span class="keyword">static_assert</span>(</div>
 <div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>      kElemRows * kElemCols == kElemsPerFrag,</div>
 <div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>      <span class="stringliteral">&quot;MMAFrag shape is not consistent with MMAFrag size&quot;</span>);</div>
 <div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">   46</a></span>  <span class="keyword">typedef</span> metal::simdgroup_matrix&lt;T, kFragRows, kFragCols&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a>;</div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">   47</a></span>  <span class="keyword">typedef</span> metal::vec&lt;T, kElemsPerFrag&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>;</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  <span class="keyword">typedef</span> metal::simdgroup_matrix&lt;T, kFragRows, kFragCols&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>;</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>  <span class="keyword">typedef</span> metal::vec&lt;T, kElemsPerFrag&gt; <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>;</div>
 <div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span> </div>
 <div class="foldopen" id="foldopen00049" data-start="{" data-end="}">
 <div class="line"><a id="l00049" name="l00049"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">   49</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> short2 <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">get_coord</a>(ushort simd_lane_id</div>
@@ -153,7 +153,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> SrcPtrType, <span class="keyword">typename</span> StrX, <span class="keyword">typename</span> StrY&gt;</div>
 <div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span></div>
 <div class="foldopen" id="foldopen00059" data-start="{" data-end="}">
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">   59</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">load</a>(thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; dst, SrcPtrType src, StrX str_x, StrY str_y) {</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">   59</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">load</a>(thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; dst, SrcPtrType src, StrX str_x, StrY str_y) {</div>
 <div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kElemRows; i++) {</div>
 <div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
@@ -174,7 +174,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>      <span class="keyword">typename</span> OffY&gt;</div>
 <div class="foldopen" id="foldopen00077" data-start="{" data-end="}">
 <div class="line"><a id="l00077" name="l00077"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">   77</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">load_safe</a>(</div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; dst,</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; dst,</div>
 <div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>      SrcPtrType src,</div>
 <div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>      StrX str_x,</div>
 <div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>      StrY str_y,</div>
@@ -200,7 +200,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> DstPtrType, <span class="keyword">typename</span> StrX, <span class="keyword">typename</span> StrY&gt;</div>
 <div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span></div>
 <div class="foldopen" id="foldopen00102" data-start="{" data-end="}">
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">  102</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a>(<span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; src, DstPtrType dst, StrX str_x, StrY str_y) {</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">  102</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a>(<span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; src, DstPtrType dst, StrX str_x, StrY str_y) {</div>
 <div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    <span class="keyword">using </span>U = <a class="code hl_typedef" href="namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9">pointer_element_t&lt;DstPtrType&gt;</a>;</div>
 <div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span> </div>
 <div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
@@ -223,7 +223,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>      <span class="keyword">typename</span> OffY&gt;</div>
 <div class="foldopen" id="foldopen00122" data-start="{" data-end="}">
 <div class="line"><a id="l00122" name="l00122"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">  122</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">store_safe</a>(</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>      <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; src,</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>      <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; src,</div>
 <div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>      DstPtrType dst,</div>
 <div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>      StrX str_x,</div>
 <div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>      StrY str_y,</div>
@@ -248,31 +248,31 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span> </div>
 <div class="foldopen" id="foldopen00145" data-start="{" data-end="}">
 <div class="line"><a id="l00145" name="l00145"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">  145</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">mma</a>(</div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; D,</div>
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; A,</div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; B,</div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp; C) {</div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> D_mat;</div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> A_mat;</div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> B_mat;</div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> C_mat;</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; D,</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; A,</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; B,</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp; C) {</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> D_mat;</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> A_mat;</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> B_mat;</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> C_mat;</div>
 <div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span> </div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp;<span class="keyword">&gt;</span>(A_mat.thread_elements()) = A;</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp;<span class="keyword">&gt;</span>(B_mat.thread_elements()) = B;</div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp;<span class="keyword">&gt;</span>(C_mat.thread_elements()) = C;</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(A_mat.thread_elements()) = A;</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(B_mat.thread_elements()) = B;</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(C_mat.thread_elements()) = C;</div>
 <div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span> </div>
 <div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    mma(D_mat, A_mat, B_mat, C_mat);</div>
 <div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span> </div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    D = <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a>&amp;<span class="keyword">&gt;</span>(D_mat.thread_elements());</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    D = <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a>&amp;<span class="keyword">&gt;</span>(D_mat.thread_elements());</div>
 <div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>  }</div>
 </div>
 <div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span> </div>
 <div class="foldopen" id="foldopen00164" data-start="{" data-end="}">
 <div class="line"><a id="l00164" name="l00164"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">  164</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">mma</a>(</div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a>&amp; D,</div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a>&amp; A,</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a>&amp; B,</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a>&amp; C) {</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; D,</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; A,</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; B,</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>      thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a>&amp; C) {</div>
 <div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    simdgroup_multiply_accumulate(D, A, B, C);</div>
 <div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>  }</div>
 </div>
@@ -286,25 +286,25 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    <span class="keyword">class </span>MMAFrag_ = BaseMMAFrag&lt;T, 8, 8&gt;&gt;</div>
 <div class="foldopen" id="foldopen00178" data-start="{" data-end="};">
 <div class="line"><a id="l00178" name="l00178"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html">  178</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a> {</div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382">  179</a></span>  <span class="keyword">using </span><a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382">MMAFrag_t</a> = MMAFrag_;</div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">  180</a></span>  <span class="keyword">using </span><a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a> = T;</div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">  181</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a> = MMAFrag_t::kFragRows;</div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">  182</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a> = MMAFrag_t::kFragCols;</div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">  183</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a> = MMAFrag_t::kElemsPerFrag;</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>  <span class="keyword">using </span><a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">MMAFrag_t</a> = MMAFrag_;</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>  <span class="keyword">using </span><a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a> = T;</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a> = MMAFrag_t::kFragRows;</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a> = MMAFrag_t::kFragCols;</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a> = MMAFrag_t::kElemsPerFrag;</div>
 <div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span> </div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">  185</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> = kTileRows_;</div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">  186</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> = kTileCols_;</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> = kTileRows_;</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> = kTileCols_;</div>
 <div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span> </div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">  188</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">kRows</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>;</div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">  189</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">kCols</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>;</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">kRows</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a>;</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">kCols</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a>;</div>
 <div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span> </div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">  191</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>;</div>
-<div class="line"><a id="l00192" name="l00192"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">  192</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">kElemsPerTile</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a>;</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a>;</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">int</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">kElemsPerTile</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a>;</div>
 <div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span> </div>
-<div class="line"><a id="l00194" name="l00194"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">  194</a></span>  <span class="keyword">typedef</span> <span class="keyword">typename</span> MMAFrag_t::mat_type <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">mat_type</a>;</div>
-<div class="line"><a id="l00195" name="l00195"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">  195</a></span>  <span class="keyword">typedef</span> <span class="keyword">typename</span> MMAFrag_t::frag_type <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">frag_type</a>;</div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>  <span class="keyword">typedef</span> <span class="keyword">typename</span> MMAFrag_t::mat_type <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a>;</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>  <span class="keyword">typedef</span> <span class="keyword">typename</span> MMAFrag_t::frag_type <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>;</div>
 <div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span> </div>
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">  197</a></span>  <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">frag_type</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a>[<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>] = {<a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">frag_type</a>(0)};</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>  <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>] = {<a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>(0)};</div>
 <div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span> </div>
 <div class="line"><a id="l00199" name="l00199"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">  199</a></span>  METAL_FUNC <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">MMATile</a>() thread {}</div>
 <div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span> </div>
@@ -312,28 +312,28 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00201" name="l00201"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">  201</a></span>  METAL_FUNC <span class="keyword">constexpr</span> <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">clear</a>() {</div>
 <div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>; ++i) {</div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a>[i] = <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">frag_type</a>(0);</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[i] = <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>(0);</div>
 <div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>    }</div>
 <div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>  }</div>
 </div>
 <div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span> </div>
 <div class="foldopen" id="foldopen00208" data-start="{" data-end="}">
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">  208</a></span>  METAL_FUNC <span class="keyword">constexpr</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">frag_type</a>&amp; <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(<span class="keyword">const</span> <span class="keywordtype">short</span> i, <span class="keyword">const</span> <span class="keywordtype">short</span> j) {</div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> + j];</div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">  208</a></span>  METAL_FUNC <span class="keyword">constexpr</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>&amp; <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(<span class="keyword">const</span> <span class="keywordtype">short</span> i, <span class="keyword">const</span> <span class="keywordtype">short</span> j) {</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> + j];</div>
 <div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>  }</div>
 </div>
 <div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span> </div>
 <div class="foldopen" id="foldopen00212" data-start="{" data-end="}">
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">  212</a></span>  METAL_FUNC <span class="keyword">constexpr</span> <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">frag_type</a>&amp; <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">frag_at</a>(</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">  212</a></span>  METAL_FUNC <span class="keyword">constexpr</span> <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>&amp; <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">frag_at</a>(</div>
 <div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> i,</div>
 <div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> j)<span class="keyword"> const </span>{</div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> + j];</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>[i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> + j];</div>
 <div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>  }</div>
 </div>
 <div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span> </div>
 <div class="foldopen" id="foldopen00218" data-start="{" data-end="}">
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">  218</a></span>  METAL_FUNC <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">mat_type</a> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mat_at</a>(<span class="keyword">const</span> <span class="keywordtype">short</span> i, <span class="keyword">const</span> <span class="keywordtype">short</span> j) {</div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">mat_type</a> val_mat;</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">  218</a></span>  METAL_FUNC <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a> <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mat_at</a>(<span class="keyword">const</span> <span class="keywordtype">short</span> i, <span class="keyword">const</span> <span class="keywordtype">short</span> j) {</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>    <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a> val_mat;</div>
 <div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> ii = 0; ii &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a>; ++ii) {</div>
 <div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>      val_mat.thread_elements()[ii] = <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j)[ii];</div>
@@ -343,14 +343,14 @@ $(function(){ initResizable(false); });
 </div>
 <div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span> </div>
 <div class="foldopen" id="foldopen00227" data-start="{" data-end="}">
-<div class="line"><a id="l00227" name="l00227"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">  227</a></span>  METAL_FUNC thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a>* <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a>() {</div>
-<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>    <span class="keywordflow">return</span> <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a>*<span class="keyword">&gt;</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a>);</div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">  227</a></span>  METAL_FUNC thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>* <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a>() {</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>    <span class="keywordflow">return</span> <span class="keyword">reinterpret_cast&lt;</span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>*<span class="keyword">&gt;</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>);</div>
 <div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>  }</div>
 </div>
 <div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span> </div>
 <div class="foldopen" id="foldopen00231" data-start="{" data-end="}">
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">  231</a></span>  METAL_FUNC <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a>* <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">elems</a>()<span class="keyword"> const </span>{</div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>    <span class="keywordflow">return</span> <span class="keyword">reinterpret_cast&lt;</span><span class="keyword">const </span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a>*<span class="keyword">&gt;</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a>);</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">  231</a></span>  METAL_FUNC <span class="keyword">const</span> thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>* <a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">elems</a>()<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>    <span class="keywordflow">return</span> <span class="keyword">reinterpret_cast&lt;</span><span class="keyword">const </span>thread <a class="code hl_typedef" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a>*<span class="keyword">&gt;</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a>);</div>
 <div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>  }</div>
 </div>
 <div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span> </div>
@@ -473,12 +473,11 @@ $(function(){ initResizable(false); });
 </div>
 <div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span> </div>
 <div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> M, <span class="keywordtype">int</span> N, <span class="keywordtype">int</span> K&gt;</div>
-<div class="foldopen" id="foldopen00341" data-start="{" data-end="}">
-<div class="line"><a id="l00341" name="l00341"></a><span class="lineno"><a class="line" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">  341</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(</div>
-<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;T, M, N&gt;</a>&amp; D,</div>
-<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;U, M, K&gt;</a>&amp; A,</div>
-<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;U, K, N&gt;</a>&amp; B,</div>
-<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>    thread <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;T, M, N&gt;</a>&amp; C) {</div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(</div>
+<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>    thread MMATile&lt;T, M, N&gt;&amp; D,</div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>    thread MMATile&lt;U, M, K&gt;&amp; A,</div>
+<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>    thread MMATile&lt;U, K, N&gt;&amp; B,</div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>    thread MMATile&lt;T, M, N&gt;&amp; C) {</div>
 <div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>  <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span>  <span class="keywordflow">for</span> (<span class="keywordtype">short</span> m = 0; m &lt; M; ++m) {</div>
 <div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
@@ -486,7 +485,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>      <span class="keywordtype">short</span> n_serp = (m % 2) ? (N - 1 - n) : n;</div>
 <div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; K; ++k) {</div>
-<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>        <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;T, M, N&gt;::MMAFrag_t::mma</a>(</div>
+<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>        MMATile&lt;T, M, N&gt;::MMAFrag_t::mma(</div>
 <div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>            D.frag_at(m, n_serp),</div>
 <div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>            A.frag_at(m, k),</div>
 <div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>            B.frag_at(k, n_serp),</div>
@@ -495,7 +494,6 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>    }</div>
 <div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>  }</div>
 <div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>}</div>
-</div>
 <div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span> </div>
 <div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span><span class="keyword">template</span> &lt;</div>
 <div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>    <span class="keyword">typename</span> T,</div>
@@ -514,42 +512,42 @@ $(function(){ initResizable(false); });
 <div class="foldopen" id="foldopen00377" data-start="{" data-end="};">
 <div class="line"><a id="l00377" name="l00377"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html">  377</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a> {</div>
 <div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span>  <span class="comment">// MMAFrag size</span></div>
-<div class="line"><a id="l00379" name="l00379"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">  379</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> = 8;</div>
-<div class="line"><a id="l00380" name="l00380"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae2c42cb6d0dde785859164c195f4d13c">  380</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">MMAFrag_acc_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag&lt;AccumType, kFragSize, kFragSize&gt;</a>;</div>
+<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> = 8;</div>
+<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">MMAFrag_acc_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag&lt;AccumType, kFragSize, kFragSize&gt;</a>;</div>
 <div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span> </div>
 <div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span>  <span class="comment">// Warp tile simdgroup matrix strides along M</span></div>
-<div class="line"><a id="l00383" name="l00383"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">  383</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WM;</div>
+<div class="line"><a id="l00383" name="l00383"></a><span class="lineno">  383</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WM;</div>
 <div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span>  <span class="comment">// Warp tile simdgroup matrix strides along M</span></div>
-<div class="line"><a id="l00385" name="l00385"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">  385</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WN;</div>
+<div class="line"><a id="l00385" name="l00385"></a><span class="lineno">  385</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WN;</div>
 <div class="line"><a id="l00386" name="l00386"></a><span class="lineno">  386</span> </div>
 <div class="line"><a id="l00387" name="l00387"></a><span class="lineno">  387</span>  <span class="comment">// Warp tile size along M</span></div>
-<div class="line"><a id="l00388" name="l00388"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">  388</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a> = BM / <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>;</div>
+<div class="line"><a id="l00388" name="l00388"></a><span class="lineno">  388</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a> = BM / (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WM);</div>
 <div class="line"><a id="l00389" name="l00389"></a><span class="lineno">  389</span>  <span class="comment">// Warp tile size along N</span></div>
-<div class="line"><a id="l00390" name="l00390"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">  390</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a> = BN / <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>;</div>
+<div class="line"><a id="l00390" name="l00390"></a><span class="lineno">  390</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a> = BN / (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * WN);</div>
 <div class="line"><a id="l00391" name="l00391"></a><span class="lineno">  391</span> </div>
 <div class="line"><a id="l00392" name="l00392"></a><span class="lineno">  392</span>  <span class="comment">// Threadgroup A strides</span></div>
-<div class="line"><a id="l00393" name="l00393"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">  393</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">A_str_m</a> = transpose_a ? 1 : lda_tgp; <span class="comment">// M</span></div>
-<div class="line"><a id="l00394" name="l00394"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">  394</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">A_str_k</a> = transpose_a ? lda_tgp : 1; <span class="comment">// K</span></div>
+<div class="line"><a id="l00393" name="l00393"></a><span class="lineno">  393</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">A_str_m</a> = transpose_a ? 1 : lda_tgp; <span class="comment">// M</span></div>
+<div class="line"><a id="l00394" name="l00394"></a><span class="lineno">  394</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">A_str_k</a> = transpose_a ? lda_tgp : 1; <span class="comment">// K</span></div>
 <div class="line"><a id="l00395" name="l00395"></a><span class="lineno">  395</span> </div>
 <div class="line"><a id="l00396" name="l00396"></a><span class="lineno">  396</span>  <span class="comment">// Threadgroup B strides</span></div>
-<div class="line"><a id="l00397" name="l00397"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">  397</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a> = transpose_b ? 1 : ldb_tgp; <span class="comment">// K</span></div>
-<div class="line"><a id="l00398" name="l00398"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">  398</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">B_str_n</a> = transpose_b ? ldb_tgp : 1; <span class="comment">// N</span></div>
+<div class="line"><a id="l00397" name="l00397"></a><span class="lineno">  397</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a> = transpose_b ? 1 : ldb_tgp; <span class="comment">// K</span></div>
+<div class="line"><a id="l00398" name="l00398"></a><span class="lineno">  398</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">B_str_n</a> = transpose_b ? ldb_tgp : 1; <span class="comment">// N</span></div>
 <div class="line"><a id="l00399" name="l00399"></a><span class="lineno">  399</span> </div>
 <div class="line"><a id="l00400" name="l00400"></a><span class="lineno">  400</span>  <span class="comment">// Threadgroup strides along K</span></div>
-<div class="line"><a id="l00401" name="l00401"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">  401</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">tile_stride_a</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">A_str_k</a>;</div>
-<div class="line"><a id="l00402" name="l00402"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">  402</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">tile_stride_b</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a>;</div>
+<div class="line"><a id="l00401" name="l00401"></a><span class="lineno">  401</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">tile_stride_a</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">A_str_k</a>;</div>
+<div class="line"><a id="l00402" name="l00402"></a><span class="lineno">  402</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">tile_stride_b</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a>;</div>
 <div class="line"><a id="l00403" name="l00403"></a><span class="lineno">  403</span> </div>
 <div class="line"><a id="l00404" name="l00404"></a><span class="lineno">  404</span>  <span class="comment">// Simdgroup matrices</span></div>
-<div class="line"><a id="l00405" name="l00405"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">  405</a></span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TM, 1, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">Atile</a>;</div>
-<div class="line"><a id="l00406" name="l00406"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">  406</a></span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, 1, TN, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">Btile</a>;</div>
-<div class="line"><a id="l00407" name="l00407"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">  407</a></span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TM, TN, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>;</div>
+<div class="line"><a id="l00405" name="l00405"></a><span class="lineno">  405</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TM, 1, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">Atile</a>;</div>
+<div class="line"><a id="l00406" name="l00406"></a><span class="lineno">  406</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, 1, TN, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">Btile</a>;</div>
+<div class="line"><a id="l00407" name="l00407"></a><span class="lineno">  407</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TM, TN, MMAFrag_acc_t&gt;</a> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>;</div>
 <div class="line"><a id="l00408" name="l00408"></a><span class="lineno">  408</span> </div>
 <div class="line"><a id="l00409" name="l00409"></a><span class="lineno">  409</span>  <span class="comment">// Offsets within threadgroup</span></div>
-<div class="line"><a id="l00410" name="l00410"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">  410</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>;</div>
-<div class="line"><a id="l00411" name="l00411"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">  411</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
+<div class="line"><a id="l00410" name="l00410"></a><span class="lineno">  410</span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>;</div>
+<div class="line"><a id="l00411" name="l00411"></a><span class="lineno">  411</span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
 <div class="line"><a id="l00412" name="l00412"></a><span class="lineno">  412</span> </div>
-<div class="line"><a id="l00413" name="l00413"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">  413</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">As_offset</a>;</div>
-<div class="line"><a id="l00414" name="l00414"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">  414</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">Bs_offset</a>;</div>
+<div class="line"><a id="l00413" name="l00413"></a><span class="lineno">  413</span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">As_offset</a>;</div>
+<div class="line"><a id="l00414" name="l00414"></a><span class="lineno">  414</span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">Bs_offset</a>;</div>
 <div class="line"><a id="l00415" name="l00415"></a><span class="lineno">  415</span> </div>
 <div class="line"><a id="l00416" name="l00416"></a><span class="lineno">  416</span>  <span class="comment">/* Constructor */</span></div>
 <div class="foldopen" id="foldopen00417" data-start="{" data-end="}">
@@ -585,15 +583,15 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00444" name="l00444"></a><span class="lineno">  444</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> kk = 0; kk &lt; BK; kk += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>) {</div>
 <div class="line"><a id="l00445" name="l00445"></a><span class="lineno">  445</span>      simdgroup_barrier(mem_flags::mem_none);</div>
 <div class="line"><a id="l00446" name="l00446"></a><span class="lineno">  446</span> </div>
-<div class="line"><a id="l00447" name="l00447"></a><span class="lineno">  447</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">Atile</a>.template load&lt;T, WM, 1, A_str_m, A_str_k&gt;(As);</div>
+<div class="line"><a id="l00447" name="l00447"></a><span class="lineno">  447</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">Atile</a>.template load&lt;T, WM, 1, A_str_m, A_str_k&gt;(As);</div>
 <div class="line"><a id="l00448" name="l00448"></a><span class="lineno">  448</span> </div>
 <div class="line"><a id="l00449" name="l00449"></a><span class="lineno">  449</span>      simdgroup_barrier(mem_flags::mem_none);</div>
 <div class="line"><a id="l00450" name="l00450"></a><span class="lineno">  450</span> </div>
-<div class="line"><a id="l00451" name="l00451"></a><span class="lineno">  451</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">Btile</a>.template load&lt;T, 1, WN, B_str_k, B_str_n&gt;(Bs);</div>
+<div class="line"><a id="l00451" name="l00451"></a><span class="lineno">  451</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">Btile</a>.template load&lt;T, 1, WN, B_str_k, B_str_n&gt;(Bs);</div>
 <div class="line"><a id="l00452" name="l00452"></a><span class="lineno">  452</span> </div>
 <div class="line"><a id="l00453" name="l00453"></a><span class="lineno">  453</span>      simdgroup_barrier(mem_flags::mem_none);</div>
 <div class="line"><a id="l00454" name="l00454"></a><span class="lineno">  454</span> </div>
-<div class="line"><a id="l00455" name="l00455"></a><span class="lineno">  455</span>      <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">Atile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">Btile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>);</div>
+<div class="line"><a id="l00455" name="l00455"></a><span class="lineno">  455</span>      <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">Atile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">Btile</a>, <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>);</div>
 <div class="line"><a id="l00456" name="l00456"></a><span class="lineno">  456</span> </div>
 <div class="line"><a id="l00457" name="l00457"></a><span class="lineno">  457</span>      <span class="comment">// Progress to next simdgroup tile</span></div>
 <div class="line"><a id="l00458" name="l00458"></a><span class="lineno">  458</span>      As += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">tile_stride_a</a>;</div>
@@ -607,14 +605,14 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00464" name="l00464"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">  464</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">store_result</a>(device U* D, <span class="keyword">const</span> <span class="keywordtype">int</span> ldd) {</div>
 <div class="line"><a id="l00465" name="l00465"></a><span class="lineno">  465</span>    <span class="comment">// Apply epilogue</span></div>
 <div class="line"><a id="l00466" name="l00466"></a><span class="lineno">  466</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
-<div class="line"><a id="l00467" name="l00467"></a><span class="lineno">  467</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>)::kElemsPerTile; i++) {</div>
-<div class="line"><a id="l00468" name="l00468"></a><span class="lineno">  468</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.elems()[i] = Epilogue::apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.elems()[i]);</div>
+<div class="line"><a id="l00467" name="l00467"></a><span class="lineno">  467</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerTile; i++) {</div>
+<div class="line"><a id="l00468" name="l00468"></a><span class="lineno">  468</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i] = Epilogue::apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i]);</div>
 <div class="line"><a id="l00469" name="l00469"></a><span class="lineno">  469</span>    }</div>
 <div class="line"><a id="l00470" name="l00470"></a><span class="lineno">  470</span> </div>
 <div class="line"><a id="l00471" name="l00471"></a><span class="lineno">  471</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
 <div class="line"><a id="l00472" name="l00472"></a><span class="lineno">  472</span>    D += <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a> * ldd + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
 <div class="line"><a id="l00473" name="l00473"></a><span class="lineno">  473</span> </div>
-<div class="line"><a id="l00474" name="l00474"></a><span class="lineno">  474</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.template store&lt;U, WM, WN&gt;(D, ldd);</div>
+<div class="line"><a id="l00474" name="l00474"></a><span class="lineno">  474</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.template store&lt;U, WM, WN&gt;(D, ldd);</div>
 <div class="line"><a id="l00475" name="l00475"></a><span class="lineno">  475</span>  }</div>
 </div>
 <div class="line"><a id="l00476" name="l00476"></a><span class="lineno">  476</span> </div>
@@ -623,8 +621,8 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00478" name="l00478"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">  478</a></span>  <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">store_result_safe</a>(device U* D, <span class="keyword">const</span> <span class="keywordtype">int</span> ldd, short2 dst_tile_dims) {</div>
 <div class="line"><a id="l00479" name="l00479"></a><span class="lineno">  479</span>    <span class="comment">// Apply epilogue</span></div>
 <div class="line"><a id="l00480" name="l00480"></a><span class="lineno">  480</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
-<div class="line"><a id="l00481" name="l00481"></a><span class="lineno">  481</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>)::kElemsPerTile; i++) {</div>
-<div class="line"><a id="l00482" name="l00482"></a><span class="lineno">  482</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.elems()[i] = Epilogue::apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.elems()[i]);</div>
+<div class="line"><a id="l00481" name="l00481"></a><span class="lineno">  481</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerTile; i++) {</div>
+<div class="line"><a id="l00482" name="l00482"></a><span class="lineno">  482</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i] = Epilogue::apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i]);</div>
 <div class="line"><a id="l00483" name="l00483"></a><span class="lineno">  483</span>    }</div>
 <div class="line"><a id="l00484" name="l00484"></a><span class="lineno">  484</span> </div>
 <div class="line"><a id="l00485" name="l00485"></a><span class="lineno">  485</span>    <span class="comment">// Adjust for simdgroup and thread location</span></div>
@@ -634,7 +632,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00489" name="l00489"></a><span class="lineno">  489</span>    <span class="keywordflow">if</span> (dst_tile_dims.x &lt;= 0 || dst_tile_dims.y &lt;= 0)</div>
 <div class="line"><a id="l00490" name="l00490"></a><span class="lineno">  490</span>      <span class="keywordflow">return</span>;</div>
 <div class="line"><a id="l00491" name="l00491"></a><span class="lineno">  491</span> </div>
-<div class="line"><a id="l00492" name="l00492"></a><span class="lineno">  492</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.template store_safe&lt;U, WM, WN&gt;(D, ldd, dst_tile_dims);</div>
+<div class="line"><a id="l00492" name="l00492"></a><span class="lineno">  492</span>    <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.template store_safe&lt;U, WM, WN&gt;(D, ldd, dst_tile_dims);</div>
 <div class="line"><a id="l00493" name="l00493"></a><span class="lineno">  493</span>  }</div>
 </div>
 <div class="line"><a id="l00494" name="l00494"></a><span class="lineno">  494</span> </div>
@@ -644,8 +642,8 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00497" name="l00497"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">  497</a></span>  METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">apply_epilogue</a>(thread <span class="keyword">const</span> UnaryEpilogue&amp; epilogue_op) {</div>
 <div class="line"><a id="l00498" name="l00498"></a><span class="lineno">  498</span>    <span class="comment">// Loop over all simdgroup tiles</span></div>
 <div class="line"><a id="l00499" name="l00499"></a><span class="lineno">  499</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
-<div class="line"><a id="l00500" name="l00500"></a><span class="lineno">  500</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>)::kElemsPerTile; i++) {</div>
-<div class="line"><a id="l00501" name="l00501"></a><span class="lineno">  501</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.elems()[i] = epilogue_op.apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.elems()[i]);</div>
+<div class="line"><a id="l00500" name="l00500"></a><span class="lineno">  500</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerTile; i++) {</div>
+<div class="line"><a id="l00501" name="l00501"></a><span class="lineno">  501</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i] = epilogue_op.apply(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.elems()[i]);</div>
 <div class="line"><a id="l00502" name="l00502"></a><span class="lineno">  502</span>    }</div>
 <div class="line"><a id="l00503" name="l00503"></a><span class="lineno">  503</span>  }</div>
 </div>
@@ -667,12 +665,12 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00518" name="l00518"></a><span class="lineno">  518</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00519" name="l00519"></a><span class="lineno">  519</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
 <div class="line"><a id="l00520" name="l00520"></a><span class="lineno">  520</span>        <span class="comment">// Get accumulated result and associated offset in C</span></div>
-<div class="line"><a id="l00521" name="l00521"></a><span class="lineno">  521</span>        thread <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00521" name="l00521"></a><span class="lineno">  521</span>        thread <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
 <div class="line"><a id="l00522" name="l00522"></a><span class="lineno">  522</span>        <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
 <div class="line"><a id="l00523" name="l00523"></a><span class="lineno">  523</span> </div>
 <div class="line"><a id="l00524" name="l00524"></a><span class="lineno">  524</span>        <span class="comment">// Apply epilogue</span></div>
 <div class="line"><a id="l00525" name="l00525"></a><span class="lineno">  525</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
-<div class="line"><a id="l00526" name="l00526"></a><span class="lineno">  526</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>)::kElemsPerFrag; k++) {</div>
+<div class="line"><a id="l00526" name="l00526"></a><span class="lineno">  526</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> k = 0; k &lt; <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag; k++) {</div>
 <div class="line"><a id="l00527" name="l00527"></a><span class="lineno">  527</span>          accum[k] = epilogue_op.apply(accum[k], C[offset_c + k * fdc]);</div>
 <div class="line"><a id="l00528" name="l00528"></a><span class="lineno">  528</span>        }</div>
 <div class="line"><a id="l00529" name="l00529"></a><span class="lineno">  529</span>      }</div>
@@ -702,10 +700,10 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00551" name="l00551"></a><span class="lineno">  551</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00552" name="l00552"></a><span class="lineno">  552</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
 <div class="line"><a id="l00553" name="l00553"></a><span class="lineno">  553</span>        <span class="comment">// Get accumulated result and associated offset in C</span></div>
-<div class="line"><a id="l00554" name="l00554"></a><span class="lineno">  554</span>        thread <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00554" name="l00554"></a><span class="lineno">  554</span>        thread <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
 <div class="line"><a id="l00555" name="l00555"></a><span class="lineno">  555</span>        <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
 <div class="line"><a id="l00556" name="l00556"></a><span class="lineno">  556</span> </div>
-<div class="line"><a id="l00557" name="l00557"></a><span class="lineno">  557</span>        <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>)::kElemsPerFrag;</div>
+<div class="line"><a id="l00557" name="l00557"></a><span class="lineno">  557</span>        <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag;</div>
 <div class="line"><a id="l00558" name="l00558"></a><span class="lineno">  558</span> </div>
 <div class="line"><a id="l00559" name="l00559"></a><span class="lineno">  559</span>        <span class="comment">// Read C</span></div>
 <div class="line"><a id="l00560" name="l00560"></a><span class="lineno">  560</span>        U c_elems[kelems] = {0};</div>
@@ -740,7 +738,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00587" name="l00587"></a><span class="lineno">  587</span>    C += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldc + (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>)*fdc;</div>
 <div class="line"><a id="l00588" name="l00588"></a><span class="lineno">  588</span>    D += (<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a>)*ldd + <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a>;</div>
 <div class="line"><a id="l00589" name="l00589"></a><span class="lineno">  589</span> </div>
-<div class="line"><a id="l00590" name="l00590"></a><span class="lineno">  590</span>    <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>)::kElemsPerFrag;</div>
+<div class="line"><a id="l00590" name="l00590"></a><span class="lineno">  590</span>    <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag;</div>
 <div class="line"><a id="l00591" name="l00591"></a><span class="lineno">  591</span> </div>
 <div class="line"><a id="l00592" name="l00592"></a><span class="lineno">  592</span>    <span class="comment">// Loop over all simdgroup tiles</span></div>
 <div class="line"><a id="l00593" name="l00593"></a><span class="lineno">  593</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
@@ -748,7 +746,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00595" name="l00595"></a><span class="lineno">  595</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00596" name="l00596"></a><span class="lineno">  596</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
 <div class="line"><a id="l00597" name="l00597"></a><span class="lineno">  597</span>        <span class="comment">// Get accumulated result and associated offset in C</span></div>
-<div class="line"><a id="l00598" name="l00598"></a><span class="lineno">  598</span>        thread <span class="keyword">const</span> <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00598" name="l00598"></a><span class="lineno">  598</span>        thread <span class="keyword">const</span> <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
 <div class="line"><a id="l00599" name="l00599"></a><span class="lineno">  599</span>        <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
 <div class="line"><a id="l00600" name="l00600"></a><span class="lineno">  600</span>        <span class="keywordtype">int</span> offset_d = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldd + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>);</div>
 <div class="line"><a id="l00601" name="l00601"></a><span class="lineno">  601</span> </div>
@@ -779,7 +777,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00624" name="l00624"></a><span class="lineno">  624</span>    <span class="keywordflow">if</span> (dst_tile_dims.x &lt;= 0 || dst_tile_dims.y &lt;= 0)</div>
 <div class="line"><a id="l00625" name="l00625"></a><span class="lineno">  625</span>      <span class="keywordflow">return</span>;</div>
 <div class="line"><a id="l00626" name="l00626"></a><span class="lineno">  626</span> </div>
-<div class="line"><a id="l00627" name="l00627"></a><span class="lineno">  627</span>    <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>)::kElemsPerFrag;</div>
+<div class="line"><a id="l00627" name="l00627"></a><span class="lineno">  627</span>    <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kelems = <span class="keyword">decltype</span>(<a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>)::kElemsPerFrag;</div>
 <div class="line"><a id="l00628" name="l00628"></a><span class="lineno">  628</span> </div>
 <div class="line"><a id="l00629" name="l00629"></a><span class="lineno">  629</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00630" name="l00630"></a><span class="lineno">  630</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>; i++) {</div>
@@ -787,7 +785,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00632" name="l00632"></a><span class="lineno">  632</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
 <div class="line"><a id="l00633" name="l00633"></a><span class="lineno">  633</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a>; j++) {</div>
 <div class="line"><a id="l00634" name="l00634"></a><span class="lineno">  634</span>          <span class="comment">// Get accumulated result and associated offset in C</span></div>
-<div class="line"><a id="l00635" name="l00635"></a><span class="lineno">  635</span>          thread <span class="keyword">const</span> <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a>.frag_at(i, j);</div>
+<div class="line"><a id="l00635" name="l00635"></a><span class="lineno">  635</span>          thread <span class="keyword">const</span> <span class="keyword">auto</span>&amp; accum = <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a>.frag_at(i, j);</div>
 <div class="line"><a id="l00636" name="l00636"></a><span class="lineno">  636</span>          <span class="keywordtype">int</span> offset_c = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldc + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>) * fdc;</div>
 <div class="line"><a id="l00637" name="l00637"></a><span class="lineno">  637</span>          <span class="keywordtype">int</span> offset_d = (i * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a>) * ldd + (j * <a class="code hl_variable" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a>);</div>
 <div class="line"><a id="l00638" name="l00638"></a><span class="lineno">  638</span> </div>
@@ -811,79 +809,79 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00654" name="l00654"></a><span class="lineno">  654</span>} <span class="comment">// namespace mlx</span></div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2gemm_2transforms_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html">transforms.h</a></div></div>
 <div class="ttc" id="aintegral__constant_8h_html"><div class="ttname"><a href="integral__constant_8h.html">integral_constant.h</a></div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
 <div class="ttc" id="anamespacemetal_html_ac82ee6c3fbe9ec5c78c07329424aaec9"><div class="ttname"><a href="namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9">metal::pointer_element_t</a></div><div class="ttdeci">typename pointer_element&lt; remove_cv_t&lt; T &gt; &gt;::type pointer_element_t</div><div class="ttdef"><b>Definition</b> type_traits.h:51</div></div>
-<div class="ttc" id="anamespacemlx_1_1steel_html_ad583e6038efc119542410f43b603d4ad"><div class="ttname"><a href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">mlx::steel::tile_matmad</a></div><div class="ttdeci">METAL_FUNC void tile_matmad(thread MMATile&lt; T, M, N &gt; &amp;D, thread MMATile&lt; U, M, K &gt; &amp;A, thread MMATile&lt; U, K, N &gt; &amp;B, thread MMATile&lt; T, M, N &gt; &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:341</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html_ad583e6038efc119542410f43b603d4ad"><div class="ttname"><a href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">mlx::steel::tile_matmad</a></div><div class="ttdeci">METAL_FUNC void tile_matmad(thread MMATile&lt; T, M, N &gt; &amp;D, thread MMATile&lt; U, M, K &gt; &amp;A, thread MMATile&lt; U, K, N &gt; &amp;B, thread MMATile&lt; T, M, N &gt; &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:413</div></div>
 <div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="asteel_2defines_8h_html"><div class="ttname"><a href="steel_2defines_8h.html">defines.h</a></div></div>
 <div class="ttc" id="asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6"><div class="ttname"><a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define STEEL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> defines.h:4</div></div>
 <div class="ttc" id="asteel_2defines_8h_html_a90b91c866313ffa46eff6d9cc944ad2b"><div class="ttname"><a href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a></div><div class="ttdeci">#define STEEL_CONST</div><div class="ttdef"><b>Definition</b> defines.h:3</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a1868f57d57c8adedab2c58492ec76946"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma</a></div><div class="ttdeci">static METAL_FUNC constexpr void mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:164</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a1f0b00daad8eba2f855bb306e70d2328"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe</a></div><div class="ttdeci">static METAL_FUNC constexpr void store_safe(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</div><div class="ttdef"><b>Definition</b> mma.h:122</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a72054f003c695b90a4fe5101e19cbaa9"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mat_type</a></div><div class="ttdeci">metal::simdgroup_matrix&lt; T, kFragRows, kFragCols &gt; mat_type</div><div class="ttdef"><b>Definition</b> mma.h:46</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a7331fff1d12f2f8b72b0006a3ad0dd83"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::get_coord</a></div><div class="ttdeci">static METAL_FUNC constexpr short2 get_coord(ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> mma.h:49</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a8028512f5a3d2b6acaf966be529627a3"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma</a></div><div class="ttdeci">static METAL_FUNC constexpr void mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:145</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a958b6952cbd9462d7ae9f6e029631887"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mat_type</a></div><div class="ttdeci">metal::simdgroup_matrix&lt; T, kFragRows, kFragCols &gt; mat_type</div><div class="ttdef"><b>Definition</b> mma.h:60</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_a9f53a5e9b046b4f217e782b733941b0c"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::frag_type</a></div><div class="ttdeci">metal::vec&lt; T, kElemsPerFrag &gt; frag_type</div><div class="ttdef"><b>Definition</b> mma.h:61</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_aa8f50ea8961ec5b35c1b81366d64f2cb"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store</a></div><div class="ttdeci">static METAL_FUNC constexpr void store(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y)</div><div class="ttdef"><b>Definition</b> mma.h:102</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_ac73006b36fc710feda3a7c796e21415c"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load</a></div><div class="ttdeci">static METAL_FUNC constexpr void load(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y)</div><div class="ttdef"><b>Definition</b> mma.h:59</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_ad22aaee4a2938cbdd315b39eda84e07d"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe</a></div><div class="ttdeci">static METAL_FUNC constexpr void load_safe(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</div><div class="ttdef"><b>Definition</b> mma.h:77</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4_html_af67c1b1aea594468e9426e1be0e31d0b"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::frag_type</a></div><div class="ttdeci">metal::vec&lt; T, kElemsPerFrag &gt; frag_type</div><div class="ttdef"><b>Definition</b> mma.h:47</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a></div><div class="ttdef"><b>Definition</b> mma.h:23</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a></div><div class="ttdef"><b>Definition</b> mma.h:377</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a0461451ffb5041b6a916ea17ed34288b"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">mlx::steel::BlockMMA::store_result</a></div><div class="ttdeci">METAL_FUNC void store_result(device U *D, const int ldd)</div><div class="ttdef"><b>Definition</b> mma.h:464</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a081ba538d30d1d02498a7f341e6bd611"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">mlx::steel::BlockMMA::store_result_safe</a></div><div class="ttdeci">METAL_FUNC void store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)</div><div class="ttdef"><b>Definition</b> mma.h:478</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a138ed1bbad2ca88d3a3c7d162cd36562"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">mlx::steel::BlockMMA::As_offset</a></div><div class="ttdeci">short As_offset</div><div class="ttdef"><b>Definition</b> mma.h:413</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a19abba19edeb37018da4bd31e01c8e26"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">mlx::steel::BlockMMA::Btile</a></div><div class="ttdeci">MMATile&lt; AccumType, 1, TN, MMAFrag_acc_t &gt; Btile</div><div class="ttdef"><b>Definition</b> mma.h:406</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a257287702dc849d0d8a078fced453142"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">mlx::steel::BlockMMA::A_str_k</a></div><div class="ttdeci">STEEL_CONST short A_str_k</div><div class="ttdef"><b>Definition</b> mma.h:394</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a49538190209e522ddbef45fe95563d17"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">mlx::steel::BlockMMA::B_str_n</a></div><div class="ttdeci">STEEL_CONST short B_str_n</div><div class="ttdef"><b>Definition</b> mma.h:398</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a5b0029866f493363942133b55bff7307"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">mlx::steel::BlockMMA::TM_stride</a></div><div class="ttdeci">STEEL_CONST short TM_stride</div><div class="ttdef"><b>Definition</b> mma.h:383</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a138ed1bbad2ca88d3a3c7d162cd36562"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">mlx::steel::BlockMMA::As_offset</a></div><div class="ttdeci">short As_offset</div><div class="ttdef"><b>Definition</b> mma.h:485</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a21b0c40d16eced109bd3196186170bc6"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">mlx::steel::BlockMMA::Ctile</a></div><div class="ttdeci">MMATile&lt; AccumType, TM, TN, MMAFrag_acc_t &gt; Ctile</div><div class="ttdef"><b>Definition</b> mma.h:479</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a257287702dc849d0d8a078fced453142"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a257287702dc849d0d8a078fced453142">mlx::steel::BlockMMA::A_str_k</a></div><div class="ttdeci">STEEL_CONST short A_str_k</div><div class="ttdef"><b>Definition</b> mma.h:466</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a44fca27c821764317263047a780977b0"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">mlx::steel::BlockMMA::Btile</a></div><div class="ttdeci">MMATile&lt; AccumType, 1, TN, MMAFrag_acc_t &gt; Btile</div><div class="ttdef"><b>Definition</b> mma.h:478</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a47e614120c650f7479db79f23a0df586"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">mlx::steel::BlockMMA::Atile</a></div><div class="ttdeci">MMATile&lt; AccumType, TM, 1, MMAFrag_acc_t &gt; Atile</div><div class="ttdef"><b>Definition</b> mma.h:477</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a49538190209e522ddbef45fe95563d17"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">mlx::steel::BlockMMA::B_str_n</a></div><div class="ttdeci">STEEL_CONST short B_str_n</div><div class="ttdef"><b>Definition</b> mma.h:470</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a5b0029866f493363942133b55bff7307"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">mlx::steel::BlockMMA::TM_stride</a></div><div class="ttdeci">STEEL_CONST short TM_stride</div><div class="ttdef"><b>Definition</b> mma.h:455</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a6a2c2a6d5e767d52c41b42a9d36086b0"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0">mlx::steel::BlockMMA::mma</a></div><div class="ttdeci">METAL_FUNC void mma(const threadgroup T *As, const threadgroup T *Bs)</div><div class="ttdef"><b>Definition</b> mma.h:437</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a706ae779c1f8d2eb18f19c248567d424"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">mlx::steel::BlockMMA::TN</a></div><div class="ttdeci">STEEL_CONST short TN</div><div class="ttdef"><b>Definition</b> mma.h:390</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a706ae779c1f8d2eb18f19c248567d424"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">mlx::steel::BlockMMA::TN</a></div><div class="ttdeci">STEEL_CONST short TN</div><div class="ttdef"><b>Definition</b> mma.h:462</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a7b324c992750ed3aaa4c485f15b2f391"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391">mlx::steel::BlockMMA::store_result_safe</a></div><div class="ttdeci">METAL_FUNC void store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const</div><div class="ttdef"><b>Definition</b> mma.h:611</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a7cf757e9785e23997b1417e024559ed3"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3">mlx::steel::BlockMMA::store_result</a></div><div class="ttdeci">METAL_FUNC void store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const</div><div class="ttdef"><b>Definition</b> mma.h:579</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a81838da5d81e62d372d581be599c5a88"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">mlx::steel::BlockMMA::Ctile</a></div><div class="ttdeci">MMATile&lt; AccumType, TM, TN, MMAFrag_acc_t &gt; Ctile</div><div class="ttdef"><b>Definition</b> mma.h:407</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a823c56cbd2086f10272df7284a5247ae"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae">mlx::steel::BlockMMA::apply_epilogue</a></div><div class="ttdeci">METAL_FUNC void apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)</div><div class="ttdef"><b>Definition</b> mma.h:507</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a8b3690b383afd26563efb38f9c375e50"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">mlx::steel::BlockMMA::TN_stride</a></div><div class="ttdeci">STEEL_CONST short TN_stride</div><div class="ttdef"><b>Definition</b> mma.h:385</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a8fddaa78913cdc8eea5e1cf7d2776330"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">mlx::steel::BlockMMA::tile_stride_a</a></div><div class="ttdeci">STEEL_CONST short tile_stride_a</div><div class="ttdef"><b>Definition</b> mma.h:401</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a92f6aeee432f53638447eac842f43eca"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">mlx::steel::BlockMMA::Bs_offset</a></div><div class="ttdeci">short Bs_offset</div><div class="ttdef"><b>Definition</b> mma.h:414</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a8b3690b383afd26563efb38f9c375e50"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">mlx::steel::BlockMMA::TN_stride</a></div><div class="ttdeci">STEEL_CONST short TN_stride</div><div class="ttdef"><b>Definition</b> mma.h:457</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a8fddaa78913cdc8eea5e1cf7d2776330"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">mlx::steel::BlockMMA::tile_stride_a</a></div><div class="ttdeci">STEEL_CONST short tile_stride_a</div><div class="ttdef"><b>Definition</b> mma.h:473</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a92f6aeee432f53638447eac842f43eca"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">mlx::steel::BlockMMA::Bs_offset</a></div><div class="ttdeci">short Bs_offset</div><div class="ttdef"><b>Definition</b> mma.h:486</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_a9e48f2d51099ec00171506724faab54a"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">mlx::steel::BlockMMA::apply_epilogue_safe</a></div><div class="ttdeci">METAL_FUNC void apply_epilogue_safe(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)</div><div class="ttdef"><b>Definition</b> mma.h:535</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa14406b7298456ac45d23dd3c4642dd8"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">mlx::steel::BlockMMA::BlockMMA</a></div><div class="ttdeci">METAL_FUNC BlockMMA(ushort simd_group_id, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> mma.h:417</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa71400922babd388177f228c2c82b211"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">mlx::steel::BlockMMA::B_str_k</a></div><div class="ttdeci">STEEL_CONST short B_str_k</div><div class="ttdef"><b>Definition</b> mma.h:397</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa85451edf6900fd6af164d4d50889ae3"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">mlx::steel::BlockMMA::sm</a></div><div class="ttdeci">short sm</div><div class="ttdef"><b>Definition</b> mma.h:410</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ab9c7f5386594497f5f4df7e59670b877"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">mlx::steel::BlockMMA::A_str_m</a></div><div class="ttdeci">STEEL_CONST short A_str_m</div><div class="ttdef"><b>Definition</b> mma.h:393</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aba5f749fdf32d8bd9d9e29f2a9ae4591"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">mlx::steel::BlockMMA::TM</a></div><div class="ttdeci">STEEL_CONST short TM</div><div class="ttdef"><b>Definition</b> mma.h:388</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ade420e8b811d597345783c324c23a34a"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">mlx::steel::BlockMMA::sn</a></div><div class="ttdeci">short sn</div><div class="ttdef"><b>Definition</b> mma.h:411</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ae3f35453b3afbaac9df64ad5966b34a4"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">mlx::steel::BlockMMA::tile_stride_b</a></div><div class="ttdeci">STEEL_CONST short tile_stride_b</div><div class="ttdef"><b>Definition</b> mma.h:402</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aee8caec45c1f9e4428586effbfe6137d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">mlx::steel::BlockMMA::kFragSize</a></div><div class="ttdeci">STEEL_CONST short kFragSize</div><div class="ttdef"><b>Definition</b> mma.h:379</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_af1a138c5e118147dc46475e4a5557e7c"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">mlx::steel::BlockMMA::Atile</a></div><div class="ttdeci">MMATile&lt; AccumType, TM, 1, MMAFrag_acc_t &gt; Atile</div><div class="ttdef"><b>Definition</b> mma.h:405</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa71400922babd388177f228c2c82b211"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">mlx::steel::BlockMMA::B_str_k</a></div><div class="ttdeci">STEEL_CONST short B_str_k</div><div class="ttdef"><b>Definition</b> mma.h:469</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aa85451edf6900fd6af164d4d50889ae3"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">mlx::steel::BlockMMA::sm</a></div><div class="ttdeci">short sm</div><div class="ttdef"><b>Definition</b> mma.h:482</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ab9c7f5386594497f5f4df7e59670b877"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">mlx::steel::BlockMMA::A_str_m</a></div><div class="ttdeci">STEEL_CONST short A_str_m</div><div class="ttdef"><b>Definition</b> mma.h:465</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aba5f749fdf32d8bd9d9e29f2a9ae4591"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">mlx::steel::BlockMMA::TM</a></div><div class="ttdeci">STEEL_CONST short TM</div><div class="ttdef"><b>Definition</b> mma.h:460</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ade420e8b811d597345783c324c23a34a"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">mlx::steel::BlockMMA::sn</a></div><div class="ttdeci">short sn</div><div class="ttdef"><b>Definition</b> mma.h:483</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_ae3f35453b3afbaac9df64ad5966b34a4"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">mlx::steel::BlockMMA::tile_stride_b</a></div><div class="ttdeci">STEEL_CONST short tile_stride_b</div><div class="ttdef"><b>Definition</b> mma.h:474</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_aee8caec45c1f9e4428586effbfe6137d"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">mlx::steel::BlockMMA::kFragSize</a></div><div class="ttdeci">STEEL_CONST short kFragSize</div><div class="ttdef"><b>Definition</b> mma.h:451</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html_af653c0808ba4fa9a25286f1febb7baff"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">mlx::steel::BlockMMA::apply_epilogue</a></div><div class="ttdeci">METAL_FUNC void apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)</div><div class="ttdef"><b>Definition</b> mma.h:497</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a></div><div class="ttdef"><b>Definition</b> mma.h:178</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1a6b1446e8c8da46885bbaa8e8fdc7e4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">mlx::steel::MMATile::frag_at</a></div><div class="ttdeci">METAL_FUNC constexpr thread frag_type &amp; frag_at(const short i, const short j)</div><div class="ttdef"><b>Definition</b> mma.h:208</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1d126b14910385ab644e224ac1d0307a"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">mlx::steel::MMATile::kTileRows</a></div><div class="ttdeci">STEEL_CONST int kTileRows</div><div class="ttdef"><b>Definition</b> mma.h:185</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a28306efc1a903b80219c8bb16dc5b190"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">mlx::steel::MMATile::mat_type</a></div><div class="ttdeci">MMAFrag_t::mat_type mat_type</div><div class="ttdef"><b>Definition</b> mma.h:194</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1d126b14910385ab644e224ac1d0307a"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">mlx::steel::MMATile::kTileRows</a></div><div class="ttdeci">STEEL_CONST int kTileRows</div><div class="ttdef"><b>Definition</b> mma.h:230</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1eeb197c9bdf4db42892a39cdb9bd73a"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mlx::steel::MMATile::mat_type</a></div><div class="ttdeci">MMAFrag_t::mat_type mat_type</div><div class="ttdef"><b>Definition</b> mma.h:242</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a2aadaa3239cb3af0c2ee8af9b88c8a98"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98">mlx::steel::MMATile::store</a></div><div class="ttdeci">METAL_FUNC void store(threadgroup U *dst) const</div><div class="ttdef"><b>Definition</b> mma.h:253</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a323a4f38cd0693bf333832bb4258b28e"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mlx::steel::MMATile::mat_at</a></div><div class="ttdeci">METAL_FUNC mat_type mat_at(const short i, const short j)</div><div class="ttdef"><b>Definition</b> mma.h:218</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a46324d40f8ad61cade08a1ebad6d9ad4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">mlx::steel::MMATile::kTileCols</a></div><div class="ttdeci">STEEL_CONST int kTileCols</div><div class="ttdef"><b>Definition</b> mma.h:186</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a46324d40f8ad61cade08a1ebad6d9ad4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">mlx::steel::MMATile::kTileCols</a></div><div class="ttdeci">STEEL_CONST int kTileCols</div><div class="ttdef"><b>Definition</b> mma.h:231</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a57703f522c7409dbe2c0a68bb7acc2ba"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba">mlx::steel::MMATile::store_safe</a></div><div class="ttdeci">METAL_FUNC void store_safe(device U *dst, const int ld, const short2 dst_tile_dims) const</div><div class="ttdef"><b>Definition</b> mma.h:321</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a594142f957ffb99296a243f7af7b59e7"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">mlx::steel::MMATile::kFragRows</a></div><div class="ttdeci">STEEL_CONST int kFragRows</div><div class="ttdef"><b>Definition</b> mma.h:181</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a5976565323f2e30479158c14f4b1bfef"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">mlx::steel::MMATile::frag_type</a></div><div class="ttdeci">MMAFrag_t::frag_type frag_type</div><div class="ttdef"><b>Definition</b> mma.h:195</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a60ea6b8ff2923b7fe6f598e74ac54323"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">mlx::steel::MMATile::kRows</a></div><div class="ttdeci">STEEL_CONST int kRows</div><div class="ttdef"><b>Definition</b> mma.h:188</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a594142f957ffb99296a243f7af7b59e7"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">mlx::steel::MMATile::kFragRows</a></div><div class="ttdeci">STEEL_CONST int kFragRows</div><div class="ttdef"><b>Definition</b> mma.h:226</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a60ea6b8ff2923b7fe6f598e74ac54323"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">mlx::steel::MMATile::kRows</a></div><div class="ttdeci">STEEL_CONST int kRows</div><div class="ttdef"><b>Definition</b> mma.h:233</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a684e6c6d9f00f583994285b60aaa3b62"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">mlx::steel::MMATile::val_frags</a></div><div class="ttdeci">frag_type val_frags[kNumFrags]</div><div class="ttdef"><b>Definition</b> mma.h:245</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a6dadcd666afb3759a11094e754560dd4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">mlx::steel::MMATile::MMAFrag_t</a></div><div class="ttdeci">MMAFrag_ MMAFrag_t</div><div class="ttdef"><b>Definition</b> mma.h:224</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a752f708e4fe5ef37fdd902dae153179f"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f">mlx::steel::MMATile::store</a></div><div class="ttdeci">METAL_FUNC void store(device U *dst, const int ld) const</div><div class="ttdef"><b>Definition</b> mma.h:285</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a80078f0dfa4c225e79d9b460202d5e2c"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">mlx::steel::MMATile::elem_type</a></div><div class="ttdeci">T elem_type</div><div class="ttdef"><b>Definition</b> mma.h:180</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a824409bc107330805853f932e80a7628"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">mlx::steel::MMATile::elem_type</a></div><div class="ttdeci">T elem_type</div><div class="ttdef"><b>Definition</b> mma.h:225</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a865ece5ad0b9a56937b6d77a18b5a1dc"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">mlx::steel::MMATile::elems</a></div><div class="ttdeci">METAL_FUNC thread elem_type * elems()</div><div class="ttdef"><b>Definition</b> mma.h:227</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a948784652e93830887ee8ad506ec3257"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">mlx::steel::MMATile::kCols</a></div><div class="ttdeci">STEEL_CONST int kCols</div><div class="ttdef"><b>Definition</b> mma.h:189</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a98357339ec98f804a1b12597937b318f"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">mlx::steel::MMATile::kElemsPerTile</a></div><div class="ttdeci">STEEL_CONST int kElemsPerTile</div><div class="ttdef"><b>Definition</b> mma.h:192</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a948784652e93830887ee8ad506ec3257"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">mlx::steel::MMATile::kCols</a></div><div class="ttdeci">STEEL_CONST int kCols</div><div class="ttdef"><b>Definition</b> mma.h:234</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a98357339ec98f804a1b12597937b318f"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">mlx::steel::MMATile::kElemsPerTile</a></div><div class="ttdeci">STEEL_CONST int kElemsPerTile</div><div class="ttdef"><b>Definition</b> mma.h:237</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa3a4af67813908109da08ce7352f82da"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">mlx::steel::MMATile::load_safe</a></div><div class="ttdeci">METAL_FUNC void load_safe(const device U *src, const int ld, const short2 src_tile_dims)</div><div class="ttdef"><b>Definition</b> mma.h:301</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa3fb310dd08ec23c334511f7b316d1b6"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">mlx::steel::MMATile::MMATile</a></div><div class="ttdeci">METAL_FUNC MMATile() thread</div><div class="ttdef"><b>Definition</b> mma.h:199</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa5426c6beabfb3ee41b58f01b3392a96"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">mlx::steel::MMATile::load</a></div><div class="ttdeci">METAL_FUNC void load(const threadgroup U *src)</div><div class="ttdef"><b>Definition</b> mma.h:236</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa97a98e423827a889c13a92217626ec7"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">mlx::steel::MMATile::clear</a></div><div class="ttdeci">METAL_FUNC constexpr void clear()</div><div class="ttdef"><b>Definition</b> mma.h:201</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa9e484d8cae936503898d5b772c573f9"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9">mlx::steel::MMATile::load</a></div><div class="ttdeci">METAL_FUNC void load(const device U *src, const int ld)</div><div class="ttdef"><b>Definition</b> mma.h:270</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_abe33de70e34300745bad9aa822fd0382"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382">mlx::steel::MMATile::MMAFrag_t</a></div><div class="ttdeci">MMAFrag_ MMAFrag_t</div><div class="ttdef"><b>Definition</b> mma.h:179</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ac4fb73ebc4e7b47a44b8bd6cadda5d44"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">mlx::steel::MMATile::val_frags</a></div><div class="ttdeci">frag_type val_frags[kNumFrags]</div><div class="ttdef"><b>Definition</b> mma.h:197</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ad095371db98e7c335ec41ca77c10f906"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">mlx::steel::MMATile::kFragCols</a></div><div class="ttdeci">STEEL_CONST int kFragCols</div><div class="ttdef"><b>Definition</b> mma.h:182</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aac25cd0a9bdf24aa2af809c95f0bd171"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">mlx::steel::MMATile::frag_type</a></div><div class="ttdeci">MMAFrag_t::frag_type frag_type</div><div class="ttdef"><b>Definition</b> mma.h:243</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ad095371db98e7c335ec41ca77c10f906"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">mlx::steel::MMATile::kFragCols</a></div><div class="ttdeci">STEEL_CONST int kFragCols</div><div class="ttdef"><b>Definition</b> mma.h:227</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ad476e1d9a12178fb35c207312339e485"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">mlx::steel::MMATile::frag_at</a></div><div class="ttdeci">METAL_FUNC constexpr const thread frag_type &amp; frag_at(const short i, const short j) const</div><div class="ttdef"><b>Definition</b> mma.h:212</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ae21bb7cce701290de84c6015e064d8a1"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">mlx::steel::MMATile::elems</a></div><div class="ttdeci">METAL_FUNC const thread elem_type * elems() const</div><div class="ttdef"><b>Definition</b> mma.h:231</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ae326e7693eb77c22d5a6e3e9219019d3"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">mlx::steel::MMATile::kNumFrags</a></div><div class="ttdeci">STEEL_CONST int kNumFrags</div><div class="ttdef"><b>Definition</b> mma.h:191</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aef0ea2387e1ff5767bff8563b2d36bd6"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">mlx::steel::MMATile::kElemsPerFrag</a></div><div class="ttdeci">STEEL_CONST int kElemsPerFrag</div><div class="ttdef"><b>Definition</b> mma.h:183</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_ae326e7693eb77c22d5a6e3e9219019d3"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">mlx::steel::MMATile::kNumFrags</a></div><div class="ttdeci">STEEL_CONST int kNumFrags</div><div class="ttdef"><b>Definition</b> mma.h:236</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aef0ea2387e1ff5767bff8563b2d36bd6"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">mlx::steel::MMATile::kElemsPerFrag</a></div><div class="ttdeci">STEEL_CONST int kElemsPerFrag</div><div class="ttdef"><b>Definition</b> mma.h:228</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1integral__constant_html"><div class="ttname"><a href="structmlx_1_1steel_1_1integral__constant.html">mlx::steel::integral_constant</a></div><div class="ttdef"><b>Definition</b> integral_constant.h:18</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/gemm_8h.html b/docs/build/html/gemm_8h.html
index 52a1f5549..ee97c404b 100644
--- a/docs/build/html/gemm_8h.html
+++ b/docs/build/html/gemm_8h.html
@@ -95,7 +95,7 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 <div class="textblock"><code>#include &quot;<a class="el" href="gemm_2loader_8h_source.html">mlx/backend/metal/kernels/steel/gemm/loader.h</a>&quot;</code><br />
-<code>#include &quot;<a class="el" href="mma_8h_source.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</code><br />
+<code>#include &quot;<a class="el" href="gemm_2mma_8h_source.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="gemm_2params_8h_source.html">mlx/backend/metal/kernels/steel/gemm/params.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">mlx/backend/metal/kernels/steel/gemm/transforms.h</a>&quot;</code><br />
 <code>#include &quot;<a class="el" href="backend_2metal_2kernels_2steel_2utils_8h_source.html">mlx/backend/metal/kernels/steel/utils.h</a>&quot;</code><br />
diff --git a/docs/build/html/gemm_8h_source.html b/docs/build/html/gemm_8h_source.html
index 85e04f0a2..4a4c90542 100644
--- a/docs/build/html/gemm_8h_source.html
+++ b/docs/build/html/gemm_8h_source.html
@@ -96,7 +96,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="preprocessor">#pragma once</span></div>
 <div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
 <div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#include &quot;<a class="code" href="gemm_2loader_8h.html">mlx/backend/metal/kernels/steel/gemm/loader.h</a>&quot;</span></div>
-<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="preprocessor">#include &quot;<a class="code" href="mma_8h.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</span></div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="preprocessor">#include &quot;<a class="code" href="gemm_2mma_8h.html">mlx/backend/metal/kernels/steel/gemm/mma.h</a>&quot;</span></div>
 <div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="preprocessor">#include &quot;<a class="code" href="gemm_2params_8h.html">mlx/backend/metal/kernels/steel/gemm/params.h</a>&quot;</span></div>
 <div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html">mlx/backend/metal/kernels/steel/gemm/transforms.h</a>&quot;</span></div>
 <div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span><span class="preprocessor">#include &quot;<a class="code" href="backend_2metal_2kernels_2steel_2utils_8h.html">mlx/backend/metal/kernels/steel/utils.h</a>&quot;</span></div>
@@ -123,35 +123,35 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    <span class="keywordtype">bool</span> transpose_b,</div>
 <div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>    <span class="keywordtype">bool</span> MN_aligned,</div>
 <div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    <span class="keywordtype">bool</span> K_aligned,</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>    <span class="keyword">typename</span> AccumType = <span class="keyword">typename</span> <a class="code hl_typedef" href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">AccumHelper&lt;T&gt;::accum_type</a>,</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>    <span class="keyword">typename</span> AccumType = <span class="keyword">typename</span> <a class="code hl_typedef" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">AccumHelper&lt;T&gt;::accum_type</a>,</div>
 <div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>    <span class="keyword">typename</span> Epilogue = <a class="code hl_struct" href="structmlx_1_1steel_1_1_transform_none.html">TransformNone&lt;U, AccumType&gt;</a>&gt;</div>
 <div class="foldopen" id="foldopen00037" data-start="{" data-end="};">
 <div class="line"><a id="l00037" name="l00037"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">   37</a></span><span class="keyword">struct </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">GEMMKernel</a> {</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">   38</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> = 16 / <span class="keyword">sizeof</span>(T);</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">   39</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> = 16 / <span class="keyword">sizeof</span>(T);</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">   40</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">tgp_mem_size_a</a> =</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> = 16 / <span class="keyword">sizeof</span>(T);</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> = 16 / <span class="keyword">sizeof</span>(T);</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">tgp_mem_size_a</a> =</div>
 <div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>      transpose_a ? BK * (BM + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>) : BM * (BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>);</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">   42</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">tgp_mem_size_b</a> =</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">tgp_mem_size_b</a> =</div>
 <div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>      transpose_b ? BN * (BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>) : BK * (BN + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>);</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">   44</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">tgp_mem_size</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">tgp_mem_size_a</a> + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">tgp_mem_size_b</a>;</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">tgp_mem_size</a> = <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">tgp_mem_size_a</a> + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">tgp_mem_size_b</a>;</div>
 <div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">   46</a></span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a> = WM * WN * 32;</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  <a class="code hl_define" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a> = WM * WN * 32;</div>
 <div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span> </div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa8a04ed74d2259f99b337d4662c64d83">   48</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_a_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt;</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_a_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt;</div>
 <div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>      T,</div>
 <div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>      transpose_a ? BK : BM,</div>
 <div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>      transpose_a ? BM : BK,</div>
 <div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>      transpose_a ? BM + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> : BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>,</div>
 <div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>      !transpose_a,</div>
 <div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a>&gt;;</div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa98f32278b5fd98c93ae5483c3596395">   55</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_b_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt;</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">loader_b_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt;</div>
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>      T,</div>
 <div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>      transpose_b ? BN : BK,</div>
 <div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>      transpose_b ? BK : BN,</div>
 <div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>      transpose_b ? BK + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> : BN + <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>,</div>
 <div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>      transpose_b,</div>
 <div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>      <a class="code hl_variable" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a>&gt;;</div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno"><a class="line" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#add8c6a31011a4895667c2a94a5af3782">   62</a></span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">mma_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a>&lt;</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>  <span class="keyword">using </span><a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">mma_t</a> = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a>&lt;</div>
 <div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>      T,</div>
 <div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>      U,</div>
 <div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>      BM,</div>
@@ -391,24 +391,24 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2gemm_2transforms_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html">transforms.h</a></div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html">utils.h</a></div></div>
 <div class="ttc" id="agemm_2loader_8h_html"><div class="ttname"><a href="gemm_2loader_8h.html">loader.h</a></div></div>
+<div class="ttc" id="agemm_2mma_8h_html"><div class="ttname"><a href="gemm_2mma_8h.html">mma.h</a></div></div>
 <div class="ttc" id="agemm_2params_8h_html"><div class="ttname"><a href="gemm_2params_8h.html">params.h</a></div></div>
-<div class="ttc" id="amma_8h_html"><div class="ttname"><a href="mma_8h.html">mma.h</a></div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
 <div class="ttc" id="anamespacemlx_html"><div class="ttname"><a href="namespacemlx.html">mlx</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="asteel_2defines_8h_html_a90b91c866313ffa46eff6d9cc944ad2b"><div class="ttname"><a href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a></div><div class="ttdeci">#define STEEL_CONST</div><div class="ttdef"><b>Definition</b> defines.h:3</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_accum_helper_html_ab594958b88746f759aa7ca573f1903da"><div class="ttname"><a href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">mlx::steel::AccumHelper::accum_type</a></div><div class="ttdeci">float accum_type</div><div class="ttdef"><b>Definition</b> transforms.h:57</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_accum_helper_html_ae52abf69e7ba6af1a73d65d57182ed26"><div class="ttname"><a href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">mlx::steel::AccumHelper::accum_type</a></div><div class="ttdeci">float accum_type</div><div class="ttdef"><b>Definition</b> transforms.h:57</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a></div><div class="ttdef"><b>Definition</b> loader.h:25</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a></div><div class="ttdef"><b>Definition</b> mma.h:377</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a></div><div class="ttdef"><b>Definition</b> gemm.h:37</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a00e55d4a161758350ed7310817d2d2a5"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">mlx::steel::GEMMKernel::run</a></div><div class="ttdeci">static METAL_FUNC void run(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</div><div class="ttdef"><b>Definition</b> gemm.h:140</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a105af1069668028c6f1bc6d6dd162298"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">mlx::steel::GEMMKernel::tgp_mem_size_b</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size_b</div><div class="ttdef"><b>Definition</b> gemm.h:42</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a1ec583584e69dcbbb72106390a4fc5da"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">mlx::steel::GEMMKernel::tgp_mem_size</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size</div><div class="ttdef"><b>Definition</b> gemm.h:44</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a105af1069668028c6f1bc6d6dd162298"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a105af1069668028c6f1bc6d6dd162298">mlx::steel::GEMMKernel::tgp_mem_size_b</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size_b</div><div class="ttdef"><b>Definition</b> attn.h:43</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a1ec583584e69dcbbb72106390a4fc5da"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">mlx::steel::GEMMKernel::tgp_mem_size</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size</div><div class="ttdef"><b>Definition</b> attn.h:45</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a756d7bbcc96e2919cd65eec4bc135780"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">mlx::steel::GEMMKernel::gemm_loop</a></div><div class="ttdeci">static METAL_FUNC void gemm_loop(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})</div><div class="ttdef"><b>Definition</b> gemm.h:79</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a9058ddb73e30e83fb9c548ba22817d64"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">mlx::steel::GEMMKernel::tgp_size</a></div><div class="ttdeci">STEEL_CONST short tgp_size</div><div class="ttdef"><b>Definition</b> gemm.h:46</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ac00b149d76a903c2f91b0f477dc5037f"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">mlx::steel::GEMMKernel::tgp_mem_size_a</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size_a</div><div class="ttdef"><b>Definition</b> gemm.h:40</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad1b03941e869017558423c08b08bc094"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">mlx::steel::GEMMKernel::tgp_padding_b</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_b</div><div class="ttdef"><b>Definition</b> gemm.h:39</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad547704ccbff6c2076abeffa6628c5a0"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">mlx::steel::GEMMKernel::tgp_padding_a</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_a</div><div class="ttdef"><b>Definition</b> gemm.h:38</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a9058ddb73e30e83fb9c548ba22817d64"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64">mlx::steel::GEMMKernel::tgp_size</a></div><div class="ttdeci">STEEL_CONST short tgp_size</div><div class="ttdef"><b>Definition</b> attn.h:47</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ac00b149d76a903c2f91b0f477dc5037f"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">mlx::steel::GEMMKernel::tgp_mem_size_a</a></div><div class="ttdeci">STEEL_CONST short tgp_mem_size_a</div><div class="ttdef"><b>Definition</b> attn.h:41</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad1b03941e869017558423c08b08bc094"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094">mlx::steel::GEMMKernel::tgp_padding_b</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_b</div><div class="ttdef"><b>Definition</b> attn.h:40</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad547704ccbff6c2076abeffa6628c5a0"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad547704ccbff6c2076abeffa6628c5a0">mlx::steel::GEMMKernel::tgp_padding_a</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_a</div><div class="ttdef"><b>Definition</b> attn.h:39</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_params_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_g_e_m_m_params.html">mlx::steel::GEMMParams</a></div><div class="ttdef"><b>Definition</b> params.h:12</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_loop_alignment_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_loop_alignment.html">mlx::steel::LoopAlignment</a></div><div class="ttdef"><b>Definition</b> gemm.h:21</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_transform_none_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone</a></div><div class="ttdef"><b>Definition</b> transforms.h:15</div></div>
diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html
index dd32a9126..181d50e66 100644
--- a/docs/build/html/genindex.html
+++ b/docs/build/html/genindex.html
@@ -7,7 +7,7 @@
   <head>
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <title>Index &#8212; MLX 0.20.0 documentation</title>
+    <title>Index &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -38,7 +38,7 @@
   <link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="_static/documentation_options.js?v=174dfe6e"></script>
     <script src="_static/doctools.js?v=9a2dae69"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -48,7 +48,7 @@
     <link rel="search" title="Search" href="search.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -129,8 +129,8 @@
       
     
     
-    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -443,7 +443,6 @@
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -520,6 +519,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -549,6 +549,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -885,8 +886,6 @@
       <li><a href="cpp/ops.html#_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice">addmm (C++ function)</a>
 </li>
       <li><a href="python/_autosummary/mlx.core.addmm.html#mlx.core.addmm">addmm() (in module mlx.core)</a>
-</li>
-      <li><a href="python/_autosummary/mlx.core.fast.affine_quantize.html#mlx.core.fast.affine_quantize">affine_quantize() (in module mlx.core.fast)</a>
 </li>
       <li><a href="python/nn/_autosummary/mlx.nn.ALiBi.html#mlx.nn.ALiBi">ALiBi (class in mlx.nn)</a>
 </li>
@@ -934,10 +933,10 @@
 </li>
       <li><a href="cpp/ops.html#_CPPv46arcsinRK5array14StreamOrDevice">arcsin (C++ function)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python/_autosummary/mlx.core.arcsin.html#mlx.core.arcsin">arcsin() (in module mlx.core)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="cpp/ops.html#_CPPv47arcsinhRK5array14StreamOrDevice">arcsinh (C++ function)</a>
 </li>
       <li><a href="python/_autosummary/mlx.core.arcsinh.html#mlx.core.arcsinh">arcsinh() (in module mlx.core)</a>
@@ -1009,6 +1008,8 @@
       <li><a href="python/nn/_autosummary/mlx.nn.AvgPool1d.html#mlx.nn.AvgPool1d">AvgPool1d (class in mlx.nn)</a>
 </li>
       <li><a href="python/nn/_autosummary/mlx.nn.AvgPool2d.html#mlx.nn.AvgPool2d">AvgPool2d (class in mlx.nn)</a>
+</li>
+      <li><a href="python/nn/_autosummary/mlx.nn.AvgPool3d.html#mlx.nn.AvgPool3d">AvgPool3d (class in mlx.nn)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -1093,6 +1094,8 @@
       <li><a href="python/_autosummary/mlx.core.conjugate.html#mlx.core.conjugate">conjugate() (in module mlx.core)</a>
 </li>
       <li><a href="python/nn/_autosummary/mlx.nn.init.constant.html#mlx.nn.init.constant">constant() (in module mlx.nn.init)</a>
+</li>
+      <li><a href="cpp/ops.html#_CPPv410contiguousRK5arrayb14StreamOrDevice">contiguous (C++ function)</a>
 </li>
       <li><a href="cpp/ops.html#_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice">conv1d (C++ function)</a>
 </li>
@@ -1115,11 +1118,11 @@
       <li><a href="cpp/ops.html#_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice">conv_general (C++ function)</a>, <a href="cpp/ops.html#_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice">[1]</a>
 </li>
       <li><a href="python/_autosummary/mlx.core.conv_general.html#mlx.core.conv_general">conv_general() (in module mlx.core)</a>
-</li>
-      <li><a href="cpp/ops.html#_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice">conv_transpose1d (C++ function)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="cpp/ops.html#_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice">conv_transpose1d (C++ function)</a>
+</li>
       <li><a href="python/_autosummary/mlx.core.conv_transpose1d.html#mlx.core.conv_transpose1d">conv_transpose1d() (in module mlx.core)</a>
 </li>
       <li><a href="cpp/ops.html#_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice">conv_transpose2d (C++ function)</a>
@@ -1701,6 +1704,8 @@
       <li><a href="python/nn/_autosummary/mlx.nn.MaxPool1d.html#mlx.nn.MaxPool1d">MaxPool1d (class in mlx.nn)</a>
 </li>
       <li><a href="python/nn/_autosummary/mlx.nn.MaxPool2d.html#mlx.nn.MaxPool2d">MaxPool2d (class in mlx.nn)</a>
+</li>
+      <li><a href="python/nn/_autosummary/mlx.nn.MaxPool3d.html#mlx.nn.MaxPool3d">MaxPool3d (class in mlx.nn)</a>
 </li>
       <li><a href="cpp/ops.html#_CPPv44meanRK5array14StreamOrDevice">mean (C++ function)</a>, <a href="cpp/ops.html#_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice">[1]</a>, <a href="cpp/ops.html#_CPPv44meanRK5arrayb14StreamOrDevice">[2]</a>, <a href="cpp/ops.html#_CPPv44meanRK5arrayib14StreamOrDevice">[3]</a>
 </li>
diff --git a/docs/build/html/globals_a.html b/docs/build/html/globals_a.html
index eafed81ce..5955bbaa0 100644
--- a/docs/build/html/globals_a.html
+++ b/docs/build/html/globals_a.html
@@ -91,13 +91,14 @@ $(function(){ initResizable(false); });
 <li>adjust_matrix_offsets()&#160;:&#160;<a class="el" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">quantized.h</a></li>
 <li>affine_dequantize()&#160;:&#160;<a class="el" href="quantized_8h.html#a6076203615038eb06816158f7b3869c6">quantized.h</a></li>
 <li>affine_quantize()&#160;:&#160;<a class="el" href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59">quantized.h</a></li>
-<li>affine_quantize_scales_biases()&#160;:&#160;<a class="el" href="quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c">quantized.h</a></li>
-<li>align_K&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">steel_gemm_fused.h</a></li>
+<li>align_K&#160;:&#160;<a class="el" href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">steel_attention.h</a>, <a class="el" href="steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">steel_gemm_fused.h</a></li>
 <li>align_M&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#a55af226dc74b0026b7d4b865142a6d21">steel_gemm_fused.h</a></li>
 <li>align_N&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#aa3b267252df2dcbfdde8c5f174d27036">steel_gemm_fused.h</a></li>
-<li>all_reduce()&#160;:&#160;<a class="el" href="reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d">reduce_all.h</a></li>
+<li>align_Q&#160;:&#160;<a class="el" href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982">steel_attention.h</a></li>
+<li>all_reduce()&#160;:&#160;<a class="el" href="reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8">reduce_all.h</a></li>
 <li>arange()&#160;:&#160;<a class="el" href="metal_2kernels_2arange_8h.html#a1e5126ee6ae0164c2343230c4d87c03e">arange.h</a></li>
 <li>arange_kernels&#160;:&#160;<a class="el" href="metal_2jit_2arange_8h.html#a2f49fb7bdc0a90230077fe2023e6e5c0">arange.h</a></li>
+<li>attention()&#160;:&#160;<a class="el" href="steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33">steel_attention.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/globals_b.html b/docs/build/html/globals_b.html
index ad776c65f..533ac9a6a 100644
--- a/docs/build/html/globals_b.html
+++ b/docs/build/html/globals_b.html
@@ -87,22 +87,22 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all file members with links to the files they belong to:</div>
 
 <h3><a id="index_b" name="index_b"></a>- b -</h3><ul>
-<li>bfloat16_t&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bf16.h</a></li>
-<li>bfloat16_to_uint16&#160;:&#160;<a class="el" href="bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7">bf16_math.h</a></li>
-<li>bfloat_binop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bf16.h</a></li>
-<li>bfloat_binop_base&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70">bf16.h</a></li>
-<li>bfloat_binop_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594">bf16.h</a></li>
+<li>bfloat16_t&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bf16.h</a></li>
+<li>bfloat16_to_uint16()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bf16.h</a></li>
+<li>bfloat_binop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bf16.h</a></li>
+<li>bfloat_binop_base&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70">bf16.h</a></li>
+<li>bfloat_binop_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594">bf16.h</a></li>
 <li>bfloat_bitop&#160;:&#160;<a class="el" href="types_2bf16_8h.html#aac9ba86d4bf05bcda1936494f9b9b4d3">bf16.h</a></li>
-<li>bfloat_bits_to_float()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bf16.h</a></li>
-<li>bfloat_compop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bf16.h</a></li>
+<li>bfloat_bits_to_float()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bf16.h</a></li>
+<li>bfloat_compop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bf16.h</a></li>
 <li>bfloat_inplace_bitop&#160;:&#160;<a class="el" href="types_2bf16_8h.html#af13b46bc58e6e6f675ae47aabec37711">bf16.h</a></li>
-<li>bfloat_inplace_op&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bf16.h</a></li>
-<li>bfloat_inplace_op_addr_space_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bf16.h</a></li>
-<li>bfloat_inplace_op_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d">bf16.h</a></li>
-<li>binary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8">binary_two.h</a></li>
+<li>bfloat_inplace_op&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bf16.h</a></li>
+<li>bfloat_inplace_op_addr_space_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bf16.h</a></li>
+<li>bfloat_inplace_op_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d">bf16.h</a></li>
+<li>binary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd">binary_two.h</a></li>
 <li>binary_g_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133">binary_two.h</a></li>
-<li>binary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347">binary_two.h</a></li>
-<li>binary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6">binary_two.h</a></li>
+<li>binary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95">binary_two.h</a></li>
+<li>binary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e">binary_two.h</a></li>
 <li>binary_ss()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#af8a791ac7ca88d32cd8f4e9ac0f9ab4f">binary_two.h</a></li>
 <li>binary_sv()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c">binary_two.h</a></li>
 <li>binary_sv2()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891">binary_two.h</a></li>
diff --git a/docs/build/html/globals_c.html b/docs/build/html/globals_c.html
index a9eed8c2d..f1f820f99 100644
--- a/docs/build/html/globals_c.html
+++ b/docs/build/html/globals_c.html
@@ -87,28 +87,28 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all file members with links to the files they belong to:</div>
 
 <h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
-<li>can_convert_from_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">bf16.h</a></li>
+<li>can_convert_from_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">bf16.h</a></li>
 <li>can_convert_from_complex64&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#ab149db78f6f19b8da6297dac4c36d893">complex.h</a></li>
-<li>can_convert_to_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">bf16.h</a></li>
+<li>can_convert_to_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">bf16.h</a></li>
 <li>can_convert_to_complex64&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a4f90ad54f4fae363e8d3cc41d539557b">complex.h</a></li>
 <li>ceildiv()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">utils.h</a></li>
-<li>col_reduce_2pass()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d">reduce_col.h</a></li>
-<li>col_reduce_longcolumn()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb">reduce_col.h</a></li>
-<li>col_reduce_looped()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385">reduce_col.h</a></li>
-<li>col_reduce_small()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5">reduce_col.h</a></li>
+<li>col_reduce_2pass()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29">reduce_col.h</a></li>
+<li>col_reduce_longcolumn()&#160;:&#160;<a class="el" href="reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2">reduce_col.h</a></li>
+<li>col_reduce_looped()&#160;:&#160;<a class="el" href="reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02">reduce_col.h</a></li>
+<li>col_reduce_small()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec">reduce_col.h</a></li>
 <li>complex_binop&#160;:&#160;<a class="el" href="types_2complex_8h.html#a9c7995d495359894e1b30c0f1678d6bd">complex.h</a></li>
 <li>complex_binop_helper&#160;:&#160;<a class="el" href="types_2complex_8h.html#ac6890f9852de12339b09b65757ebc8c4">complex.h</a></li>
 <li>complex_mul()&#160;:&#160;<a class="el" href="radix_8h.html#a5bfc53b531214c9ce277bebc18aa67d6">radix.h</a></li>
 <li>complex_mul_conj()&#160;:&#160;<a class="el" href="radix_8h.html#a0e2dfd3d1dda09f47ccc64eec35629f3">radix.h</a></li>
 <li>contiguous_scan()&#160;:&#160;<a class="el" href="scan_8h.html#a60d279b9add7d56639bb209408f09d79">scan.h</a></li>
-<li>copy_g()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36">copy.h</a></li>
+<li>copy_g()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf">copy.h</a></li>
 <li>copy_g_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77">copy.h</a></li>
-<li>copy_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c">copy.h</a></li>
-<li>copy_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff">copy.h</a></li>
-<li>copy_gg()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5">copy.h</a></li>
+<li>copy_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260">copy.h</a></li>
+<li>copy_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc">copy.h</a></li>
+<li>copy_gg()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa">copy.h</a></li>
 <li>copy_gg_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">copy.h</a></li>
-<li>copy_gg_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950">copy.h</a></li>
-<li>copy_gg_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd">copy.h</a></li>
+<li>copy_gg_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301">copy.h</a></li>
+<li>copy_gg_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13">copy.h</a></li>
 <li>copy_s()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea">copy.h</a></li>
 <li>copy_s2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3">copy.h</a></li>
 <li>copy_v()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659">copy.h</a></li>
diff --git a/docs/build/html/globals_defs.html b/docs/build/html/globals_defs.html
index d88838865..de3c40162 100644
--- a/docs/build/html/globals_defs.html
+++ b/docs/build/html/globals_defs.html
@@ -98,16 +98,15 @@ $(function(){ initResizable(false); });
 
 
 <h3><a id="index_b" name="index_b"></a>- b -</h3><ul>
-<li>bfloat16_to_uint16&#160;:&#160;<a class="el" href="bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7">bf16_math.h</a></li>
-<li>bfloat_binop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bf16.h</a></li>
-<li>bfloat_binop_base&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70">bf16.h</a></li>
-<li>bfloat_binop_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594">bf16.h</a></li>
+<li>bfloat_binop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bf16.h</a></li>
+<li>bfloat_binop_base&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70">bf16.h</a></li>
+<li>bfloat_binop_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594">bf16.h</a></li>
 <li>bfloat_bitop&#160;:&#160;<a class="el" href="types_2bf16_8h.html#aac9ba86d4bf05bcda1936494f9b9b4d3">bf16.h</a></li>
-<li>bfloat_compop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bf16.h</a></li>
+<li>bfloat_compop&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bf16.h</a></li>
 <li>bfloat_inplace_bitop&#160;:&#160;<a class="el" href="types_2bf16_8h.html#af13b46bc58e6e6f675ae47aabec37711">bf16.h</a></li>
-<li>bfloat_inplace_op&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bf16.h</a></li>
-<li>bfloat_inplace_op_addr_space_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bf16.h</a></li>
-<li>bfloat_inplace_op_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d">bf16.h</a></li>
+<li>bfloat_inplace_op&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bf16.h</a></li>
+<li>bfloat_inplace_op_addr_space_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8">bf16.h</a></li>
+<li>bfloat_inplace_op_helper&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d">bf16.h</a></li>
 </ul>
 
 
@@ -156,6 +155,13 @@ $(function(){ initResizable(false); });
 </ul>
 
 
+<h3><a id="index_j" name="index_j"></a>- j -</h3><ul>
+<li>jit_else&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3">bf16.h</a></li>
+<li>jit_endif&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56">bf16.h</a></li>
+<li>jit_if&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2">bf16.h</a></li>
+</ul>
+
+
 <h3><a id="index_m" name="index_m"></a>- m -</h3><ul>
 <li>MAX_OUTPUT_SIZE&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a28d683cf067736d76f867f30c066317e">fft.h</a></li>
 <li>MAX_RADIX&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a7b6e56afa21f022c5e754b000955735a">fft.h</a>, <a class="el" href="readwrite_8h.html#a7b6e56afa21f022c5e754b000955735a">readwrite.h</a></li>
@@ -199,11 +205,6 @@ $(function(){ initResizable(false); });
 <li>STEEL_CONST&#160;:&#160;<a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">defines.h</a></li>
 <li>STEEL_PRAGMA_UNROLL&#160;:&#160;<a class="el" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">defines.h</a></li>
 </ul>
-
-
-<h3><a id="index_u" name="index_u"></a>- u -</h3><ul>
-<li>uint16_to_bfloat16&#160;:&#160;<a class="el" href="bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0">bf16_math.h</a></li>
-</ul>
 </div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/globals_e.html b/docs/build/html/globals_e.html
index 6ab24d6b7..bc592b356 100644
--- a/docs/build/html/globals_e.html
+++ b/docs/build/html/globals_e.html
@@ -87,12 +87,12 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all file members with links to the files they belong to:</div>
 
 <h3><a id="index_e" name="index_e"></a>- e -</h3><ul>
-<li>elem_to_loc()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">utils.h</a></li>
-<li>elem_to_loc_1()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">utils.h</a></li>
-<li>elem_to_loc_2()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">utils.h</a></li>
-<li>elem_to_loc_2_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">utils.h</a></li>
-<li>elem_to_loc_3()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">utils.h</a></li>
-<li>elem_to_loc_3_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b">utils.h</a></li>
+<li>elem_to_loc()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">utils.h</a></li>
+<li>elem_to_loc_1()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">utils.h</a></li>
+<li>elem_to_loc_2()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">utils.h</a></li>
+<li>elem_to_loc_2_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">utils.h</a></li>
+<li>elem_to_loc_3()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">utils.h</a></li>
+<li>elem_to_loc_3_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733">utils.h</a></li>
 <li>elem_to_loc_broadcast()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">utils.h</a></li>
 <li>elems_per_thread_&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#ad395c11e6f2aee72cd1928fba93a35a3">fft.h</a></li>
 <li>erf()&#160;:&#160;<a class="el" href="erf_8h.html#a6ce199ee56105c67adbf8c48c019a8b2">erf.h</a></li>
diff --git a/docs/build/html/globals_f.html b/docs/build/html/globals_f.html
index d6346d1ea..d133729ce 100644
--- a/docs/build/html/globals_f.html
+++ b/docs/build/html/globals_f.html
@@ -90,7 +90,7 @@ $(function(){ initResizable(false); });
 <li>f&#160;:&#160;<a class="el" href="types_2bf16_8h.html#af900396d7b72ff2a7002e8befe8cf8f1">bf16.h</a>, <a class="el" href="fp16_8h.html#af900396d7b72ff2a7002e8befe8cf8f1">fp16.h</a></li>
 <li>fft()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a4010b0e151e5f01e610e9c32234458c7">fft.h</a></li>
 <li>float16_t&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">utils.h</a></li>
-<li>float_to_bfloat_bits()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">bf16.h</a></li>
+<li>float_to_bfloat_bits()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">bf16.h</a></li>
 <li>four_step_fft()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a6558a8205ee4c3e4767bafa93f7606de">fft.h</a></li>
 <li>fp16_bf16_binop_helper&#160;:&#160;<a class="el" href="half__types_8h.html#a1f0d5d395d403bde764fffe4846617f9">half_types.h</a></li>
 </ul>
diff --git a/docs/build/html/globals_func.html b/docs/build/html/globals_func.html
index 4ffeea2b4..e87820a18 100644
--- a/docs/build/html/globals_func.html
+++ b/docs/build/html/globals_func.html
@@ -90,9 +90,9 @@ $(function(){ initResizable(false); });
 <li>adjust_matrix_offsets()&#160;:&#160;<a class="el" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">quantized.h</a></li>
 <li>affine_dequantize()&#160;:&#160;<a class="el" href="quantized_8h.html#a6076203615038eb06816158f7b3869c6">quantized.h</a></li>
 <li>affine_quantize()&#160;:&#160;<a class="el" href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59">quantized.h</a></li>
-<li>affine_quantize_scales_biases()&#160;:&#160;<a class="el" href="quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c">quantized.h</a></li>
-<li>all_reduce()&#160;:&#160;<a class="el" href="reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d">reduce_all.h</a></li>
+<li>all_reduce()&#160;:&#160;<a class="el" href="reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8">reduce_all.h</a></li>
 <li>arange()&#160;:&#160;<a class="el" href="metal_2kernels_2arange_8h.html#a1e5126ee6ae0164c2343230c4d87c03e">arange.h</a></li>
+<li>attention()&#160;:&#160;<a class="el" href="steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33">steel_attention.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/globals_func_b.html b/docs/build/html/globals_func_b.html
index 7e5b6db70..a5629d088 100644
--- a/docs/build/html/globals_func_b.html
+++ b/docs/build/html/globals_func_b.html
@@ -87,11 +87,12 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the files they belong to:</div>
 
 <h3><a id="index_b" name="index_b"></a>- b -</h3><ul>
-<li>bfloat_bits_to_float()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bf16.h</a></li>
-<li>binary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8">binary_two.h</a></li>
+<li>bfloat16_to_uint16()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088">bf16.h</a></li>
+<li>bfloat_bits_to_float()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1">bf16.h</a></li>
+<li>binary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd">binary_two.h</a></li>
 <li>binary_g_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133">binary_two.h</a></li>
-<li>binary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347">binary_two.h</a></li>
-<li>binary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6">binary_two.h</a></li>
+<li>binary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95">binary_two.h</a></li>
+<li>binary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e">binary_two.h</a></li>
 <li>binary_ss()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#af8a791ac7ca88d32cd8f4e9ac0f9ab4f">binary_two.h</a></li>
 <li>binary_sv()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c">binary_two.h</a></li>
 <li>binary_sv2()&#160;:&#160;<a class="el" href="metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589">binary.h</a>, <a class="el" href="metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891">binary_two.h</a></li>
diff --git a/docs/build/html/globals_func_c.html b/docs/build/html/globals_func_c.html
index 227276f3e..c4edff67b 100644
--- a/docs/build/html/globals_func_c.html
+++ b/docs/build/html/globals_func_c.html
@@ -88,21 +88,21 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
 <li>ceildiv()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">utils.h</a></li>
-<li>col_reduce_2pass()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d">reduce_col.h</a></li>
-<li>col_reduce_longcolumn()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb">reduce_col.h</a></li>
-<li>col_reduce_looped()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385">reduce_col.h</a></li>
-<li>col_reduce_small()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5">reduce_col.h</a></li>
+<li>col_reduce_2pass()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29">reduce_col.h</a></li>
+<li>col_reduce_longcolumn()&#160;:&#160;<a class="el" href="reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2">reduce_col.h</a></li>
+<li>col_reduce_looped()&#160;:&#160;<a class="el" href="reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02">reduce_col.h</a></li>
+<li>col_reduce_small()&#160;:&#160;<a class="el" href="reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec">reduce_col.h</a></li>
 <li>complex_mul()&#160;:&#160;<a class="el" href="radix_8h.html#a5bfc53b531214c9ce277bebc18aa67d6">radix.h</a></li>
 <li>complex_mul_conj()&#160;:&#160;<a class="el" href="radix_8h.html#a0e2dfd3d1dda09f47ccc64eec35629f3">radix.h</a></li>
 <li>contiguous_scan()&#160;:&#160;<a class="el" href="scan_8h.html#a60d279b9add7d56639bb209408f09d79">scan.h</a></li>
-<li>copy_g()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36">copy.h</a></li>
+<li>copy_g()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf">copy.h</a></li>
 <li>copy_g_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77">copy.h</a></li>
-<li>copy_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c">copy.h</a></li>
-<li>copy_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff">copy.h</a></li>
-<li>copy_gg()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5">copy.h</a></li>
+<li>copy_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260">copy.h</a></li>
+<li>copy_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc">copy.h</a></li>
+<li>copy_gg()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa">copy.h</a></li>
 <li>copy_gg_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">copy.h</a></li>
-<li>copy_gg_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950">copy.h</a></li>
-<li>copy_gg_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd">copy.h</a></li>
+<li>copy_gg_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301">copy.h</a></li>
+<li>copy_gg_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13">copy.h</a></li>
 <li>copy_s()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea">copy.h</a></li>
 <li>copy_s2()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3">copy.h</a></li>
 <li>copy_v()&#160;:&#160;<a class="el" href="metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659">copy.h</a></li>
diff --git a/docs/build/html/globals_func_e.html b/docs/build/html/globals_func_e.html
index 3c7c5676b..ce6f5a90f 100644
--- a/docs/build/html/globals_func_e.html
+++ b/docs/build/html/globals_func_e.html
@@ -87,12 +87,12 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the files they belong to:</div>
 
 <h3><a id="index_e" name="index_e"></a>- e -</h3><ul>
-<li>elem_to_loc()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">utils.h</a></li>
-<li>elem_to_loc_1()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">utils.h</a></li>
-<li>elem_to_loc_2()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">utils.h</a></li>
-<li>elem_to_loc_2_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">utils.h</a></li>
-<li>elem_to_loc_3()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">utils.h</a></li>
-<li>elem_to_loc_3_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b">utils.h</a></li>
+<li>elem_to_loc()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">utils.h</a></li>
+<li>elem_to_loc_1()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">utils.h</a></li>
+<li>elem_to_loc_2()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">utils.h</a></li>
+<li>elem_to_loc_2_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">utils.h</a></li>
+<li>elem_to_loc_3()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">utils.h</a></li>
+<li>elem_to_loc_3_nd()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733">utils.h</a></li>
 <li>elem_to_loc_broadcast()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">utils.h</a></li>
 <li>erf()&#160;:&#160;<a class="el" href="erf_8h.html#a6ce199ee56105c67adbf8c48c019a8b2">erf.h</a></li>
 <li>erfinv()&#160;:&#160;<a class="el" href="erf_8h.html#a1846e0d683c7aff826bb32addcc3b885">erf.h</a></li>
diff --git a/docs/build/html/globals_func_f.html b/docs/build/html/globals_func_f.html
index 5de02297e..68aabaeaf 100644
--- a/docs/build/html/globals_func_f.html
+++ b/docs/build/html/globals_func_f.html
@@ -88,7 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_f" name="index_f"></a>- f -</h3><ul>
 <li>fft()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a4010b0e151e5f01e610e9c32234458c7">fft.h</a></li>
-<li>float_to_bfloat_bits()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">bf16.h</a></li>
+<li>float_to_bfloat_bits()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1">bf16.h</a></li>
 <li>four_step_fft()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a6558a8205ee4c3e4767bafa93f7606de">fft.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/globals_func_g.html b/docs/build/html/globals_func_g.html
index 8da40256f..98949963f 100644
--- a/docs/build/html/globals_func_g.html
+++ b/docs/build/html/globals_func_g.html
@@ -87,7 +87,7 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the files they belong to:</div>
 
 <h3><a id="index_g" name="index_g"></a>- g -</h3><ul>
-<li>gather_impl()&#160;:&#160;<a class="el" href="gather_8h.html#abdec470e1af0109563ddae3e85e6526c">gather.h</a></li>
+<li>gather_impl()&#160;:&#160;<a class="el" href="gather_8h.html#a767d7c5be6f2f649101f581449af5599">gather.h</a></li>
 <li>gemm()&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#aa40dd40b9a0bbf20c8911032ed0c3e6d">steel_gemm_fused.h</a></li>
 <li>gemm_splitk()&#160;:&#160;<a class="el" href="steel__gemm__splitk_8h.html#a3be6e095a0a026d3ecf57a3e67f76188">steel_gemm_splitk.h</a></li>
 <li>gemm_splitk_accum()&#160;:&#160;<a class="el" href="steel__gemm__splitk_8h.html#abeb921bf1dc7941125188ddd390b0907">steel_gemm_splitk.h</a></li>
diff --git a/docs/build/html/globals_func_o.html b/docs/build/html/globals_func_o.html
index b610641bf..4338342b9 100644
--- a/docs/build/html/globals_func_o.html
+++ b/docs/build/html/globals_func_o.html
@@ -87,22 +87,22 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the files they belong to:</div>
 
 <h3><a id="index_o" name="index_o"></a>- o -</h3><ul>
-<li>offset_neg_idx()&#160;:&#160;<a class="el" href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">indexing.h</a></li>
-<li>operator!=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55">bf16.h</a></li>
+<li>offset_neg_idx()&#160;:&#160;<a class="el" href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">indexing.h</a></li>
+<li>operator!=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55">bf16.h</a></li>
 <li>operator%()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#aaf53122a07c8eca858b5a8e38ae280e0">complex.h</a></li>
-<li>operator*()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a8f06316063fc91747533105f256b55b5">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385">complex.h</a></li>
-<li>operator*=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419">bf16.h</a></li>
-<li>operator+()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a09c1a797eb7f43742578680899932f50">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189">complex.h</a></li>
-<li>operator+=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400">bf16.h</a></li>
-<li>operator-()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b">complex.h</a></li>
-<li>operator-=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca">bf16.h</a></li>
-<li>operator/()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35">complex.h</a></li>
-<li>operator/=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095">bf16.h</a></li>
-<li>operator&lt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058">complex.h</a></li>
-<li>operator&lt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0">complex.h</a></li>
-<li>operator==()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190">complex.h</a></li>
-<li>operator&gt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995">complex.h</a></li>
-<li>operator&gt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46">complex.h</a></li>
+<li>operator*()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8f06316063fc91747533105f256b55b5">bf16.h</a></li>
+<li>operator*=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419">bf16.h</a></li>
+<li>operator+()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a09c1a797eb7f43742578680899932f50">bf16.h</a></li>
+<li>operator+=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400">bf16.h</a></li>
+<li>operator-()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">bf16.h</a></li>
+<li>operator-=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca">bf16.h</a></li>
+<li>operator/()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c">bf16.h</a></li>
+<li>operator/=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095">bf16.h</a></li>
+<li>operator&lt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25">bf16.h</a></li>
+<li>operator&lt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05">bf16.h</a></li>
+<li>operator==()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065">bf16.h</a></li>
+<li>operator&gt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57">bf16.h</a></li>
+<li>operator&gt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f">bf16.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/globals_func_r.html b/docs/build/html/globals_func_r.html
index afccdf04f..6e38d9011 100644
--- a/docs/build/html/globals_func_r.html
+++ b/docs/build/html/globals_func_r.html
@@ -102,9 +102,9 @@ $(function(){ initResizable(false); });
 <li>radix_butterfly()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a278d980ed397e6841ce0af44b9aa4396">fft.h</a></li>
 <li>radix_func()&#160;:&#160;<a class="el" href="metal_2kernels_2hadamard_8h.html#a590e5366adc78bab4fe44e37885d413f">hadamard.h</a></li>
 <li>radix_n_steps()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#aba8d32e0911499671df93678f39da08b">fft.h</a></li>
-<li>row_reduce_looped()&#160;:&#160;<a class="el" href="reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae">reduce_row.h</a></li>
-<li>row_reduce_simple()&#160;:&#160;<a class="el" href="reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b">reduce_row.h</a></li>
-<li>row_reduce_small()&#160;:&#160;<a class="el" href="reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da">reduce_row.h</a></li>
+<li>row_reduce_looped()&#160;:&#160;<a class="el" href="reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49">reduce_row.h</a></li>
+<li>row_reduce_simple()&#160;:&#160;<a class="el" href="reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf">reduce_row.h</a></li>
+<li>row_reduce_small()&#160;:&#160;<a class="el" href="reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e">reduce_row.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/globals_func_s.html b/docs/build/html/globals_func_s.html
index ce63bfa00..ddb78afe1 100644
--- a/docs/build/html/globals_func_s.html
+++ b/docs/build/html/globals_func_s.html
@@ -87,8 +87,10 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the files they belong to:</div>
 
 <h3><a id="index_s" name="index_s"></a>- s -</h3><ul>
-<li>scatter_impl()&#160;:&#160;<a class="el" href="scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1">scatter.h</a></li>
+<li>scatter_impl()&#160;:&#160;<a class="el" href="scatter_8h.html#a0df7206d4519defb48a6275afc12f87c">scatter.h</a></li>
 <li>sdpa_vector()&#160;:&#160;<a class="el" href="sdpa__vector_8h.html#a4bf36f16e16c1c62d9b243573568e5ae">sdpa_vector.h</a></li>
+<li>sdpa_vector_2pass_1()&#160;:&#160;<a class="el" href="sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74">sdpa_vector.h</a></li>
+<li>sdpa_vector_2pass_2()&#160;:&#160;<a class="el" href="sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe">sdpa_vector.h</a></li>
 <li>simd_shuffle()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2">utils.h</a></li>
 <li>simd_shuffle_and_fill_up()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">utils.h</a></li>
 <li>simd_shuffle_down()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c">utils.h</a></li>
diff --git a/docs/build/html/globals_func_t.html b/docs/build/html/globals_func_t.html
index c42b84b4a..c02ed0b51 100644
--- a/docs/build/html/globals_func_t.html
+++ b/docs/build/html/globals_func_t.html
@@ -87,10 +87,10 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the files they belong to:</div>
 
 <h3><a id="index_t" name="index_t"></a>- t -</h3><ul>
-<li>ternary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb">ternary.h</a></li>
+<li>ternary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72">ternary.h</a></li>
 <li>ternary_g_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a1bd5918559850f3f80e3adee2391fe6a">ternary.h</a></li>
-<li>ternary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8">ternary.h</a></li>
-<li>ternary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047">ternary.h</a></li>
+<li>ternary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d">ternary.h</a></li>
+<li>ternary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4">ternary.h</a></li>
 <li>ternary_v()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a83f93644d21ee774e06e8190d0725ccb">ternary.h</a></li>
 <li>ternary_v2()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a3e610f3b01966bdbf23fdfebe5d2c508">ternary.h</a></li>
 <li>thread_reduce()&#160;:&#160;<a class="el" href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1">reduce_row.h</a></li>
diff --git a/docs/build/html/globals_func_u.html b/docs/build/html/globals_func_u.html
index 3980fdff9..170a6c9ef 100644
--- a/docs/build/html/globals_func_u.html
+++ b/docs/build/html/globals_func_u.html
@@ -87,7 +87,8 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all functions with links to the files they belong to:</div>
 
 <h3><a id="index_u" name="index_u"></a>- u -</h3><ul>
-<li>unary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5">unary.h</a></li>
+<li>uint16_to_bfloat16()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">bf16.h</a></li>
+<li>unary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d">unary.h</a></li>
 <li>unary_v()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#a64e4f6737edddb72122e262977ee3014">unary.h</a></li>
 <li>unary_v2()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#a7c7690f0df9d2acc60b63be58d9c7777">unary.h</a></li>
 </ul>
diff --git a/docs/build/html/globals_g.html b/docs/build/html/globals_g.html
index d2378ad13..0ad41339e 100644
--- a/docs/build/html/globals_g.html
+++ b/docs/build/html/globals_g.html
@@ -88,7 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_g" name="index_g"></a>- g -</h3><ul>
 <li>gather_bias&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#aaaf17233201156be684f858bfd0f1b67">steel_gemm_fused.h</a></li>
-<li>gather_impl()&#160;:&#160;<a class="el" href="gather_8h.html#abdec470e1af0109563ddae3e85e6526c">gather.h</a></li>
+<li>gather_impl()&#160;:&#160;<a class="el" href="gather_8h.html#a767d7c5be6f2f649101f581449af5599">gather.h</a></li>
 <li>gather_kernels&#160;:&#160;<a class="el" href="jit_2indexing_8h.html#a1a03318128191891a84707602b57b3cf">indexing.h</a></li>
 <li>gemm()&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#aa40dd40b9a0bbf20c8911032ed0c3e6d">steel_gemm_fused.h</a></li>
 <li>gemm_splitk()&#160;:&#160;<a class="el" href="steel__gemm__splitk_8h.html#a3be6e095a0a026d3ecf57a3e67f76188">steel_gemm_splitk.h</a></li>
diff --git a/docs/build/html/globals_j.html b/docs/build/html/globals_j.html
new file mode 100644
index 000000000..dedf76549
--- /dev/null
+++ b/docs/build/html/globals_j.html
@@ -0,0 +1,101 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: File Members</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+</div><!-- top -->
+<div id="doc-content">
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div class="contents">
+<div class="textblock">Here is a list of all file members with links to the files they belong to:</div>
+
+<h3><a id="index_j" name="index_j"></a>- j -</h3><ul>
+<li>jit_else&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3">bf16.h</a></li>
+<li>jit_endif&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56">bf16.h</a></li>
+<li>jit_if&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2">bf16.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/globals_o.html b/docs/build/html/globals_o.html
index 2327f6337..b6b65452d 100644
--- a/docs/build/html/globals_o.html
+++ b/docs/build/html/globals_o.html
@@ -87,23 +87,23 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all file members with links to the files they belong to:</div>
 
 <h3><a id="index_o" name="index_o"></a>- o -</h3><ul>
-<li>offset_neg_idx()&#160;:&#160;<a class="el" href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">indexing.h</a></li>
+<li>offset_neg_idx()&#160;:&#160;<a class="el" href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">indexing.h</a></li>
 <li>op&#160;:&#160;<a class="el" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">binary.h</a></li>
-<li>operator!=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55">bf16.h</a></li>
+<li>operator!=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55">bf16.h</a></li>
 <li>operator%()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#aaf53122a07c8eca858b5a8e38ae280e0">complex.h</a></li>
-<li>operator*()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a8f06316063fc91747533105f256b55b5">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385">complex.h</a></li>
-<li>operator*=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419">bf16.h</a></li>
-<li>operator+()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a09c1a797eb7f43742578680899932f50">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189">complex.h</a></li>
-<li>operator+=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400">bf16.h</a></li>
-<li>operator-()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b">complex.h</a></li>
-<li>operator-=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca">bf16.h</a></li>
-<li>operator/()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35">complex.h</a></li>
-<li>operator/=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095">bf16.h</a></li>
-<li>operator&lt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058">complex.h</a></li>
-<li>operator&lt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0">complex.h</a></li>
-<li>operator==()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190">complex.h</a></li>
-<li>operator&gt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995">complex.h</a></li>
-<li>operator&gt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f">bf16.h</a>, <a class="el" href="backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46">complex.h</a></li>
+<li>operator*()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8f06316063fc91747533105f256b55b5">bf16.h</a></li>
+<li>operator*=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419">bf16.h</a></li>
+<li>operator+()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a09c1a797eb7f43742578680899932f50">bf16.h</a></li>
+<li>operator+=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400">bf16.h</a></li>
+<li>operator-()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855">bf16.h</a></li>
+<li>operator-=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca">bf16.h</a></li>
+<li>operator/()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c">bf16.h</a></li>
+<li>operator/=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095">bf16.h</a></li>
+<li>operator&lt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25">bf16.h</a></li>
+<li>operator&lt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05">bf16.h</a></li>
+<li>operator==()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065">bf16.h</a></li>
+<li>operator&gt;()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57">bf16.h</a></li>
+<li>operator&gt;=()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46">complex.h</a>, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f">bf16.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/globals_r.html b/docs/build/html/globals_r.html
index 7b0987b71..bc626d8ee 100644
--- a/docs/build/html/globals_r.html
+++ b/docs/build/html/globals_r.html
@@ -127,9 +127,9 @@ $(function(){ initResizable(false); });
 <li>REDUCE_N_WRITES&#160;:&#160;<a class="el" href="defines_8h.html#a68c33274e15a2f163f7631a36280d82f">defines.h</a></li>
 <li>RMS_LOOPED_LIMIT&#160;:&#160;<a class="el" href="defines_8h.html#a717a175676c3f96d74adfde7e751a541">defines.h</a></li>
 <li>RMS_N_READS&#160;:&#160;<a class="el" href="defines_8h.html#a89c0a33ba39a881ad3458ffdde62a24f">defines.h</a></li>
-<li>row_reduce_looped()&#160;:&#160;<a class="el" href="reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae">reduce_row.h</a></li>
-<li>row_reduce_simple()&#160;:&#160;<a class="el" href="reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b">reduce_row.h</a></li>
-<li>row_reduce_small()&#160;:&#160;<a class="el" href="reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da">reduce_row.h</a></li>
+<li>row_reduce_looped()&#160;:&#160;<a class="el" href="reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49">reduce_row.h</a></li>
+<li>row_reduce_simple()&#160;:&#160;<a class="el" href="reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf">reduce_row.h</a></li>
+<li>row_reduce_small()&#160;:&#160;<a class="el" href="reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e">reduce_row.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/globals_s.html b/docs/build/html/globals_s.html
index 00ccc52f6..f0c6f86cf 100644
--- a/docs/build/html/globals_s.html
+++ b/docs/build/html/globals_s.html
@@ -87,9 +87,11 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all file members with links to the files they belong to:</div>
 
 <h3><a id="index_s" name="index_s"></a>- s -</h3><ul>
-<li>scatter_impl()&#160;:&#160;<a class="el" href="scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1">scatter.h</a></li>
+<li>scatter_impl()&#160;:&#160;<a class="el" href="scatter_8h.html#a0df7206d4519defb48a6275afc12f87c">scatter.h</a></li>
 <li>scatter_kernels&#160;:&#160;<a class="el" href="jit_2indexing_8h.html#a768c949cd650a44c6b402fc1440c1a56">indexing.h</a></li>
 <li>sdpa_vector()&#160;:&#160;<a class="el" href="sdpa__vector_8h.html#a4bf36f16e16c1c62d9b243573568e5ae">sdpa_vector.h</a></li>
+<li>sdpa_vector_2pass_1()&#160;:&#160;<a class="el" href="sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74">sdpa_vector.h</a></li>
+<li>sdpa_vector_2pass_2()&#160;:&#160;<a class="el" href="sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe">sdpa_vector.h</a></li>
 <li>simd_shuffle()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2">utils.h</a></li>
 <li>simd_shuffle_and_fill_up()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">utils.h</a></li>
 <li>simd_shuffle_down()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c">utils.h</a></li>
diff --git a/docs/build/html/globals_t.html b/docs/build/html/globals_t.html
index 9a610e601..77091388d 100644
--- a/docs/build/html/globals_t.html
+++ b/docs/build/html/globals_t.html
@@ -87,10 +87,10 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all file members with links to the files they belong to:</div>
 
 <h3><a id="index_t" name="index_t"></a>- t -</h3><ul>
-<li>ternary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb">ternary.h</a></li>
+<li>ternary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72">ternary.h</a></li>
 <li>ternary_g_nd1()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a1bd5918559850f3f80e3adee2391fe6a">ternary.h</a></li>
-<li>ternary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8">ternary.h</a></li>
-<li>ternary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047">ternary.h</a></li>
+<li>ternary_g_nd2()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d">ternary.h</a></li>
+<li>ternary_g_nd3()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4">ternary.h</a></li>
 <li>ternary_v()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a83f93644d21ee774e06e8190d0725ccb">ternary.h</a></li>
 <li>ternary_v2()&#160;:&#160;<a class="el" href="metal_2kernels_2ternary_8h.html#a3e610f3b01966bdbf23fdfebe5d2c508">ternary.h</a></li>
 <li>thread_reduce()&#160;:&#160;<a class="el" href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1">reduce_row.h</a></li>
diff --git a/docs/build/html/globals_type.html b/docs/build/html/globals_type.html
index 37803c1a9..95d848429 100644
--- a/docs/build/html/globals_type.html
+++ b/docs/build/html/globals_type.html
@@ -85,7 +85,7 @@ $(function(){ initResizable(false); });
 
 <div class="contents">
 <div class="textblock">Here is a list of all typedefs with links to the files they belong to:</div><ul>
-<li>bfloat16_t&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bf16.h</a></li>
+<li>bfloat16_t&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bf16.h</a></li>
 <li>float16_t&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">utils.h</a></li>
 <li>nomask_t&#160;:&#160;<a class="el" href="kernels_2gemv__masked_8h.html#a1480c8cdff1cae1462a5a71632969bca">gemv_masked.h</a></li>
 <li>RadixFunc&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2fft_8h.html#a6ba62eabfd5428644aabf89ddaa0128d">fft.h</a></li>
diff --git a/docs/build/html/globals_u.html b/docs/build/html/globals_u.html
index cb9de4538..fa6148617 100644
--- a/docs/build/html/globals_u.html
+++ b/docs/build/html/globals_u.html
@@ -88,8 +88,8 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_u" name="index_u"></a>- u -</h3><ul>
 <li>u&#160;:&#160;<a class="el" href="types_2bf16_8h.html#aa21e554721eddcf127b7fcfa7fdc56bd">bf16.h</a>, <a class="el" href="fp16_8h.html#aa21e554721eddcf127b7fcfa7fdc56bd">fp16.h</a></li>
-<li>uint16_to_bfloat16&#160;:&#160;<a class="el" href="bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0">bf16_math.h</a></li>
-<li>unary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5">unary.h</a></li>
+<li>uint16_to_bfloat16()&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4">bf16.h</a></li>
+<li>unary_g()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d">unary.h</a></li>
 <li>unary_v()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#a64e4f6737edddb72122e262977ee3014">unary.h</a></li>
 <li>unary_v2()&#160;:&#160;<a class="el" href="metal_2kernels_2unary_8h.html#a7c7690f0df9d2acc60b63be58d9c7777">unary.h</a></li>
 <li>use_out_source&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#a3fe4e4382bda8a419557a5e6f77bc084">steel_gemm_fused.h</a></li>
diff --git a/docs/build/html/globals_vars.html b/docs/build/html/globals_vars.html
index 8364b3e51..a6d6a7c0c 100644
--- a/docs/build/html/globals_vars.html
+++ b/docs/build/html/globals_vars.html
@@ -87,9 +87,10 @@ $(function(){ initResizable(false); });
 <div class="textblock">Here is a list of all variables with links to the files they belong to:</div>
 
 <h3><a id="index_a" name="index_a"></a>- a -</h3><ul>
-<li>align_K&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">steel_gemm_fused.h</a></li>
+<li>align_K&#160;:&#160;<a class="el" href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">steel_attention.h</a>, <a class="el" href="steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">steel_gemm_fused.h</a></li>
 <li>align_M&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#a55af226dc74b0026b7d4b865142a6d21">steel_gemm_fused.h</a></li>
 <li>align_N&#160;:&#160;<a class="el" href="steel__gemm__fused_8h.html#aa3b267252df2dcbfdde8c5f174d27036">steel_gemm_fused.h</a></li>
+<li>align_Q&#160;:&#160;<a class="el" href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982">steel_attention.h</a></li>
 <li>arange_kernels&#160;:&#160;<a class="el" href="metal_2jit_2arange_8h.html#a2f49fb7bdc0a90230077fe2023e6e5c0">arange.h</a></li>
 </ul>
 
@@ -100,9 +101,9 @@ $(function(){ initResizable(false); });
 
 
 <h3><a id="index_c" name="index_c"></a>- c -</h3><ul>
-<li>can_convert_from_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">bf16.h</a></li>
+<li>can_convert_from_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a">bf16.h</a></li>
 <li>can_convert_from_complex64&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#ab149db78f6f19b8da6297dac4c36d893">complex.h</a></li>
-<li>can_convert_to_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">bf16.h</a></li>
+<li>can_convert_to_bfloat&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e">bf16.h</a></li>
 <li>can_convert_to_complex64&#160;:&#160;<a class="el" href="backend_2metal_2kernels_2complex_8h.html#a4f90ad54f4fae363e8d3cc41d539557b">complex.h</a></li>
 </ul>
 
diff --git a/docs/build/html/group__ops.html b/docs/build/html/group__ops.html
index af8de661b..557ad4e2b 100644
--- a/docs/build/html/group__ops.html
+++ b/docs/build/html/group__ops.html
@@ -919,6 +919,8 @@ Functions</h2></td></tr>
 <tr class="separator:gaf8913cabeb9fb193ba687aaeb2087764"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga7ff592a64d528f0cf4f3d098465da029" id="r_ga7ff592a64d528f0cf4f3d098465da029"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ga7ff592a64d528f0cf4f3d098465da029">mlx::core::imag</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
 <tr class="separator:ga7ff592a64d528f0cf4f3d098465da029"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga8ab10aa6c41416d739791164a52b25d5" id="r_ga8ab10aa6c41416d739791164a52b25d5"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ga8ab10aa6c41416d739791164a52b25d5">mlx::core::contiguous</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, bool allow_col_major=false, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
+<tr class="separator:ga8ab10aa6c41416d739791164a52b25d5"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
 <h2 class="groupheader">Function Documentation</h2>
@@ -2586,6 +2588,32 @@ Functions</h2></td></tr>
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="ga8ab10aa6c41416d739791164a52b25d5" name="ga8ab10aa6c41416d739791164a52b25d5"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ga8ab10aa6c41416d739791164a52b25d5">&#9670;&#160;</a></span>contiguous()</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="classmlx_1_1core_1_1array.html">array</a> mlx::core::contiguous </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>a</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">bool</td>          <td class="paramname"><span class="paramname"><em>allow_col_major</em></span><span class="paramdefsep"> = </span><span class="paramdefval">false</span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a></td>          <td class="paramname"><span class="paramname"><em>s</em></span><span class="paramdefsep"> = </span><span class="paramdefval">{}</span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="ga30d47e08093c03a3676f235f9f559411" name="ga30d47e08093c03a3676f235f9f559411"></a>
diff --git a/docs/build/html/half__types_8h.html b/docs/build/html/half__types_8h.html
index 7d26d5e02..9e1bebfc5 100644
--- a/docs/build/html/half__types_8h.html
+++ b/docs/build/html/half__types_8h.html
@@ -124,21 +124,21 @@ Typedefs</h2></td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a95fd207028f125eefbafe9e0522407fe" id="r_a95fd207028f125eefbafe9e0522407fe"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a95fd207028f125eefbafe9e0522407fe">mlx::core::operator+</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:a95fd207028f125eefbafe9e0522407fe" id="r_a95fd207028f125eefbafe9e0522407fe"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a95fd207028f125eefbafe9e0522407fe">mlx::core::operator+</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:a95fd207028f125eefbafe9e0522407fe"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abc6425a3fbb386f5ea5964b42507e989" id="r_abc6425a3fbb386f5ea5964b42507e989"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#abc6425a3fbb386f5ea5964b42507e989">mlx::core::operator+</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:abc6425a3fbb386f5ea5964b42507e989" id="r_abc6425a3fbb386f5ea5964b42507e989"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#abc6425a3fbb386f5ea5964b42507e989">mlx::core::operator+</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:abc6425a3fbb386f5ea5964b42507e989"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2631e78c6f0a602f6754ac577ec75f83" id="r_a2631e78c6f0a602f6754ac577ec75f83"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a2631e78c6f0a602f6754ac577ec75f83">mlx::core::operator-</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:a2631e78c6f0a602f6754ac577ec75f83" id="r_a2631e78c6f0a602f6754ac577ec75f83"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a2631e78c6f0a602f6754ac577ec75f83">mlx::core::operator-</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:a2631e78c6f0a602f6754ac577ec75f83"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a73d79cbd75d543d0837b8a51bf103f9e" id="r_a73d79cbd75d543d0837b8a51bf103f9e"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a73d79cbd75d543d0837b8a51bf103f9e">mlx::core::operator-</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:a73d79cbd75d543d0837b8a51bf103f9e" id="r_a73d79cbd75d543d0837b8a51bf103f9e"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a73d79cbd75d543d0837b8a51bf103f9e">mlx::core::operator-</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:a73d79cbd75d543d0837b8a51bf103f9e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:acaaa86b59c7ceb2e092ac07f2a75225c" id="r_acaaa86b59c7ceb2e092ac07f2a75225c"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#acaaa86b59c7ceb2e092ac07f2a75225c">mlx::core::operator*</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:acaaa86b59c7ceb2e092ac07f2a75225c" id="r_acaaa86b59c7ceb2e092ac07f2a75225c"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#acaaa86b59c7ceb2e092ac07f2a75225c">mlx::core::operator*</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:acaaa86b59c7ceb2e092ac07f2a75225c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a067d47823a322b88043cce7ce4a3ec78" id="r_a067d47823a322b88043cce7ce4a3ec78"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a067d47823a322b88043cce7ce4a3ec78">mlx::core::operator*</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:a067d47823a322b88043cce7ce4a3ec78" id="r_a067d47823a322b88043cce7ce4a3ec78"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a067d47823a322b88043cce7ce4a3ec78">mlx::core::operator*</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:a067d47823a322b88043cce7ce4a3ec78"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a97efcd96d6be666e5608034ae77289ef" id="r_a97efcd96d6be666e5608034ae77289ef"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a97efcd96d6be666e5608034ae77289ef">mlx::core::operator/</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:a97efcd96d6be666e5608034ae77289ef" id="r_a97efcd96d6be666e5608034ae77289ef"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a97efcd96d6be666e5608034ae77289ef">mlx::core::operator/</a> (<a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:a97efcd96d6be666e5608034ae77289ef"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a899851f85dbddd96f9d36319b82542a0" id="r_a899851f85dbddd96f9d36319b82542a0"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a899851f85dbddd96f9d36319b82542a0">mlx::core::operator/</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:a899851f85dbddd96f9d36319b82542a0" id="r_a899851f85dbddd96f9d36319b82542a0"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a899851f85dbddd96f9d36319b82542a0">mlx::core::operator/</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:a899851f85dbddd96f9d36319b82542a0"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Macro Definition Documentation</h2>
@@ -194,8 +194,8 @@ Functions</h2></td></tr>
 <div class="line">  <span class="keyword">inline</span> <span class="keywordtype">float</span> __operator__(<a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a> lhs, <a class="code hl_typedef" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a> rhs) {       \</div>
 <div class="line">    <span class="keywordflow">return</span> <span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(lhs) __op__ <span class="keyword">static_cast&lt;</span><span class="keywordtype">float</span><span class="keyword">&gt;</span>(rhs); \</div>
 <div class="line">  }</div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_acb8ddf4a29129846b673c50ba7078773"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a></div><div class="ttdeci">half float16_t</div><div class="ttdef"><b>Definition</b> utils.h:10</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_acb8ddf4a29129846b673c50ba7078773"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a></div><div class="ttdeci">half float16_t</div><div class="ttdef"><b>Definition</b> utils.h:16</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 </div><!-- fragment -->
 </div>
 </div>
diff --git a/docs/build/html/half__types_8h_source.html b/docs/build/html/half__types_8h_source.html
index 733679282..98b83d5da 100644
--- a/docs/build/html/half__types_8h_source.html
+++ b/docs/build/html/half__types_8h_source.html
@@ -114,7 +114,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span> </div>
 <div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="preprocessor">#include &lt;arm_bf16.h&gt;</span></div>
 <div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx_1_1core.html">mlx::core</a> {</div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span><span class="keyword">typedef</span> __bf16 <a class="code hl_typedef" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>;</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span><span class="keyword">typedef</span> __bf16 <a class="code hl_typedef" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>;</div>
 <div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span> </div>
 <div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span><span class="preprocessor">#else</span></div>
@@ -149,12 +149,12 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span> </div>
 <div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span><span class="preprocessor">#endif</span></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a7782de82393104dd4ad754ce3b316e82"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></div><div class="ttdeci">struct _MLX_BFloat16 bfloat16_t</div><div class="ttdef"><b>Definition</b> bf16.h:257</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a7782de82393104dd4ad754ce3b316e82"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></div><div class="ttdeci">struct _MLX_BFloat16 bfloat16_t</div><div class="ttdef"><b>Definition</b> bf16.h:251</div></div>
 <div class="ttc" id="afp16_8h_html"><div class="ttname"><a href="fp16_8h.html">fp16.h</a></div></div>
 <div class="ttc" id="ahalf__types_8h_html_a1f0d5d395d403bde764fffe4846617f9"><div class="ttname"><a href="half__types_8h.html#a1f0d5d395d403bde764fffe4846617f9">fp16_bf16_binop_helper</a></div><div class="ttdeci">#define fp16_bf16_binop_helper(__op__, __operator__)</div><div class="ttdef"><b>Definition</b> half_types.h:41</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_afbd2769c30e721afc85a7b9fb55b8e52"><div class="ttname"><a href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">mlx::core::float16_t</a></div><div class="ttdeci">struct _MLX_Float16 float16_t</div><div class="ttdef"><b>Definition</b> half_types.h:16</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1___m_l_x___float16_html"><div class="ttname"><a href="structmlx_1_1core_1_1___m_l_x___float16.html">mlx::core::_MLX_Float16</a></div><div class="ttdef"><b>Definition</b> fp16.h:21</div></div>
 <div class="ttc" id="atypes_2bf16_8h_html"><div class="ttname"><a href="types_2bf16_8h.html">bf16.h</a></div></div>
 </div><!-- fragment --></div><!-- contents -->
diff --git a/docs/build/html/hierarchy.html b/docs/build/html/hierarchy.html
index 419c12644..8789f9171 100644
--- a/docs/build/html/hierarchy.html
+++ b/docs/build/html/hierarchy.html
@@ -130,379 +130,390 @@ $(function(){ initResizable(false); });
 <tr id="row_33_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1ndarr.html" target="_self">pocketfft::detail::ndarr&lt; T &gt;</a></td><td class="desc"></td></tr>
 <tr id="row_34_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1array.html" target="_self">mlx::core::array</a></td><td class="desc"></td></tr>
 <tr id="row_35_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1array_1_1_array_iterator.html" target="_self">mlx::core::array::ArrayIterator</a></td><td class="desc"></td></tr>
-<tr id="row_36_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html" target="_self">mlx::steel::BaseMMAFrag&lt; T, kFragRows_, kFragCols_ &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_37_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html" target="_self">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_38_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html" target="_self">_MLX_BFloat16::bits_to_bfloat_struct</a></td><td class="desc"></td></tr>
-<tr id="row_39_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_bitwise_and.html" target="_self">BitwiseAnd</a></td><td class="desc"></td></tr>
-<tr id="row_40_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_and.html" target="_self">mlx::core::detail::BitwiseAnd</a></td><td class="desc"></td></tr>
-<tr id="row_41_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_bitwise_or.html" target="_self">BitwiseOr</a></td><td class="desc"></td></tr>
-<tr id="row_42_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_or.html" target="_self">mlx::core::detail::BitwiseOr</a></td><td class="desc"></td></tr>
-<tr id="row_43_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_bitwise_xor.html" target="_self">BitwiseXor</a></td><td class="desc"></td></tr>
-<tr id="row_44_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_xor.html" target="_self">mlx::core::detail::BitwiseXor</a></td><td class="desc"></td></tr>
-<tr id="row_45_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader.html" target="_self">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_46_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_block_merge_sort.html" target="_self">BlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_47_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html" target="_self">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_48_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html" target="_self">mlx::steel::BlockSwizzle</a></td><td class="desc"></td></tr>
-<tr id="row_49_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="unionbool4__or__uint.html" target="_self">bool4_or_uint</a></td><td class="desc"></td></tr>
-<tr id="row_50_" class="even"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_50_" class="arrow" onclick="dynsection.toggleFolder('50_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><b>metal::bool_constant</b></td><td class="desc"></td></tr>
-<tr id="row_50_0_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1is__empty.html" target="_self">metal::is_empty&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_50_1_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1is__static.html" target="_self">metal::is_static&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_51_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1allocator_1_1_buffer.html" target="_self">mlx::core::allocator::Buffer</a></td><td class="desc"></td></tr>
-<tr id="row_52_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_ceil.html" target="_self">Ceil</a></td><td class="desc"></td></tr>
-<tr id="row_53_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_ceil.html" target="_self">mlx::core::detail::Ceil</a></td><td class="desc"></td></tr>
-<tr id="row_54_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1cfftp.html" target="_self">pocketfft::detail::cfftp&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_55_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper.html" target="_self">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_56_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 1 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_57_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 2 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_58_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 3 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_59_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 4 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_60_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html" target="_self">pocketfft::detail::cmplx&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_61_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html" target="_self">pocketfft::detail::cmplx&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_62_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html" target="_self">pocketfft::detail::cmplx&lt; Thigh &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_63_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html" target="_self">mlx::core::metal::CommandEncoder</a></td><td class="desc"></td></tr>
-<tr id="row_64_" class="even"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_64_" class="arrow" onclick="dynsection.toggleFolder('64_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><b>std::complex</b></td><td class="desc"></td></tr>
-<tr id="row_64_0_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1complex128__t.html" target="_self">mlx::core::complex128_t</a></td><td class="desc"></td></tr>
-<tr id="row_64_1_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1complex64__t.html" target="_self">mlx::core::complex64_t</a></td><td class="desc"></td></tr>
-<tr id="row_65_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcomplex64__t.html" target="_self">complex64_t</a></td><td class="desc"></td></tr>
-<tr id="row_66_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html" target="_self">pocketfft::detail::threading::concurrent_queue&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_67_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html" target="_self">pocketfft::detail::threading::concurrent_queue&lt; std::function&lt; void()&gt; &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_68_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html" target="_self">mlx::core::metal::CommandEncoder::ConcurrentContext</a></td><td class="desc"></td></tr>
-<tr id="row_69_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_conjugate.html" target="_self">Conjugate</a></td><td class="desc"></td></tr>
-<tr id="row_70_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_conjugate.html" target="_self">mlx::core::detail::Conjugate</a></td><td class="desc"></td></tr>
-<tr id="row_71_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html" target="_self">mlx::core::ContiguousIterator&lt; StrideT &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_72_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_base_info.html" target="_self">mlx::steel::Conv2DGeneralBaseInfo</a></td><td class="desc"></td></tr>
-<tr id="row_73_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html" target="_self">mlx::steel::Conv2DGeneralJumpParams</a></td><td class="desc"></td></tr>
-<tr id="row_74_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html" target="_self">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_75_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html" target="_self">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_76_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html" target="_self">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_77_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html" target="_self">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_78_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html" target="_self">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_79_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html" target="_self">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_80_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html" target="_self">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_81_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cos.html" target="_self">Cos</a></td><td class="desc"></td></tr>
-<tr id="row_82_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_cos.html" target="_self">mlx::core::detail::Cos</a></td><td class="desc"></td></tr>
-<tr id="row_83_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cosh.html" target="_self">Cosh</a></td><td class="desc"></td></tr>
-<tr id="row_84_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_cosh.html" target="_self">mlx::core::detail::Cosh</a></td><td class="desc"></td></tr>
-<tr id="row_85_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_max.html" target="_self">CumMax&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_86_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_min.html" target="_self">CumMin&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_87_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_prod.html" target="_self">CumProd&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_88_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_prod_3_01bool_01_4.html" target="_self">CumProd&lt; bool &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_89_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_sum.html" target="_self">CumSum&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_90_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html" target="_self">mlx::core::fast::CustomKernelShapeInfo</a></td><td class="desc"></td></tr>
-<tr id="row_91_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1array_1_1_data.html" target="_self">mlx::core::array::Data</a></td><td class="desc"></td></tr>
-<tr id="row_92_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_contiguous_reduce.html" target="_self">mlx::core::DefaultContiguousReduce&lt; T, U, Op &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_93_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_strided_reduce.html" target="_self">mlx::core::DefaultStridedReduce&lt; T, U, Op &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_94_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_device.html" target="_self">mlx::core::Device</a></td><td class="desc"></td></tr>
-<tr id="row_95_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html" target="_self">mlx::core::metal::Device</a></td><td class="desc"></td></tr>
-<tr id="row_96_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html" target="_self">mlx::core::metal::DeviceStream</a></td><td class="desc"></td></tr>
-<tr id="row_97_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_divide.html" target="_self">Divide</a></td><td class="desc"></td></tr>
-<tr id="row_98_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_divide.html" target="_self">mlx::core::detail::Divide</a></td><td class="desc"></td></tr>
-<tr id="row_99_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_div_mod.html" target="_self">DivMod</a></td><td class="desc"></td></tr>
-<tr id="row_100_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_dtype.html" target="_self">mlx::core::Dtype</a></td><td class="desc"></td></tr>
-<tr id="row_101_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_equal.html" target="_self">Equal</a></td><td class="desc"></td></tr>
-<tr id="row_102_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_equal.html" target="_self">mlx::core::detail::Equal</a></td><td class="desc"></td></tr>
-<tr id="row_103_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf.html" target="_self">Erf</a></td><td class="desc"></td></tr>
-<tr id="row_104_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf.html" target="_self">mlx::core::detail::Erf</a></td><td class="desc"></td></tr>
-<tr id="row_105_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf_inv.html" target="_self">ErfInv</a></td><td class="desc"></td></tr>
-<tr id="row_106_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf_inv.html" target="_self">mlx::core::detail::ErfInv</a></td><td class="desc"></td></tr>
-<tr id="row_107_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_event.html" target="_self">mlx::core::Event</a></td><td class="desc"></td></tr>
-<tr id="row_108_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_c2_c.html" target="_self">pocketfft::detail::ExecC2C</a></td><td class="desc"></td></tr>
-<tr id="row_109_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_dcst.html" target="_self">pocketfft::detail::ExecDcst</a></td><td class="desc"></td></tr>
-<tr id="row_110_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_hartley.html" target="_self">pocketfft::detail::ExecHartley</a></td><td class="desc"></td></tr>
-<tr id="row_111_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_r2_r.html" target="_self">pocketfft::detail::ExecR2R</a></td><td class="desc"></td></tr>
-<tr id="row_112_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_exp.html" target="_self">Exp</a></td><td class="desc"></td></tr>
-<tr id="row_113_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_exp.html" target="_self">mlx::core::detail::Exp</a></td><td class="desc"></td></tr>
-<tr id="row_114_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_expm1.html" target="_self">Expm1</a></td><td class="desc"></td></tr>
-<tr id="row_115_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_expm1.html" target="_self">mlx::core::detail::Expm1</a></td><td class="desc"></td></tr>
-<tr id="row_116_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_fence.html" target="_self">mlx::core::metal::Fence</a></td><td class="desc"></td></tr>
-<tr id="row_117_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1fftblue.html" target="_self">pocketfft::detail::fftblue&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_118_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html" target="_self">mlx::core::array::Flags</a></td><td class="desc"></td></tr>
-<tr id="row_119_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor.html" target="_self">Floor</a></td><td class="desc"></td></tr>
-<tr id="row_120_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_floor.html" target="_self">mlx::core::detail::Floor</a></td><td class="desc"></td></tr>
-<tr id="row_121_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor_divide.html" target="_self">FloorDivide</a></td><td class="desc"></td></tr>
-<tr id="row_122_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html" target="_self">mlx::steel::GEMMAddMMParams</a></td><td class="desc"></td></tr>
-<tr id="row_123_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html" target="_self">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_124_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html" target="_self">mlx::steel::GEMMParams</a></td><td class="desc"></td></tr>
-<tr id="row_125_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html" target="_self">mlx::steel::GEMMSpiltKParams</a></td><td class="desc"></td></tr>
-<tr id="row_126_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_kernel.html" target="_self">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_127_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_t_kernel.html" target="_self">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></td><td class="desc">Vector matrix multiplication </td></tr>
-<tr id="row_128_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater.html" target="_self">Greater</a></td><td class="desc"></td></tr>
-<tr id="row_129_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_greater.html" target="_self">mlx::core::detail::Greater</a></td><td class="desc"></td></tr>
-<tr id="row_130_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater_equal.html" target="_self">GreaterEqual</a></td><td class="desc"></td></tr>
-<tr id="row_131_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_greater_equal.html" target="_self">mlx::core::detail::GreaterEqual</a></td><td class="desc"></td></tr>
-<tr id="row_132_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1distributed_1_1_group.html" target="_self">mlx::core::distributed::Group</a></td><td class="desc">A <a class="el" href="structmlx_1_1core_1_1distributed_1_1_group.html" title="A distributed::Group represents a group of independent mlx processes that can communicate.">distributed::Group</a> represents a group of independent mlx processes that can communicate </td></tr>
-<tr id="row_133_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_imag.html" target="_self">Imag</a></td><td class="desc"></td></tr>
-<tr id="row_134_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_imag.html" target="_self">mlx::core::detail::Imag</a></td><td class="desc"></td></tr>
-<tr id="row_135_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html" target="_self">mlx::steel::ImplicitGemmConv2DParams</a></td><td class="desc"></td></tr>
-<tr id="row_136_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_indices.html" target="_self">Indices&lt; IdxT, NIDX &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_137_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_137_" class="arrow" onclick="dynsection.toggleFolder('137_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1integral__constant.html" target="_self">mlx::steel::integral_constant&lt; T, v &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_137_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral.html" target="_self">mlx::steel::is_integral&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_137_1_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html" target="_self">mlx::steel::is_integral&lt; integral_constant&lt; T, v &gt; &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_138_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="unionmlx_1_1core_1_1detail_1_1_int_or_float.html" target="_self">mlx::core::detail::IntOrFloat</a></td><td class="desc"></td></tr>
-<tr id="row_139_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_in_tracing.html" target="_self">mlx::core::detail::InTracing</a></td><td class="desc"></td></tr>
-<tr id="row_140_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_merge_sort.html" target="_self">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_141_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_multi_block_merge_sort.html" target="_self">KernelMultiBlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_142_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1random_1_1_key_sequence.html" target="_self">mlx::core::random::KeySequence</a></td><td class="desc"></td></tr>
-<tr id="row_143_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html" target="_self">pocketfft::detail::threading::latch</a></td><td class="desc"></td></tr>
-<tr id="row_144_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_left_shift.html" target="_self">LeftShift</a></td><td class="desc"></td></tr>
-<tr id="row_145_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_left_shift.html" target="_self">mlx::core::detail::LeftShift</a></td><td class="desc"></td></tr>
-<tr id="row_146_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less.html" target="_self">Less</a></td><td class="desc"></td></tr>
-<tr id="row_147_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_less.html" target="_self">mlx::core::detail::Less</a></td><td class="desc"></td></tr>
-<tr id="row_148_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_equal.html" target="_self">LessEqual</a></td><td class="desc"></td></tr>
-<tr id="row_149_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_less_equal.html" target="_self">mlx::core::detail::LessEqual</a></td><td class="desc"></td></tr>
-<tr id="row_150_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_than.html" target="_self">LessThan&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_151_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits.html" target="_self">Limits&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_152_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bfloat16__t_01_4.html" target="_self">Limits&lt; bfloat16_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_153_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bool_01_4.html" target="_self">Limits&lt; bool &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_154_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01complex64__t_01_4.html" target="_self">Limits&lt; complex64_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_155_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01float_01_4.html" target="_self">Limits&lt; float &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_156_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01half_01_4.html" target="_self">Limits&lt; half &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_157_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int16__t_01_4.html" target="_self">Limits&lt; int16_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_158_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int32__t_01_4.html" target="_self">Limits&lt; int32_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_159_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int64__t_01_4.html" target="_self">Limits&lt; int64_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_160_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int8__t_01_4.html" target="_self">Limits&lt; int8_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_161_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint16__t_01_4.html" target="_self">Limits&lt; uint16_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_162_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint32__t_01_4.html" target="_self">Limits&lt; uint32_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_163_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint64__t_01_4.html" target="_self">Limits&lt; uint64_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_164_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint8__t_01_4.html" target="_self">Limits&lt; uint8_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_165_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log.html" target="_self">Log</a></td><td class="desc"></td></tr>
-<tr id="row_166_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log.html" target="_self">mlx::core::detail::Log</a></td><td class="desc"></td></tr>
-<tr id="row_167_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log10.html" target="_self">Log10</a></td><td class="desc"></td></tr>
-<tr id="row_168_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log10.html" target="_self">mlx::core::detail::Log10</a></td><td class="desc"></td></tr>
-<tr id="row_169_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log1p.html" target="_self">Log1p</a></td><td class="desc"></td></tr>
-<tr id="row_170_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log1p.html" target="_self">mlx::core::detail::Log1p</a></td><td class="desc"></td></tr>
-<tr id="row_171_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log2.html" target="_self">Log2</a></td><td class="desc"></td></tr>
-<tr id="row_172_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log2.html" target="_self">mlx::core::detail::Log2</a></td><td class="desc"></td></tr>
-<tr id="row_173_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log_add_exp.html" target="_self">LogAddExp</a></td><td class="desc"></td></tr>
-<tr id="row_174_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log_add_exp.html" target="_self">mlx::core::detail::LogAddExp</a></td><td class="desc"></td></tr>
-<tr id="row_175_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_and.html" target="_self">LogicalAnd</a></td><td class="desc"></td></tr>
-<tr id="row_176_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_and.html" target="_self">mlx::core::detail::LogicalAnd</a></td><td class="desc"></td></tr>
-<tr id="row_177_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_not.html" target="_self">LogicalNot</a></td><td class="desc"></td></tr>
-<tr id="row_178_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_not.html" target="_self">mlx::core::detail::LogicalNot</a></td><td class="desc"></td></tr>
-<tr id="row_179_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_or.html" target="_self">LogicalOr</a></td><td class="desc"></td></tr>
-<tr id="row_180_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_or.html" target="_self">mlx::core::detail::LogicalOr</a></td><td class="desc"></td></tr>
-<tr id="row_181_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html" target="_self">mlx::steel::LoopAlignment&lt; M_aligned, N_aligned, K_aligned &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_182_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structlooped__elem__to__loc.html" target="_self">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_183_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html" target="_self">looped_elem_to_loc&lt; 0, offset_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_184_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html" target="_self">looped_elem_to_loc&lt; 1, offset_t &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_185_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1make__void.html" target="_self">metal::make_void&lt; Ts &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_186_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_max.html" target="_self">Max&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_187_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_maximum.html" target="_self">Maximum</a></td><td class="desc"></td></tr>
-<tr id="row_188_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_maximum.html" target="_self">mlx::core::detail::Maximum</a></td><td class="desc"></td></tr>
-<tr id="row_189_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_min.html" target="_self">Min&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_190_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_minimum.html" target="_self">Minimum</a></td><td class="desc"></td></tr>
-<tr id="row_191_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_minimum.html" target="_self">mlx::core::detail::Minimum</a></td><td class="desc"></td></tr>
-<tr id="row_192_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic.html" target="_self">mlx_atomic&lt; T, typename &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_193_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html" target="_self">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_194_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_conv_params.html" target="_self">MLXConvParams&lt; NDIM &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_195_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_conv_params.html" target="_self">MLXConvParams&lt; 2 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_196_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_fast_attention_params.html" target="_self">MLXFastAttentionParams</a></td><td class="desc"></td></tr>
-<tr id="row_197_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html" target="_self">MLXScaledDotProductAttentionParams</a></td><td class="desc"></td></tr>
-<tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_199_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; float, 1, TN, mlx::steel::BaseMMAFrag &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; float, TM, 1, mlx::steel::BaseMMAFrag &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_201_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; float, TM, TN, mlx::steel::BaseMMAFrag &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1multi__iter.html" target="_self">pocketfft::detail::multi_iter&lt; N &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_203_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_multiply.html" target="_self">mlx::core::detail::Multiply</a></td><td class="desc"></td></tr>
-<tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_multiply.html" target="_self">Multiply</a></td><td class="desc"></td></tr>
-<tr id="row_205_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_na_n_equal.html" target="_self">mlx::core::detail::NaNEqual</a></td><td class="desc"></td></tr>
-<tr id="row_206_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_na_n_equal.html" target="_self">NaNEqual</a></td><td class="desc"></td></tr>
-<tr id="row_207_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_negative.html" target="_self">mlx::core::detail::Negative</a></td><td class="desc"></td></tr>
-<tr id="row_208_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_negative.html" target="_self">Negative</a></td><td class="desc"></td></tr>
-<tr id="row_209_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_node_namer.html" target="_self">mlx::core::NodeNamer</a></td><td class="desc"></td></tr>
-<tr id="row_210_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_none.html" target="_self">None</a></td><td class="desc"></td></tr>
-<tr id="row_211_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_not_equal.html" target="_self">mlx::core::detail::NotEqual</a></td><td class="desc"></td></tr>
-<tr id="row_212_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_not_equal.html" target="_self">NotEqual</a></td><td class="desc"></td></tr>
-<tr id="row_213_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_or.html" target="_self">Or&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_214_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1pocketfft__c.html" target="_self">pocketfft::detail::pocketfft_c&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_215_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1pocketfft__r.html" target="_self">pocketfft::detail::pocketfft_r&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element.html" target="_self">metal::pointer_element&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_217_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01constant_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; constant T * &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_218_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01device_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; device T * &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_219_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01thread_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; thread T * &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_220_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01threadgroup_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; threadgroup T * &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_221_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_power.html" target="_self">mlx::core::detail::Power</a></td><td class="desc"></td></tr>
-<tr id="row_222_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_power.html" target="_self">Power</a></td><td class="desc"></td></tr>
-<tr id="row_223_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_223_" class="arrow" onclick="dynsection.toggleFolder('223_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_primitive.html" target="_self">mlx::core::Primitive</a></td><td class="desc"></td></tr>
-<tr id="row_223_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_compiled.html" target="_self">mlx::core::Compiled</a></td><td class="desc"></td></tr>
-<tr id="row_223_1_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_custom_transforms.html" target="_self">mlx::core::CustomTransforms</a></td><td class="desc"></td></tr>
-<tr id="row_223_2_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_depends.html" target="_self">mlx::core::Depends</a></td><td class="desc"></td></tr>
-<tr id="row_223_3_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_div_mod.html" target="_self">mlx::core::DivMod</a></td><td class="desc"></td></tr>
-<tr id="row_223_4_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_eigh.html" target="_self">mlx::core::Eigh</a></td><td class="desc"></td></tr>
-<tr id="row_223_5_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_q_r_f.html" target="_self">mlx::core::QRF</a></td><td class="desc"></td></tr>
-<tr id="row_223_6_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_s_v_d.html" target="_self">mlx::core::SVD</a></td><td class="desc"></td></tr>
-<tr id="row_223_7_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_split.html" target="_self">mlx::core::Split</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_223_8_" class="arrow" onclick="dynsection.toggleFolder('223_8_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html" target="_self">mlx::core::UnaryPrimitive</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_abs.html" target="_self">mlx::core::Abs</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_add.html" target="_self">mlx::core::Add</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_add_m_m.html" target="_self">mlx::core::AddMM</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arange.html" target="_self">mlx::core::Arange</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_cos.html" target="_self">mlx::core::ArcCos</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_cosh.html" target="_self">mlx::core::ArcCosh</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_sin.html" target="_self">mlx::core::ArcSin</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_sinh.html" target="_self">mlx::core::ArcSinh</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_tan.html" target="_self">mlx::core::ArcTan</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_tan2.html" target="_self">mlx::core::ArcTan2</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_tanh.html" target="_self">mlx::core::ArcTanh</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arg_partition.html" target="_self">mlx::core::ArgPartition</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arg_reduce.html" target="_self">mlx::core::ArgReduce</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arg_sort.html" target="_self">mlx::core::ArgSort</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_as_strided.html" target="_self">mlx::core::AsStrided</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_as_type.html" target="_self">mlx::core::AsType</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html" target="_self">mlx::core::BitwiseBinary</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html" target="_self">mlx::core::BlockMaskedMM</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_broadcast.html" target="_self">mlx::core::Broadcast</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_ceil.html" target="_self">mlx::core::Ceil</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cholesky.html" target="_self">mlx::core::Cholesky</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_concatenate.html" target="_self">mlx::core::Concatenate</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_conjugate.html" target="_self">mlx::core::Conjugate</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_convolution.html" target="_self">mlx::core::Convolution</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_copy.html" target="_self">mlx::core::Copy</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cos.html" target="_self">mlx::core::Cos</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cosh.html" target="_self">mlx::core::Cosh</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_divide.html" target="_self">mlx::core::Divide</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_equal.html" target="_self">mlx::core::Equal</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf.html" target="_self">mlx::core::Erf</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf_inv.html" target="_self">mlx::core::ErfInv</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_31_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_exp.html" target="_self">mlx::core::Exp</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_expm1.html" target="_self">mlx::core::Expm1</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_33_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_f_f_t.html" target="_self">mlx::core::FFT</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_floor.html" target="_self">mlx::core::Floor</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_full.html" target="_self">mlx::core::Full</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather.html" target="_self">mlx::core::Gather</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_m_m.html" target="_self">mlx::core::GatherMM</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html" target="_self">mlx::core::GatherQMM</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater.html" target="_self">mlx::core::Greater</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater_equal.html" target="_self">mlx::core::GreaterEqual</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_hadamard.html" target="_self">mlx::core::Hadamard</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_42_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_imag.html" target="_self">mlx::core::Imag</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_43_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_inverse.html" target="_self">mlx::core::Inverse</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_44_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less.html" target="_self">mlx::core::Less</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_45_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less_equal.html" target="_self">mlx::core::LessEqual</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_46_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_load.html" target="_self">mlx::core::Load</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_47_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log.html" target="_self">mlx::core::Log</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_48_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log1p.html" target="_self">mlx::core::Log1p</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_49_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log_add_exp.html" target="_self">mlx::core::LogAddExp</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_50_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_and.html" target="_self">mlx::core::LogicalAnd</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_51_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_not.html" target="_self">mlx::core::LogicalNot</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_52_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_or.html" target="_self">mlx::core::LogicalOr</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_53_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_matmul.html" target="_self">mlx::core::Matmul</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_54_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_maximum.html" target="_self">mlx::core::Maximum</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_55_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_minimum.html" target="_self">mlx::core::Minimum</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_56_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_multiply.html" target="_self">mlx::core::Multiply</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_57_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_negative.html" target="_self">mlx::core::Negative</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_58_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_not_equal.html" target="_self">mlx::core::NotEqual</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_59_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_number_of_elements.html" target="_self">mlx::core::NumberOfElements</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_60_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_pad.html" target="_self">mlx::core::Pad</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_61_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_partition.html" target="_self">mlx::core::Partition</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_62_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_power.html" target="_self">mlx::core::Power</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_63_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html" target="_self">mlx::core::QuantizedMatmul</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_64_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_random_bits.html" target="_self">mlx::core::RandomBits</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_65_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_real.html" target="_self">mlx::core::Real</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_66_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reduce.html" target="_self">mlx::core::Reduce</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_67_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_remainder.html" target="_self">mlx::core::Remainder</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_68_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reshape.html" target="_self">mlx::core::Reshape</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_69_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_round.html" target="_self">mlx::core::Round</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_70_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scan.html" target="_self">mlx::core::Scan</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_71_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scatter.html" target="_self">mlx::core::Scatter</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_72_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_select.html" target="_self">mlx::core::Select</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_73_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sigmoid.html" target="_self">mlx::core::Sigmoid</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_74_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sign.html" target="_self">mlx::core::Sign</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_75_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sin.html" target="_self">mlx::core::Sin</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_76_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sinh.html" target="_self">mlx::core::Sinh</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_77_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice.html" target="_self">mlx::core::Slice</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_78_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice_update.html" target="_self">mlx::core::SliceUpdate</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_79_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_softmax.html" target="_self">mlx::core::Softmax</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_80_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sort.html" target="_self">mlx::core::Sort</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_81_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sqrt.html" target="_self">mlx::core::Sqrt</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_82_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_square.html" target="_self">mlx::core::Square</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_83_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_stop_gradient.html" target="_self">mlx::core::StopGradient</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_84_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_subtract.html" target="_self">mlx::core::Subtract</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_85_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tan.html" target="_self">mlx::core::Tan</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_86_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tanh.html" target="_self">mlx::core::Tanh</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_87_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_transpose.html" target="_self">mlx::core::Transpose</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_88_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_uniform.html" target="_self">mlx::core::Uniform</a></td><td class="desc"></td></tr>
-<tr id="row_223_8_89_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_view.html" target="_self">mlx::core::View</a></td><td class="desc"></td></tr>
-<tr id="row_223_9_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_223_9_" class="arrow" onclick="dynsection.toggleFolder('223_9_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_dist_primitive.html" target="_self">mlx::core::distributed::DistPrimitive</a></td><td class="desc"></td></tr>
-<tr id="row_223_9_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html" target="_self">mlx::core::distributed::AllGather</a></td><td class="desc"></td></tr>
-<tr id="row_223_9_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html" target="_self">mlx::core::distributed::AllReduce</a></td><td class="desc"></td></tr>
-<tr id="row_223_9_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_recv.html" target="_self">mlx::core::distributed::Recv</a></td><td class="desc"></td></tr>
-<tr id="row_223_9_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html" target="_self">mlx::core::distributed::Send</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_223_10_" class="arrow" onclick="dynsection.toggleFolder('223_10_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html" target="_self">mlx::core::fast::Custom</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html" target="_self">mlx::core::fast::AffineQuantize</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html" target="_self">mlx::core::fast::LayerNorm</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html" target="_self">mlx::core::fast::LayerNormVJP</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html" target="_self">mlx::core::fast::RMSNorm</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html" target="_self">mlx::core::fast::RMSNormVJP</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html" target="_self">mlx::core::fast::RoPE</a></td><td class="desc"></td></tr>
-<tr id="row_223_10_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html" target="_self">mlx::core::fast::ScaledDotProductAttention</a></td><td class="desc"></td></tr>
-<tr id="row_223_11_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html" target="_self">mlx::core::fast::CustomKernel</a></td><td class="desc"></td></tr>
-<tr id="row_224_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_print_formatter.html" target="_self">mlx::core::PrintFormatter</a></td><td class="desc"></td></tr>
-<tr id="row_225_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_prod.html" target="_self">Prod&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_226_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_quantized_block_loader.html" target="_self">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_227_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_227_" class="arrow" onclick="dynsection.toggleFolder('227_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_reader.html" target="_self">mlx::core::io::Reader</a></td><td class="desc"></td></tr>
-<tr id="row_227_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_parallel_file_reader.html" target="_self">mlx::core::io::ParallelFileReader</a></td><td class="desc"></td></tr>
-<tr id="row_228_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html" target="_self">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></td><td class="desc"></td></tr>
-<tr id="row_229_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_read_writer.html" target="_self">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_230_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_real.html" target="_self">mlx::core::detail::Real</a></td><td class="desc"></td></tr>
-<tr id="row_231_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_real.html" target="_self">Real</a></td><td class="desc"></td></tr>
-<tr id="row_232_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_reduction_plan.html" target="_self">mlx::core::ReductionPlan</a></td><td class="desc"></td></tr>
-<tr id="row_233_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_remainder.html" target="_self">mlx::core::detail::Remainder</a></td><td class="desc"></td></tr>
-<tr id="row_234_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_remainder.html" target="_self">Remainder</a></td><td class="desc"></td></tr>
-<tr id="row_235_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1metal_1_1_residency_set.html" target="_self">mlx::core::metal::ResidencySet</a></td><td class="desc"></td></tr>
-<tr id="row_236_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_retain_graph.html" target="_self">mlx::core::detail::RetainGraph</a></td><td class="desc"></td></tr>
-<tr id="row_237_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1rev__iter.html" target="_self">pocketfft::detail::rev_iter</a></td><td class="desc"></td></tr>
-<tr id="row_238_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1rfftp.html" target="_self">pocketfft::detail::rfftp&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_239_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_right_shift.html" target="_self">mlx::core::detail::RightShift</a></td><td class="desc"></td></tr>
-<tr id="row_240_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_right_shift.html" target="_self">RightShift</a></td><td class="desc"></td></tr>
-<tr id="row_241_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_round.html" target="_self">mlx::core::detail::Round</a></td><td class="desc"></td></tr>
-<tr id="row_242_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_round.html" target="_self">Round</a></td><td class="desc"></td></tr>
-<tr id="row_243_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_rsqrt.html" target="_self">mlx::core::detail::Rsqrt</a></td><td class="desc"></td></tr>
-<tr id="row_244_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_rsqrt.html" target="_self">Rsqrt</a></td><td class="desc"></td></tr>
-<tr id="row_245_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_scale_op.html" target="_self">ScaleOp&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_246_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html" target="_self">mlx::core::scheduler::Scheduler</a></td><td class="desc"></td></tr>
-<tr id="row_247_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_select.html" target="_self">mlx::core::detail::Select</a></td><td class="desc"></td></tr>
-<tr id="row_248_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_select.html" target="_self">Select</a></td><td class="desc"></td></tr>
-<tr id="row_249_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sigmoid.html" target="_self">mlx::core::detail::Sigmoid</a></td><td class="desc"></td></tr>
-<tr id="row_250_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sigmoid.html" target="_self">Sigmoid</a></td><td class="desc"></td></tr>
-<tr id="row_251_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sign.html" target="_self">mlx::core::detail::Sign</a></td><td class="desc"></td></tr>
-<tr id="row_252_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sign.html" target="_self">Sign</a></td><td class="desc"></td></tr>
-<tr id="row_253_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1simple__iter.html" target="_self">pocketfft::detail::simple_iter</a></td><td class="desc"></td></tr>
-<tr id="row_254_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sin.html" target="_self">mlx::core::detail::Sin</a></td><td class="desc"></td></tr>
-<tr id="row_255_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sin.html" target="_self">Sin</a></td><td class="desc"></td></tr>
-<tr id="row_256_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1sincos__2pibyn.html" target="_self">pocketfft::detail::sincos_2pibyn&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_257_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sinh.html" target="_self">mlx::core::detail::Sinh</a></td><td class="desc"></td></tr>
-<tr id="row_258_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sinh.html" target="_self">Sinh</a></td><td class="desc"></td></tr>
-<tr id="row_259_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sqrt.html" target="_self">mlx::core::detail::Sqrt</a></td><td class="desc"></td></tr>
-<tr id="row_260_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sqrt.html" target="_self">Sqrt</a></td><td class="desc"></td></tr>
-<tr id="row_261_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_square.html" target="_self">mlx::core::detail::Square</a></td><td class="desc"></td></tr>
-<tr id="row_262_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_square.html" target="_self">Square</a></td><td class="desc"></td></tr>
-<tr id="row_263_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream.html" target="_self">mlx::core::Stream</a></td><td class="desc"></td></tr>
-<tr id="row_264_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream_context.html" target="_self">mlx::core::StreamContext</a></td><td class="desc"></td></tr>
-<tr id="row_265_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html" target="_self">mlx::core::scheduler::StreamThread</a></td><td class="desc"></td></tr>
-<tr id="row_266_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_subtract.html" target="_self">mlx::core::detail::Subtract</a></td><td class="desc"></td></tr>
-<tr id="row_267_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_subtract.html" target="_self">Subtract</a></td><td class="desc"></td></tr>
-<tr id="row_268_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sum.html" target="_self">Sum&lt; U &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_269_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst23.html" target="_self">pocketfft::detail::T_dcst23&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_270_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst4.html" target="_self">pocketfft::detail::T_dcst4&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_271_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dct1.html" target="_self">pocketfft::detail::T_dct1&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_272_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dst1.html" target="_self">pocketfft::detail::T_dst1&lt; T0 &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_273_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_tan.html" target="_self">mlx::core::detail::Tan</a></td><td class="desc"></td></tr>
-<tr id="row_274_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tan.html" target="_self">Tan</a></td><td class="desc"></td></tr>
-<tr id="row_275_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_tanh.html" target="_self">mlx::core::detail::Tanh</a></td><td class="desc"></td></tr>
-<tr id="row_276_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tanh.html" target="_self">Tanh</a></td><td class="desc"></td></tr>
-<tr id="row_277_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1thread__pool.html" target="_self">pocketfft::detail::threading::thread_pool</a></td><td class="desc"></td></tr>
-<tr id="row_278_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="class_thread_pool.html" target="_self">ThreadPool</a></td><td class="desc"></td></tr>
-<tr id="row_279_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_thread_sort.html" target="_self">ThreadSort&lt; val_t, idx_t, ARG_SORT, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_280_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_add.html" target="_self">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_281_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html" target="_self">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_282_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_none.html" target="_self">mlx::steel::TransformNone&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_283_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_type_to_dtype.html" target="_self">mlx::core::TypeToDtype&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_284_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1util.html" target="_self">pocketfft::detail::util</a></td><td class="desc"></td></tr>
-<tr id="row_285_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_v_l_e_n.html" target="_self">pocketfft::detail::VLEN&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_286_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_v_t_y_p_e.html" target="_self">pocketfft::detail::VTYPE&lt; T &gt;</a></td><td class="desc"></td></tr>
-<tr id="row_287_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_287_" class="arrow" onclick="dynsection.toggleFolder('287_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_writer.html" target="_self">mlx::core::io::Writer</a></td><td class="desc"></td></tr>
-<tr id="row_287_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_file_writer.html" target="_self">mlx::core::io::FileWriter</a></td><td class="desc"></td></tr>
+<tr id="row_36_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_attn_params.html" target="_self">mlx::steel::AttnParams</a></td><td class="desc"></td></tr>
+<tr id="row_37_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html" target="_self">mlx::steel::BaseMMAFrag&lt; T, kFragRows_, kFragCols_ &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_38_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html" target="_self">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_39_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html" target="_self">_MLX_BFloat16::bits_to_bfloat_struct</a></td><td class="desc"></td></tr>
+<tr id="row_40_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_bitwise_and.html" target="_self">BitwiseAnd</a></td><td class="desc"></td></tr>
+<tr id="row_41_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_and.html" target="_self">mlx::core::detail::BitwiseAnd</a></td><td class="desc"></td></tr>
+<tr id="row_42_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_bitwise_or.html" target="_self">BitwiseOr</a></td><td class="desc"></td></tr>
+<tr id="row_43_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_or.html" target="_self">mlx::core::detail::BitwiseOr</a></td><td class="desc"></td></tr>
+<tr id="row_44_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_bitwise_xor.html" target="_self">BitwiseXor</a></td><td class="desc"></td></tr>
+<tr id="row_45_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_bitwise_xor.html" target="_self">mlx::core::detail::BitwiseXor</a></td><td class="desc"></td></tr>
+<tr id="row_46_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader.html" target="_self">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_47_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html" target="_self">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_48_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_block_merge_sort.html" target="_self">BlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_49_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html" target="_self">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_50_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html" target="_self">mlx::steel::BlockSwizzle</a></td><td class="desc"></td></tr>
+<tr id="row_51_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="unionbool4__or__uint.html" target="_self">bool4_or_uint</a></td><td class="desc"></td></tr>
+<tr id="row_52_" class="even"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_52_" class="arrow" onclick="dynsection.toggleFolder('52_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><b>metal::bool_constant</b></td><td class="desc"></td></tr>
+<tr id="row_52_0_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1is__empty.html" target="_self">metal::is_empty&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_52_1_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1is__static.html" target="_self">metal::is_static&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_53_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1allocator_1_1_buffer.html" target="_self">mlx::core::allocator::Buffer</a></td><td class="desc"></td></tr>
+<tr id="row_54_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_ceil.html" target="_self">Ceil</a></td><td class="desc"></td></tr>
+<tr id="row_55_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_ceil.html" target="_self">mlx::core::detail::Ceil</a></td><td class="desc"></td></tr>
+<tr id="row_56_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1cfftp.html" target="_self">pocketfft::detail::cfftp&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_57_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper.html" target="_self">mlx::steel::ChannelHelper&lt; n_channels_ &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_58_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 1 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_59_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 2 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_60_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 3 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_61_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html" target="_self">mlx::steel::ChannelHelper&lt; 4 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_62_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html" target="_self">pocketfft::detail::cmplx&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_63_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html" target="_self">pocketfft::detail::cmplx&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_64_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1cmplx.html" target="_self">pocketfft::detail::cmplx&lt; Thigh &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_65_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html" target="_self">mlx::core::metal::CommandEncoder</a></td><td class="desc"></td></tr>
+<tr id="row_66_" class="even"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_66_" class="arrow" onclick="dynsection.toggleFolder('66_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><b>std::complex</b></td><td class="desc"></td></tr>
+<tr id="row_66_0_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1complex128__t.html" target="_self">mlx::core::complex128_t</a></td><td class="desc"></td></tr>
+<tr id="row_66_1_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1complex64__t.html" target="_self">mlx::core::complex64_t</a></td><td class="desc"></td></tr>
+<tr id="row_67_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structcomplex64__t.html" target="_self">complex64_t</a></td><td class="desc"></td></tr>
+<tr id="row_68_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html" target="_self">pocketfft::detail::threading::concurrent_queue&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_69_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html" target="_self">pocketfft::detail::threading::concurrent_queue&lt; std::function&lt; void()&gt; &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_70_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html" target="_self">mlx::core::metal::CommandEncoder::ConcurrentContext</a></td><td class="desc"></td></tr>
+<tr id="row_71_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_conjugate.html" target="_self">Conjugate</a></td><td class="desc"></td></tr>
+<tr id="row_72_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_conjugate.html" target="_self">mlx::core::detail::Conjugate</a></td><td class="desc"></td></tr>
+<tr id="row_73_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html" target="_self">mlx::core::ContiguousIterator&lt; StrideT &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_74_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_base_info.html" target="_self">mlx::steel::Conv2DGeneralBaseInfo</a></td><td class="desc"></td></tr>
+<tr id="row_75_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_general_jump_params.html" target="_self">mlx::steel::Conv2DGeneralJumpParams</a></td><td class="desc"></td></tr>
+<tr id="row_76_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html" target="_self">mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_77_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html" target="_self">mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_78_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html" target="_self">mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_79_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html" target="_self">mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_80_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html" target="_self">mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_81_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html" target="_self">mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_82_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html" target="_self">mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_83_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cos.html" target="_self">Cos</a></td><td class="desc"></td></tr>
+<tr id="row_84_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_cos.html" target="_self">mlx::core::detail::Cos</a></td><td class="desc"></td></tr>
+<tr id="row_85_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cosh.html" target="_self">Cosh</a></td><td class="desc"></td></tr>
+<tr id="row_86_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_cosh.html" target="_self">mlx::core::detail::Cosh</a></td><td class="desc"></td></tr>
+<tr id="row_87_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_c_shape.html" target="_self">mlx::steel::CShape&lt; R, C &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_88_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_max.html" target="_self">CumMax&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_89_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_min.html" target="_self">CumMin&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_90_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_prod.html" target="_self">CumProd&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_91_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_prod_3_01bool_01_4.html" target="_self">CumProd&lt; bool &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_92_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_cum_sum.html" target="_self">CumSum&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_93_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html" target="_self">mlx::core::fast::CustomKernelShapeInfo</a></td><td class="desc"></td></tr>
+<tr id="row_94_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1array_1_1_data.html" target="_self">mlx::core::array::Data</a></td><td class="desc"></td></tr>
+<tr id="row_95_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_contiguous_reduce.html" target="_self">mlx::core::DefaultContiguousReduce&lt; T, U, Op &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_96_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_default_strided_reduce.html" target="_self">mlx::core::DefaultStridedReduce&lt; T, U, Op &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_97_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_device.html" target="_self">mlx::core::Device</a></td><td class="desc"></td></tr>
+<tr id="row_98_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html" target="_self">mlx::core::metal::Device</a></td><td class="desc"></td></tr>
+<tr id="row_99_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_device_stream.html" target="_self">mlx::core::metal::DeviceStream</a></td><td class="desc"></td></tr>
+<tr id="row_100_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_divide.html" target="_self">Divide</a></td><td class="desc"></td></tr>
+<tr id="row_101_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_divide.html" target="_self">mlx::core::detail::Divide</a></td><td class="desc"></td></tr>
+<tr id="row_102_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_div_mod.html" target="_self">DivMod</a></td><td class="desc"></td></tr>
+<tr id="row_103_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_div_op.html" target="_self">DivOp</a></td><td class="desc"></td></tr>
+<tr id="row_104_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_dtype.html" target="_self">mlx::core::Dtype</a></td><td class="desc"></td></tr>
+<tr id="row_105_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_equal.html" target="_self">Equal</a></td><td class="desc"></td></tr>
+<tr id="row_106_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_equal.html" target="_self">mlx::core::detail::Equal</a></td><td class="desc"></td></tr>
+<tr id="row_107_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf.html" target="_self">Erf</a></td><td class="desc"></td></tr>
+<tr id="row_108_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf.html" target="_self">mlx::core::detail::Erf</a></td><td class="desc"></td></tr>
+<tr id="row_109_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_erf_inv.html" target="_self">ErfInv</a></td><td class="desc"></td></tr>
+<tr id="row_110_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_erf_inv.html" target="_self">mlx::core::detail::ErfInv</a></td><td class="desc"></td></tr>
+<tr id="row_111_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_event.html" target="_self">mlx::core::Event</a></td><td class="desc"></td></tr>
+<tr id="row_112_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_c2_c.html" target="_self">pocketfft::detail::ExecC2C</a></td><td class="desc"></td></tr>
+<tr id="row_113_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_dcst.html" target="_self">pocketfft::detail::ExecDcst</a></td><td class="desc"></td></tr>
+<tr id="row_114_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_hartley.html" target="_self">pocketfft::detail::ExecHartley</a></td><td class="desc"></td></tr>
+<tr id="row_115_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_exec_r2_r.html" target="_self">pocketfft::detail::ExecR2R</a></td><td class="desc"></td></tr>
+<tr id="row_116_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_exp.html" target="_self">Exp</a></td><td class="desc"></td></tr>
+<tr id="row_117_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_exp.html" target="_self">mlx::core::detail::Exp</a></td><td class="desc"></td></tr>
+<tr id="row_118_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_expm1.html" target="_self">Expm1</a></td><td class="desc"></td></tr>
+<tr id="row_119_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_expm1.html" target="_self">mlx::core::detail::Expm1</a></td><td class="desc"></td></tr>
+<tr id="row_120_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_exp_sub_op.html" target="_self">ExpSubOp</a></td><td class="desc"></td></tr>
+<tr id="row_121_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1metal_1_1_fence.html" target="_self">mlx::core::metal::Fence</a></td><td class="desc"></td></tr>
+<tr id="row_122_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1fftblue.html" target="_self">pocketfft::detail::fftblue&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_123_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html" target="_self">mlx::core::array::Flags</a></td><td class="desc"></td></tr>
+<tr id="row_124_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor.html" target="_self">Floor</a></td><td class="desc"></td></tr>
+<tr id="row_125_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_floor.html" target="_self">mlx::core::detail::Floor</a></td><td class="desc"></td></tr>
+<tr id="row_126_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_floor_divide.html" target="_self">FloorDivide</a></td><td class="desc"></td></tr>
+<tr id="row_127_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html" target="_self">mlx::steel::GEMMAddMMParams</a></td><td class="desc"></td></tr>
+<tr id="row_128_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html" target="_self">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_129_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html" target="_self">mlx::steel::GEMMParams</a></td><td class="desc"></td></tr>
+<tr id="row_130_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html" target="_self">mlx::steel::GEMMSpiltKParams</a></td><td class="desc"></td></tr>
+<tr id="row_131_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_kernel.html" target="_self">GEMVKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_132_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_g_e_m_v_t_kernel.html" target="_self">GEMVTKernel&lt; T, out_mask_t, op_mask_t, BM, BN, SM, SN, TM, TN &gt;</a></td><td class="desc">Vector matrix multiplication </td></tr>
+<tr id="row_133_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater.html" target="_self">Greater</a></td><td class="desc"></td></tr>
+<tr id="row_134_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_greater.html" target="_self">mlx::core::detail::Greater</a></td><td class="desc"></td></tr>
+<tr id="row_135_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_greater_equal.html" target="_self">GreaterEqual</a></td><td class="desc"></td></tr>
+<tr id="row_136_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_greater_equal.html" target="_self">mlx::core::detail::GreaterEqual</a></td><td class="desc"></td></tr>
+<tr id="row_137_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1distributed_1_1_group.html" target="_self">mlx::core::distributed::Group</a></td><td class="desc">A <a class="el" href="structmlx_1_1core_1_1distributed_1_1_group.html" title="A distributed::Group represents a group of independent mlx processes that can communicate.">distributed::Group</a> represents a group of independent mlx processes that can communicate </td></tr>
+<tr id="row_138_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_imag.html" target="_self">Imag</a></td><td class="desc"></td></tr>
+<tr id="row_139_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_imag.html" target="_self">mlx::core::detail::Imag</a></td><td class="desc"></td></tr>
+<tr id="row_140_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html" target="_self">mlx::steel::ImplicitGemmConv2DParams</a></td><td class="desc"></td></tr>
+<tr id="row_141_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_indices.html" target="_self">Indices&lt; IdxT, NIDX &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_142_" class="even"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_142_" class="arrow" onclick="dynsection.toggleFolder('142_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1integral__constant.html" target="_self">mlx::steel::integral_constant&lt; T, v &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_142_0_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral.html" target="_self">mlx::steel::is_integral&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_142_1_" class="odd" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html" target="_self">mlx::steel::is_integral&lt; integral_constant&lt; T, v &gt; &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_143_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="unionmlx_1_1core_1_1detail_1_1_int_or_float.html" target="_self">mlx::core::detail::IntOrFloat</a></td><td class="desc"></td></tr>
+<tr id="row_144_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_in_tracing.html" target="_self">mlx::core::detail::InTracing</a></td><td class="desc"></td></tr>
+<tr id="row_145_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_merge_sort.html" target="_self">KernelMergeSort&lt; T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_146_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_kernel_multi_block_merge_sort.html" target="_self">KernelMultiBlockMergeSort&lt; val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_147_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1random_1_1_key_sequence.html" target="_self">mlx::core::random::KeySequence</a></td><td class="desc"></td></tr>
+<tr id="row_148_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1latch.html" target="_self">pocketfft::detail::threading::latch</a></td><td class="desc"></td></tr>
+<tr id="row_149_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html" target="_self">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_150_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_left_shift.html" target="_self">LeftShift</a></td><td class="desc"></td></tr>
+<tr id="row_151_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_left_shift.html" target="_self">mlx::core::detail::LeftShift</a></td><td class="desc"></td></tr>
+<tr id="row_152_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less.html" target="_self">Less</a></td><td class="desc"></td></tr>
+<tr id="row_153_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_less.html" target="_self">mlx::core::detail::Less</a></td><td class="desc"></td></tr>
+<tr id="row_154_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_equal.html" target="_self">LessEqual</a></td><td class="desc"></td></tr>
+<tr id="row_155_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_less_equal.html" target="_self">mlx::core::detail::LessEqual</a></td><td class="desc"></td></tr>
+<tr id="row_156_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_less_than.html" target="_self">LessThan&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_157_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits.html" target="_self">Limits&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_158_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bfloat16__t_01_4.html" target="_self">Limits&lt; bfloat16_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_159_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01bool_01_4.html" target="_self">Limits&lt; bool &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_160_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01complex64__t_01_4.html" target="_self">Limits&lt; complex64_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_161_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01float_01_4.html" target="_self">Limits&lt; float &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_162_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01half_01_4.html" target="_self">Limits&lt; half &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_163_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int16__t_01_4.html" target="_self">Limits&lt; int16_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_164_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int32__t_01_4.html" target="_self">Limits&lt; int32_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_165_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int64__t_01_4.html" target="_self">Limits&lt; int64_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_166_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01int8__t_01_4.html" target="_self">Limits&lt; int8_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_167_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint16__t_01_4.html" target="_self">Limits&lt; uint16_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_168_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint32__t_01_4.html" target="_self">Limits&lt; uint32_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_169_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint64__t_01_4.html" target="_self">Limits&lt; uint64_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_170_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_limits_3_01uint8__t_01_4.html" target="_self">Limits&lt; uint8_t &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_171_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log.html" target="_self">Log</a></td><td class="desc"></td></tr>
+<tr id="row_172_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log.html" target="_self">mlx::core::detail::Log</a></td><td class="desc"></td></tr>
+<tr id="row_173_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log10.html" target="_self">Log10</a></td><td class="desc"></td></tr>
+<tr id="row_174_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log10.html" target="_self">mlx::core::detail::Log10</a></td><td class="desc"></td></tr>
+<tr id="row_175_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log1p.html" target="_self">Log1p</a></td><td class="desc"></td></tr>
+<tr id="row_176_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log1p.html" target="_self">mlx::core::detail::Log1p</a></td><td class="desc"></td></tr>
+<tr id="row_177_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log2.html" target="_self">Log2</a></td><td class="desc"></td></tr>
+<tr id="row_178_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log2.html" target="_self">mlx::core::detail::Log2</a></td><td class="desc"></td></tr>
+<tr id="row_179_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_log_add_exp.html" target="_self">LogAddExp</a></td><td class="desc"></td></tr>
+<tr id="row_180_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_log_add_exp.html" target="_self">mlx::core::detail::LogAddExp</a></td><td class="desc"></td></tr>
+<tr id="row_181_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_and.html" target="_self">LogicalAnd</a></td><td class="desc"></td></tr>
+<tr id="row_182_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_and.html" target="_self">mlx::core::detail::LogicalAnd</a></td><td class="desc"></td></tr>
+<tr id="row_183_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_not.html" target="_self">LogicalNot</a></td><td class="desc"></td></tr>
+<tr id="row_184_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_not.html" target="_self">mlx::core::detail::LogicalNot</a></td><td class="desc"></td></tr>
+<tr id="row_185_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_logical_or.html" target="_self">LogicalOr</a></td><td class="desc"></td></tr>
+<tr id="row_186_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_logical_or.html" target="_self">mlx::core::detail::LogicalOr</a></td><td class="desc"></td></tr>
+<tr id="row_187_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html" target="_self">mlx::steel::LoopAlignment&lt; M_aligned, N_aligned, K_aligned &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_188_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_looped_elem_to_loc.html" target="_self">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_189_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html" target="_self">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_190_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html" target="_self">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_191_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1make__void.html" target="_self">metal::make_void&lt; Ts &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_192_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_max.html" target="_self">Max&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_193_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_maximum.html" target="_self">Maximum</a></td><td class="desc"></td></tr>
+<tr id="row_194_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_maximum.html" target="_self">mlx::core::detail::Maximum</a></td><td class="desc"></td></tr>
+<tr id="row_195_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_max_op.html" target="_self">MaxOp</a></td><td class="desc"></td></tr>
+<tr id="row_196_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_min.html" target="_self">Min&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_197_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_minimum.html" target="_self">Minimum</a></td><td class="desc"></td></tr>
+<tr id="row_198_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_minimum.html" target="_self">mlx::core::detail::Minimum</a></td><td class="desc"></td></tr>
+<tr id="row_199_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic.html" target="_self">mlx_atomic&lt; T, typename &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_200_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html" target="_self">mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_201_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_conv_params.html" target="_self">MLXConvParams&lt; NDIM &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_202_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_m_l_x_conv_params.html" target="_self">MLXConvParams&lt; 2 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_203_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_204_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; float, 1, TN, mlx::steel::BaseMMAFrag &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_205_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; float, TM, 1, mlx::steel::BaseMMAFrag &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_206_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html" target="_self">mlx::steel::MMATile&lt; float, TM, TN, mlx::steel::BaseMMAFrag &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_207_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_mul_op.html" target="_self">MulOp</a></td><td class="desc"></td></tr>
+<tr id="row_208_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1multi__iter.html" target="_self">pocketfft::detail::multi_iter&lt; N &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_209_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_multiply.html" target="_self">mlx::core::detail::Multiply</a></td><td class="desc"></td></tr>
+<tr id="row_210_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_multiply.html" target="_self">Multiply</a></td><td class="desc"></td></tr>
+<tr id="row_211_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_na_n_equal.html" target="_self">mlx::core::detail::NaNEqual</a></td><td class="desc"></td></tr>
+<tr id="row_212_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_na_n_equal.html" target="_self">NaNEqual</a></td><td class="desc"></td></tr>
+<tr id="row_213_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_negative.html" target="_self">mlx::core::detail::Negative</a></td><td class="desc"></td></tr>
+<tr id="row_214_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_negative.html" target="_self">Negative</a></td><td class="desc"></td></tr>
+<tr id="row_215_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_node_namer.html" target="_self">mlx::core::NodeNamer</a></td><td class="desc"></td></tr>
+<tr id="row_216_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_none.html" target="_self">None</a></td><td class="desc"></td></tr>
+<tr id="row_217_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_not_equal.html" target="_self">mlx::core::detail::NotEqual</a></td><td class="desc"></td></tr>
+<tr id="row_218_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_not_equal.html" target="_self">NotEqual</a></td><td class="desc"></td></tr>
+<tr id="row_219_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_or.html" target="_self">Or&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_220_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1pocketfft__c.html" target="_self">pocketfft::detail::pocketfft_c&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_221_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1pocketfft__r.html" target="_self">pocketfft::detail::pocketfft_r&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_222_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element.html" target="_self">metal::pointer_element&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_223_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01constant_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; constant T * &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_224_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01device_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; device T * &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_225_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01thread_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; thread T * &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_226_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmetal_1_1pointer__element_3_01threadgroup_01_t_01_5_01_4.html" target="_self">metal::pointer_element&lt; threadgroup T * &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_227_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_power.html" target="_self">mlx::core::detail::Power</a></td><td class="desc"></td></tr>
+<tr id="row_228_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_power.html" target="_self">Power</a></td><td class="desc"></td></tr>
+<tr id="row_229_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_229_" class="arrow" onclick="dynsection.toggleFolder('229_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_primitive.html" target="_self">mlx::core::Primitive</a></td><td class="desc"></td></tr>
+<tr id="row_229_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_compiled.html" target="_self">mlx::core::Compiled</a></td><td class="desc"></td></tr>
+<tr id="row_229_1_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_custom_transforms.html" target="_self">mlx::core::CustomTransforms</a></td><td class="desc"></td></tr>
+<tr id="row_229_2_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_depends.html" target="_self">mlx::core::Depends</a></td><td class="desc"></td></tr>
+<tr id="row_229_3_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_div_mod.html" target="_self">mlx::core::DivMod</a></td><td class="desc"></td></tr>
+<tr id="row_229_4_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_eigh.html" target="_self">mlx::core::Eigh</a></td><td class="desc"></td></tr>
+<tr id="row_229_5_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_q_r_f.html" target="_self">mlx::core::QRF</a></td><td class="desc"></td></tr>
+<tr id="row_229_6_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_s_v_d.html" target="_self">mlx::core::SVD</a></td><td class="desc"></td></tr>
+<tr id="row_229_7_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_split.html" target="_self">mlx::core::Split</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_229_8_" class="arrow" onclick="dynsection.toggleFolder('229_8_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_unary_primitive.html" target="_self">mlx::core::UnaryPrimitive</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_abs.html" target="_self">mlx::core::Abs</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_add.html" target="_self">mlx::core::Add</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_add_m_m.html" target="_self">mlx::core::AddMM</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arange.html" target="_self">mlx::core::Arange</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_cos.html" target="_self">mlx::core::ArcCos</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_cosh.html" target="_self">mlx::core::ArcCosh</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_sin.html" target="_self">mlx::core::ArcSin</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_sinh.html" target="_self">mlx::core::ArcSinh</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_tan.html" target="_self">mlx::core::ArcTan</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_tan2.html" target="_self">mlx::core::ArcTan2</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arc_tanh.html" target="_self">mlx::core::ArcTanh</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arg_partition.html" target="_self">mlx::core::ArgPartition</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arg_reduce.html" target="_self">mlx::core::ArgReduce</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_arg_sort.html" target="_self">mlx::core::ArgSort</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_as_strided.html" target="_self">mlx::core::AsStrided</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_as_type.html" target="_self">mlx::core::AsType</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_bitwise_binary.html" target="_self">mlx::core::BitwiseBinary</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_block_masked_m_m.html" target="_self">mlx::core::BlockMaskedMM</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_broadcast.html" target="_self">mlx::core::Broadcast</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_ceil.html" target="_self">mlx::core::Ceil</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cholesky.html" target="_self">mlx::core::Cholesky</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_concatenate.html" target="_self">mlx::core::Concatenate</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_conjugate.html" target="_self">mlx::core::Conjugate</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_contiguous.html" target="_self">mlx::core::Contiguous</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_convolution.html" target="_self">mlx::core::Convolution</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_copy.html" target="_self">mlx::core::Copy</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cos.html" target="_self">mlx::core::Cos</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_cosh.html" target="_self">mlx::core::Cosh</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_divide.html" target="_self">mlx::core::Divide</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_equal.html" target="_self">mlx::core::Equal</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf.html" target="_self">mlx::core::Erf</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_31_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_erf_inv.html" target="_self">mlx::core::ErfInv</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_exp.html" target="_self">mlx::core::Exp</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_33_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_expm1.html" target="_self">mlx::core::Expm1</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_f_f_t.html" target="_self">mlx::core::FFT</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_floor.html" target="_self">mlx::core::Floor</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_full.html" target="_self">mlx::core::Full</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather.html" target="_self">mlx::core::Gather</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_m_m.html" target="_self">mlx::core::GatherMM</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_gather_q_m_m.html" target="_self">mlx::core::GatherQMM</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater.html" target="_self">mlx::core::Greater</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_greater_equal.html" target="_self">mlx::core::GreaterEqual</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_42_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_hadamard.html" target="_self">mlx::core::Hadamard</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_43_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_imag.html" target="_self">mlx::core::Imag</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_44_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_inverse.html" target="_self">mlx::core::Inverse</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_45_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less.html" target="_self">mlx::core::Less</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_46_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_less_equal.html" target="_self">mlx::core::LessEqual</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_47_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_load.html" target="_self">mlx::core::Load</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_48_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log.html" target="_self">mlx::core::Log</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_49_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log1p.html" target="_self">mlx::core::Log1p</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_50_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_log_add_exp.html" target="_self">mlx::core::LogAddExp</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_51_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_and.html" target="_self">mlx::core::LogicalAnd</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_52_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_not.html" target="_self">mlx::core::LogicalNot</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_53_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_logical_or.html" target="_self">mlx::core::LogicalOr</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_54_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_matmul.html" target="_self">mlx::core::Matmul</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_55_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_maximum.html" target="_self">mlx::core::Maximum</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_56_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_minimum.html" target="_self">mlx::core::Minimum</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_57_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_multiply.html" target="_self">mlx::core::Multiply</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_58_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_negative.html" target="_self">mlx::core::Negative</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_59_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_not_equal.html" target="_self">mlx::core::NotEqual</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_60_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_number_of_elements.html" target="_self">mlx::core::NumberOfElements</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_61_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_pad.html" target="_self">mlx::core::Pad</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_62_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_partition.html" target="_self">mlx::core::Partition</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_63_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_power.html" target="_self">mlx::core::Power</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_64_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_quantized_matmul.html" target="_self">mlx::core::QuantizedMatmul</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_65_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_random_bits.html" target="_self">mlx::core::RandomBits</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_66_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_real.html" target="_self">mlx::core::Real</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_67_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reduce.html" target="_self">mlx::core::Reduce</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_68_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_remainder.html" target="_self">mlx::core::Remainder</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_69_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_reshape.html" target="_self">mlx::core::Reshape</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_70_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_round.html" target="_self">mlx::core::Round</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_71_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scan.html" target="_self">mlx::core::Scan</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_72_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_scatter.html" target="_self">mlx::core::Scatter</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_73_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_select.html" target="_self">mlx::core::Select</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_74_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sigmoid.html" target="_self">mlx::core::Sigmoid</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_75_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sign.html" target="_self">mlx::core::Sign</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_76_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sin.html" target="_self">mlx::core::Sin</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_77_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sinh.html" target="_self">mlx::core::Sinh</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_78_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice.html" target="_self">mlx::core::Slice</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_79_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_slice_update.html" target="_self">mlx::core::SliceUpdate</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_80_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_softmax.html" target="_self">mlx::core::Softmax</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_81_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sort.html" target="_self">mlx::core::Sort</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_82_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_sqrt.html" target="_self">mlx::core::Sqrt</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_83_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_square.html" target="_self">mlx::core::Square</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_84_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_stop_gradient.html" target="_self">mlx::core::StopGradient</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_85_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_subtract.html" target="_self">mlx::core::Subtract</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_86_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tan.html" target="_self">mlx::core::Tan</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_87_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_tanh.html" target="_self">mlx::core::Tanh</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_88_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_transpose.html" target="_self">mlx::core::Transpose</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_89_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_uniform.html" target="_self">mlx::core::Uniform</a></td><td class="desc"></td></tr>
+<tr id="row_229_8_90_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1_view.html" target="_self">mlx::core::View</a></td><td class="desc"></td></tr>
+<tr id="row_229_9_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_229_9_" class="arrow" onclick="dynsection.toggleFolder('229_9_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_dist_primitive.html" target="_self">mlx::core::distributed::DistPrimitive</a></td><td class="desc"></td></tr>
+<tr id="row_229_9_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_gather.html" target="_self">mlx::core::distributed::AllGather</a></td><td class="desc"></td></tr>
+<tr id="row_229_9_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_all_reduce.html" target="_self">mlx::core::distributed::AllReduce</a></td><td class="desc"></td></tr>
+<tr id="row_229_9_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_recv.html" target="_self">mlx::core::distributed::Recv</a></td><td class="desc"></td></tr>
+<tr id="row_229_9_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1distributed_1_1_send.html" target="_self">mlx::core::distributed::Send</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_229_10_" class="arrow" onclick="dynsection.toggleFolder('229_10_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom.html" target="_self">mlx::core::fast::Custom</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_affine_quantize.html" target="_self">mlx::core::fast::AffineQuantize</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm.html" target="_self">mlx::core::fast::LayerNorm</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html" target="_self">mlx::core::fast::LayerNormVJP</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html" target="_self">mlx::core::fast::RMSNorm</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html" target="_self">mlx::core::fast::RMSNormVJP</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_ro_p_e.html" target="_self">mlx::core::fast::RoPE</a></td><td class="desc"></td></tr>
+<tr id="row_229_10_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html" target="_self">mlx::core::fast::ScaledDotProductAttention</a></td><td class="desc"></td></tr>
+<tr id="row_229_11_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1fast_1_1_custom_kernel.html" target="_self">mlx::core::fast::CustomKernel</a></td><td class="desc"></td></tr>
+<tr id="row_230_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_print_formatter.html" target="_self">mlx::core::PrintFormatter</a></td><td class="desc"></td></tr>
+<tr id="row_231_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_prod.html" target="_self">Prod&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_232_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_quantized_block_loader.html" target="_self">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_233_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_233_" class="arrow" onclick="dynsection.toggleFolder('233_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_reader.html" target="_self">mlx::core::io::Reader</a></td><td class="desc"></td></tr>
+<tr id="row_233_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_parallel_file_reader.html" target="_self">mlx::core::io::ParallelFileReader</a></td><td class="desc"></td></tr>
+<tr id="row_234_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html" target="_self">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></td><td class="desc"></td></tr>
+<tr id="row_235_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_read_writer.html" target="_self">ReadWriter&lt; in_T, out_T, step, four_step_real &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_236_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_real.html" target="_self">mlx::core::detail::Real</a></td><td class="desc"></td></tr>
+<tr id="row_237_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_real.html" target="_self">Real</a></td><td class="desc"></td></tr>
+<tr id="row_238_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_reduction_plan.html" target="_self">mlx::core::ReductionPlan</a></td><td class="desc"></td></tr>
+<tr id="row_239_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_remainder.html" target="_self">mlx::core::detail::Remainder</a></td><td class="desc"></td></tr>
+<tr id="row_240_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_remainder.html" target="_self">Remainder</a></td><td class="desc"></td></tr>
+<tr id="row_241_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1metal_1_1_residency_set.html" target="_self">mlx::core::metal::ResidencySet</a></td><td class="desc"></td></tr>
+<tr id="row_242_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_retain_graph.html" target="_self">mlx::core::detail::RetainGraph</a></td><td class="desc"></td></tr>
+<tr id="row_243_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1rev__iter.html" target="_self">pocketfft::detail::rev_iter</a></td><td class="desc"></td></tr>
+<tr id="row_244_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1rfftp.html" target="_self">pocketfft::detail::rfftp&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_245_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_right_shift.html" target="_self">mlx::core::detail::RightShift</a></td><td class="desc"></td></tr>
+<tr id="row_246_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_right_shift.html" target="_self">RightShift</a></td><td class="desc"></td></tr>
+<tr id="row_247_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_round.html" target="_self">mlx::core::detail::Round</a></td><td class="desc"></td></tr>
+<tr id="row_248_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_round.html" target="_self">Round</a></td><td class="desc"></td></tr>
+<tr id="row_249_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_rsqrt.html" target="_self">mlx::core::detail::Rsqrt</a></td><td class="desc"></td></tr>
+<tr id="row_250_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_rsqrt.html" target="_self">Rsqrt</a></td><td class="desc"></td></tr>
+<tr id="row_251_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_scale_op.html" target="_self">ScaleOp&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_252_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1scheduler_1_1_scheduler.html" target="_self">mlx::core::scheduler::Scheduler</a></td><td class="desc"></td></tr>
+<tr id="row_253_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_select.html" target="_self">mlx::core::detail::Select</a></td><td class="desc"></td></tr>
+<tr id="row_254_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_select.html" target="_self">Select</a></td><td class="desc"></td></tr>
+<tr id="row_255_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html" target="_self">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_256_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sigmoid.html" target="_self">mlx::core::detail::Sigmoid</a></td><td class="desc"></td></tr>
+<tr id="row_257_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sigmoid.html" target="_self">Sigmoid</a></td><td class="desc"></td></tr>
+<tr id="row_258_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sign.html" target="_self">mlx::core::detail::Sign</a></td><td class="desc"></td></tr>
+<tr id="row_259_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sign.html" target="_self">Sign</a></td><td class="desc"></td></tr>
+<tr id="row_260_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1simple__iter.html" target="_self">pocketfft::detail::simple_iter</a></td><td class="desc"></td></tr>
+<tr id="row_261_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sin.html" target="_self">mlx::core::detail::Sin</a></td><td class="desc"></td></tr>
+<tr id="row_262_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sin.html" target="_self">Sin</a></td><td class="desc"></td></tr>
+<tr id="row_263_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1sincos__2pibyn.html" target="_self">pocketfft::detail::sincos_2pibyn&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_264_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sinh.html" target="_self">mlx::core::detail::Sinh</a></td><td class="desc"></td></tr>
+<tr id="row_265_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sinh.html" target="_self">Sinh</a></td><td class="desc"></td></tr>
+<tr id="row_266_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_sqrt.html" target="_self">mlx::core::detail::Sqrt</a></td><td class="desc"></td></tr>
+<tr id="row_267_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sqrt.html" target="_self">Sqrt</a></td><td class="desc"></td></tr>
+<tr id="row_268_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_square.html" target="_self">mlx::core::detail::Square</a></td><td class="desc"></td></tr>
+<tr id="row_269_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_square.html" target="_self">Square</a></td><td class="desc"></td></tr>
+<tr id="row_270_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream.html" target="_self">mlx::core::Stream</a></td><td class="desc"></td></tr>
+<tr id="row_271_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_stream_context.html" target="_self">mlx::core::StreamContext</a></td><td class="desc"></td></tr>
+<tr id="row_272_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1scheduler_1_1_stream_thread.html" target="_self">mlx::core::scheduler::StreamThread</a></td><td class="desc"></td></tr>
+<tr id="row_273_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sub_op.html" target="_self">SubOp</a></td><td class="desc"></td></tr>
+<tr id="row_274_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_subtract.html" target="_self">mlx::core::detail::Subtract</a></td><td class="desc"></td></tr>
+<tr id="row_275_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_subtract.html" target="_self">Subtract</a></td><td class="desc"></td></tr>
+<tr id="row_276_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sum.html" target="_self">Sum&lt; U &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_277_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_sum_op.html" target="_self">SumOp</a></td><td class="desc"></td></tr>
+<tr id="row_278_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst23.html" target="_self">pocketfft::detail::T_dcst23&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_279_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dcst4.html" target="_self">pocketfft::detail::T_dcst4&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_280_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dct1.html" target="_self">pocketfft::detail::T_dct1&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_281_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1_t__dst1.html" target="_self">pocketfft::detail::T_dst1&lt; T0 &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_282_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_tan.html" target="_self">mlx::core::detail::Tan</a></td><td class="desc"></td></tr>
+<tr id="row_283_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tan.html" target="_self">Tan</a></td><td class="desc"></td></tr>
+<tr id="row_284_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1detail_1_1_tanh.html" target="_self">mlx::core::detail::Tanh</a></td><td class="desc"></td></tr>
+<tr id="row_285_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_tanh.html" target="_self">Tanh</a></td><td class="desc"></td></tr>
+<tr id="row_286_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classpocketfft_1_1detail_1_1threading_1_1thread__pool.html" target="_self">pocketfft::detail::threading::thread_pool</a></td><td class="desc"></td></tr>
+<tr id="row_287_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="class_thread_pool.html" target="_self">ThreadPool</a></td><td class="desc"></td></tr>
+<tr id="row_288_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_thread_sort.html" target="_self">ThreadSort&lt; val_t, idx_t, ARG_SORT, N_PER_THREAD, CompareOp &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_289_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_add.html" target="_self">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_290_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html" target="_self">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_291_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1steel_1_1_transform_none.html" target="_self">mlx::steel::TransformNone&lt; OutT, InT &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_292_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="struct_transform_scale.html" target="_self">TransformScale&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_293_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structmlx_1_1core_1_1_type_to_dtype.html" target="_self">mlx::core::TypeToDtype&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_294_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1util.html" target="_self">pocketfft::detail::util</a></td><td class="desc"></td></tr>
+<tr id="row_295_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_v_l_e_n.html" target="_self">pocketfft::detail::VLEN&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_296_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="structpocketfft_1_1detail_1_1_v_t_y_p_e.html" target="_self">pocketfft::detail::VTYPE&lt; T &gt;</a></td><td class="desc"></td></tr>
+<tr id="row_297_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_297_" class="arrow" onclick="dynsection.toggleFolder('297_')">&#9658;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_writer.html" target="_self">mlx::core::io::Writer</a></td><td class="desc"></td></tr>
+<tr id="row_297_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classmlx_1_1core_1_1io_1_1_file_writer.html" target="_self">mlx::core::io::FileWriter</a></td><td class="desc"></td></tr>
 </table>
 </div><!-- directory -->
 </div><!-- contents -->
diff --git a/docs/build/html/index.html b/docs/build/html/index.html
index fa70de50d..eb6e99e1f 100644
--- a/docs/build/html/index.html
+++ b/docs/build/html/index.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>MLX &#8212; MLX 0.20.0 documentation</title>
+    <title>MLX &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="_static/documentation_options.js?v=174dfe6e"></script>
     <script src="_static/doctools.js?v=9a2dae69"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="next" title="Build and Install" href="install.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -132,8 +132,8 @@
       
     
     
-    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -446,7 +446,6 @@
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -523,6 +522,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -552,6 +552,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/install.html b/docs/build/html/install.html
index 475f72fcf..01951f6c4 100644
--- a/docs/build/html/install.html
+++ b/docs/build/html/install.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Build and Install &#8212; MLX 0.20.0 documentation</title>
+    <title>Build and Install &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="_static/documentation_options.js?v=174dfe6e"></script>
     <script src="_static/doctools.js?v=9a2dae69"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="MLX" href="index.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -1066,7 +1067,7 @@ Metal library by run-time compiling kernels the first time they are used in MLX
 on a given machine. Note run-time compilation incurs a cold-start cost which can
 be anwywhere from a few hundred millisecond to a few seconds depending on the
 application. Once a kernel is compiled, it will be cached by the system. The
-Metal kernel cache persists accross reboots.</p>
+Metal kernel cache persists across reboots.</p>
 </section>
 </section>
 <section id="id2">
diff --git a/docs/build/html/jit_2indexing_8h_source.html b/docs/build/html/jit_2indexing_8h_source.html
index 1961a385f..cb2c1667b 100644
--- a/docs/build/html/jit_2indexing_8h_source.html
+++ b/docs/build/html/jit_2indexing_8h_source.html
@@ -94,7 +94,7 @@ $(function(){ initResizable(false); });
 <a href="jit_2indexing_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2023-2024 Apple Inc.</span></div>
 <div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
 <div class="line"><a id="l00003" name="l00003"></a><span class="lineno"><a class="line" href="jit_2indexing_8h.html#a1a03318128191891a84707602b57b3cf">    3</a></span><span class="keyword">constexpr</span> std::string_view <a class="code hl_variable" href="jit_2indexing_8h.html#a1a03318128191891a84707602b57b3cf">gather_kernels</a> = R<span class="stringliteral">&quot;(</span></div>
-<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span><span class="stringliteral">[[kernel]] void gather{0}_{3}_{6}(</span></div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span><span class="stringliteral">[[kernel]] void gather{0}_{3}_{6}_{7}(</span></div>
 <div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="stringliteral">    const device {1}* src [[buffer(0)]],</span></div>
 <div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="stringliteral">    device {1}* out [[buffer(1)]],</span></div>
 <div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span><span class="stringliteral">    const constant int* src_shape [[buffer(2)]],</span></div>
@@ -112,7 +112,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span><span class="stringliteral">  Indices&lt;{2}, {3}&gt; idxs{{</span></div>
 <div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span><span class="stringliteral">    {{ {5} }}, idx_shapes, idx_strides, idx_contigs, idx_ndim}};</span></div>
 <div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span><span class="stringliteral"></span> </div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="stringliteral">  return gather_impl&lt;{1}, {2}, {3}, {6}&gt;(</span></div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span><span class="stringliteral">  return gather_impl&lt;{1}, {2}, {3}, {6}, {7}&gt;(</span></div>
 <div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span><span class="stringliteral">      src,</span></div>
 <div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span><span class="stringliteral">      out,</span></div>
 <div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span><span class="stringliteral">      src_shape,</span></div>
@@ -127,7 +127,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span><span class="stringliteral">)&quot;;</span></div>
 <div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span><span class="stringliteral"></span> </div>
 <div class="line"><a id="l00036" name="l00036"></a><span class="lineno"><a class="line" href="jit_2indexing_8h.html#a768c949cd650a44c6b402fc1440c1a56">   36</a></span><span class="stringliteral"></span><span class="keyword">constexpr</span> std::string_view <a class="code hl_variable" href="jit_2indexing_8h.html#a768c949cd650a44c6b402fc1440c1a56">scatter_kernels</a> = R<span class="stringliteral">&quot;(</span></div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span><span class="stringliteral">[[kernel]] void scatter{0}_{4}_updc_{7}_nwork{8}(</span></div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span><span class="stringliteral">[[kernel]] void scatter{0}_{4}_updc_{7}_nwork{8}_{9}(</span></div>
 <div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span><span class="stringliteral">    const device {1}* updates [[buffer(1)]],</span></div>
 <div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span><span class="stringliteral">    device mlx_atomic&lt;{1}&gt;* out [[buffer(2)]],</span></div>
 <div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span><span class="stringliteral">    const constant int* upd_shape [[buffer(3)]],</span></div>
@@ -147,7 +147,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span><span class="stringliteral">    uint2 gid [[thread_position_in_grid]]) {{</span></div>
 <div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="stringliteral">  Indices&lt;{2}, {4}&gt; idxs{{ {{ {6} }}, idx_shapes, idx_strides, idx_contigs, idx_ndim}};</span></div>
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span><span class="stringliteral"></span> </div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span><span class="stringliteral">  return scatter_impl&lt;{1}, {2}, {3}, {4}, {7}, {8}&gt;(</span></div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span><span class="stringliteral">  return scatter_impl&lt;{1}, {2}, {3}, {4}, {7}, {8}, {9}&gt;(</span></div>
 <div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span><span class="stringliteral">      updates,</span></div>
 <div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span><span class="stringliteral">      out,</span></div>
 <div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span><span class="stringliteral">      upd_shape,</span></div>
diff --git a/docs/build/html/kernels_2gemv__masked_8h_source.html b/docs/build/html/kernels_2gemv__masked_8h_source.html
index d5acec0c8..7caa0bc1f 100644
--- a/docs/build/html/kernels_2gemv__masked_8h_source.html
+++ b/docs/build/html/kernels_2gemv__masked_8h_source.html
@@ -777,12 +777,12 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00663" name="l00663"></a><span class="lineno">  663</span> </div>
 <div class="line"><a id="l00664" name="l00664"></a><span class="lineno">  664</span>  <span class="comment">// Update batch offsets</span></div>
 <div class="line"><a id="l00665" name="l00665"></a><span class="lineno">  665</span>  <span class="keywordflow">if</span> (kDoNCBatch) {</div>
-<div class="line"><a id="l00666" name="l00666"></a><span class="lineno">  666</span>    in_vec += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, vector_batch_stride, batch_ndim);</div>
-<div class="line"><a id="l00667" name="l00667"></a><span class="lineno">  667</span>    mat += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, matrix_batch_stride, batch_ndim);</div>
+<div class="line"><a id="l00666" name="l00666"></a><span class="lineno">  666</span>    in_vec += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, vector_batch_stride, batch_ndim);</div>
+<div class="line"><a id="l00667" name="l00667"></a><span class="lineno">  667</span>    mat += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, matrix_batch_stride, batch_ndim);</div>
 <div class="line"><a id="l00668" name="l00668"></a><span class="lineno">  668</span> </div>
 <div class="line"><a id="l00669" name="l00669"></a><span class="lineno">  669</span>    <span class="keywordflow">if</span> (has_output_mask) {</div>
 <div class="line"><a id="l00670" name="l00670"></a><span class="lineno">  670</span>      out_mask +=</div>
-<div class="line"><a id="l00671" name="l00671"></a><span class="lineno">  671</span>          <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, mask_batch_strides, batch_ndim);</div>
+<div class="line"><a id="l00671" name="l00671"></a><span class="lineno">  671</span>          <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, mask_batch_strides, batch_ndim);</div>
 <div class="line"><a id="l00672" name="l00672"></a><span class="lineno">  672</span>      mask_batch_strides += batch_ndim;</div>
 <div class="line"><a id="l00673" name="l00673"></a><span class="lineno">  673</span>    }</div>
 <div class="line"><a id="l00674" name="l00674"></a><span class="lineno">  674</span> </div>
@@ -876,12 +876,12 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00763" name="l00763"></a><span class="lineno">  763</span> </div>
 <div class="line"><a id="l00764" name="l00764"></a><span class="lineno">  764</span>  <span class="comment">// Update batch offsets</span></div>
 <div class="line"><a id="l00765" name="l00765"></a><span class="lineno">  765</span>  <span class="keywordflow">if</span> (kDoNCBatch) {</div>
-<div class="line"><a id="l00766" name="l00766"></a><span class="lineno">  766</span>    in_vec += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, vector_batch_stride, batch_ndim);</div>
-<div class="line"><a id="l00767" name="l00767"></a><span class="lineno">  767</span>    mat += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, matrix_batch_stride, batch_ndim);</div>
+<div class="line"><a id="l00766" name="l00766"></a><span class="lineno">  766</span>    in_vec += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, vector_batch_stride, batch_ndim);</div>
+<div class="line"><a id="l00767" name="l00767"></a><span class="lineno">  767</span>    mat += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, matrix_batch_stride, batch_ndim);</div>
 <div class="line"><a id="l00768" name="l00768"></a><span class="lineno">  768</span> </div>
 <div class="line"><a id="l00769" name="l00769"></a><span class="lineno">  769</span>    <span class="keywordflow">if</span> (has_output_mask) {</div>
 <div class="line"><a id="l00770" name="l00770"></a><span class="lineno">  770</span>      out_mask +=</div>
-<div class="line"><a id="l00771" name="l00771"></a><span class="lineno">  771</span>          <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, mask_batch_strides, batch_ndim);</div>
+<div class="line"><a id="l00771" name="l00771"></a><span class="lineno">  771</span>          <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, mask_batch_strides, batch_ndim);</div>
 <div class="line"><a id="l00772" name="l00772"></a><span class="lineno">  772</span>      mask_batch_strides += batch_ndim;</div>
 <div class="line"><a id="l00773" name="l00773"></a><span class="lineno">  773</span>    }</div>
 <div class="line"><a id="l00774" name="l00774"></a><span class="lineno">  774</span> </div>
@@ -933,13 +933,13 @@ $(function(){ initResizable(false); });
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html">utils.h</a></div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html_aaf4974425147d6f26d031691e321637f"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:7</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
 <div class="ttc" id="akernels_2gemv__masked_8h_html_a0386011c52d03e60885a31e6fbd903dd"><div class="ttname"><a href="kernels_2gemv__masked_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a></div><div class="ttdeci">#define MLX_MTL_CONST</div><div class="ttdef"><b>Definition</b> gemv_masked.h:7</div></div>
 <div class="ttc" id="akernels_2gemv__masked_8h_html_a069b682d7d21827461544817d722bfd3"><div class="ttname"><a href="kernels_2gemv__masked_8h.html#a069b682d7d21827461544817d722bfd3">MLX_MTL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define MLX_MTL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> gemv_masked.h:8</div></div>
 <div class="ttc" id="akernels_2gemv__masked_8h_html_a0c8d353fc453e448b2d0ed9a19431b63"><div class="ttname"><a href="kernels_2gemv__masked_8h.html#a0c8d353fc453e448b2d0ed9a19431b63">gemv_t_masked</a></div><div class="ttdeci">void gemv_t_masked(const device T *mat, const device T *in_vec, device T *out_vec, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;marix_ld, const constant int &amp;batch_ndim, const constant int *batch_shape, const constant size_t *vector_batch_stride, const constant size_t *matrix_batch_stride, const device out_mask_t *out_mask, const device op_mask_t *mat_mask, const device op_mask_t *vec_mask, const constant int *mask_strides, const constant size_t *mask_batch_strides, uint3 tid, uint3 lid, uint simd_gid, uint simd_lid)</div><div class="ttdoc">Vector matrix multiplication.</div><div class="ttdef"><b>Definition</b> gemv_masked.h:736</div></div>
 <div class="ttc" id="akernels_2gemv__masked_8h_html_ab3070d14cdecb1dd7dc220a551da6b7b"><div class="ttname"><a href="kernels_2gemv__masked_8h.html#ab3070d14cdecb1dd7dc220a551da6b7b">gemv_masked</a></div><div class="ttdeci">void gemv_masked(const device T *mat, const device T *in_vec, device T *out_vec, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;marix_ld, const constant int &amp;batch_ndim, const constant int *batch_shape, const constant size_t *vector_batch_stride, const constant size_t *matrix_batch_stride, const device out_mask_t *out_mask, const device op_mask_t *mat_mask, const device op_mask_t *vec_mask, const constant int *mask_strides, const constant size_t *mask_batch_strides, uint3 tid, uint3 lid, uint simd_gid, uint simd_lid)</div><div class="ttdoc">Matrix vector multiplication.</div><div class="ttdef"><b>Definition</b> gemv_masked.h:636</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_af6e2dd7ae087aba6abac4f0350b7611c"><div class="ttname"><a href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:391</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_af6e2dd7ae087aba6abac4f0350b7611c"><div class="ttname"><a href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:377</div></div>
 <div class="ttc" id="astruct___no_mask_html"><div class="ttname"><a href="struct___no_mask.html">_NoMask</a></div><div class="ttdef"><b>Definition</b> gemv_masked.h:10</div></div>
 <div class="ttc" id="astruct___no_mask_html_a0c4a4557d5c97ceafe3a2c4e521cdf7e"><div class="ttname"><a href="struct___no_mask.html#a0c4a4557d5c97ceafe3a2c4e521cdf7e">_NoMask::x</a></div><div class="ttdeci">char x</div><div class="ttdef"><b>Definition</b> gemv_masked.h:11</div></div>
 <div class="ttc" id="astruct_g_e_m_v_kernel_html"><div class="ttname"><a href="struct_g_e_m_v_kernel.html">GEMVKernel</a></div><div class="ttdef"><b>Definition</b> gemv_masked.h:48</div></div>
diff --git a/docs/build/html/kernels_2indexing_8h.html b/docs/build/html/kernels_2indexing_8h.html
index 320025e56..edd4a8bc3 100644
--- a/docs/build/html/kernels_2indexing_8h.html
+++ b/docs/build/html/kernels_2indexing_8h.html
@@ -105,13 +105,13 @@ Classes</h2></td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:ab41167dc537c06fbdb4df100972393df" id="r_ab41167dc537c06fbdb4df100972393df"><td class="memTemplParams" colspan="2">template&lt;typename IdxT &gt; </td></tr>
-<tr class="memitem:ab41167dc537c06fbdb4df100972393df"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC size_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ab41167dc537c06fbdb4df100972393df">offset_neg_idx</a> (IdxT idx, size_t size)</td></tr>
-<tr class="separator:ab41167dc537c06fbdb4df100972393df"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a58a65ea6215999cd4ccb4fe757cc2dc8" id="r_a58a65ea6215999cd4ccb4fe757cc2dc8"><td class="memTemplParams" colspan="2">template&lt;typename IdxT &gt; </td></tr>
+<tr class="memitem:a58a65ea6215999cd4ccb4fe757cc2dc8"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC size_t&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a58a65ea6215999cd4ccb4fe757cc2dc8">offset_neg_idx</a> (IdxT idx, int size)</td></tr>
+<tr class="separator:a58a65ea6215999cd4ccb4fe757cc2dc8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="ab41167dc537c06fbdb4df100972393df" name="ab41167dc537c06fbdb4df100972393df"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ab41167dc537c06fbdb4df100972393df">&#9670;&#160;</a></span>offset_neg_idx()</h2>
+<a id="a58a65ea6215999cd4ccb4fe757cc2dc8" name="a58a65ea6215999cd4ccb4fe757cc2dc8"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a58a65ea6215999cd4ccb4fe757cc2dc8">&#9670;&#160;</a></span>offset_neg_idx()</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -126,7 +126,7 @@ template&lt;typename IdxT &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>size</em></span>&#160;)</td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>size</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
diff --git a/docs/build/html/kernels_2indexing_8h_source.html b/docs/build/html/kernels_2indexing_8h_source.html
index 732d9cc6f..753859f37 100644
--- a/docs/build/html/kernels_2indexing_8h_source.html
+++ b/docs/build/html/kernels_2indexing_8h_source.html
@@ -110,7 +110,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span> </div>
 <div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> IdxT&gt;</div>
 <div class="foldopen" id="foldopen00017" data-start="{" data-end="}">
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno"><a class="line" href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">   17</a></span>METAL_FUNC <span class="keywordtype">size_t</span> <a class="code hl_function" href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">offset_neg_idx</a>(IdxT idx, <span class="keywordtype">size_t</span> size) {</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno"><a class="line" href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">   17</a></span>METAL_FUNC <span class="keywordtype">size_t</span> <a class="code hl_function" href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">offset_neg_idx</a>(IdxT idx, <span class="keywordtype">int</span> size) {</div>
 <div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>  <span class="keywordflow">if</span> (is_unsigned_v&lt;IdxT&gt;) {</div>
 <div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>    <span class="keywordflow">return</span> idx;</div>
 <div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>  } <span class="keywordflow">else</span> {</div>
@@ -118,7 +118,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>  }</div>
 <div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>}</div>
 </div>
-<div class="ttc" id="akernels_2indexing_8h_html_ab41167dc537c06fbdb4df100972393df"><div class="ttname"><a href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">offset_neg_idx</a></div><div class="ttdeci">METAL_FUNC size_t offset_neg_idx(IdxT idx, size_t size)</div><div class="ttdef"><b>Definition</b> indexing.h:17</div></div>
+<div class="ttc" id="akernels_2indexing_8h_html_a58a65ea6215999cd4ccb4fe757cc2dc8"><div class="ttname"><a href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">offset_neg_idx</a></div><div class="ttdeci">METAL_FUNC size_t offset_neg_idx(IdxT idx, int size)</div><div class="ttdef"><b>Definition</b> indexing.h:17</div></div>
 <div class="ttc" id="astruct_indices_html"><div class="ttname"><a href="struct_indices.html">Indices</a></div><div class="ttdef"><b>Definition</b> indexing.h:8</div></div>
 <div class="ttc" id="astruct_indices_html_a255e340a39c6ac28ef2c232b106f85d1"><div class="ttname"><a href="struct_indices.html#a255e340a39c6ac28ef2c232b106f85d1">Indices::row_contiguous</a></div><div class="ttdeci">const constant bool * row_contiguous</div><div class="ttdef"><b>Definition</b> indexing.h:12</div></div>
 <div class="ttc" id="astruct_indices_html_a5ab170f1a77636180889ddfffd4f7d2f"><div class="ttname"><a href="struct_indices.html#a5ab170f1a77636180889ddfffd4f7d2f">Indices::shapes</a></div><div class="ttdeci">const constant int * shapes</div><div class="ttdef"><b>Definition</b> indexing.h:10</div></div>
diff --git a/docs/build/html/kernels_2softmax_8h_source.html b/docs/build/html/kernels_2softmax_8h_source.html
index 554cc1228..72c0f303a 100644
--- a/docs/build/html/kernels_2softmax_8h_source.html
+++ b/docs/build/html/kernels_2softmax_8h_source.html
@@ -287,12 +287,12 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>  }</div>
 <div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8e5a4b0fb5d018d7b078d147efe4f1e3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a></div><div class="ttdeci">T ceildiv(T N, U M)</div><div class="ttdoc">Compute ceil((float)N/(float)M)</div><div class="ttdef"><b>Definition</b> utils.h:272</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8e5a4b0fb5d018d7b078d147efe4f1e3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a></div><div class="ttdeci">T ceildiv(T N, U M)</div><div class="ttdoc">Compute ceil((float)N/(float)M)</div><div class="ttdef"><b>Definition</b> utils.h:313</div></div>
 <div class="ttc" id="akernels_2softmax_8h_html_a440d4031ee5e86159a4dd715e44a438b"><div class="ttname"><a href="kernels_2softmax_8h.html#a440d4031ee5e86159a4dd715e44a438b">softmax_exp</a></div><div class="ttdeci">T softmax_exp(T x)</div><div class="ttdef"><b>Definition</b> softmax.h:4</div></div>
 <div class="ttc" id="akernels_2softmax_8h_html_a815fe70f879f318e5d6e99acf043f52b"><div class="ttname"><a href="kernels_2softmax_8h.html#a815fe70f879f318e5d6e99acf043f52b">softmax_single_row</a></div><div class="ttdeci">void softmax_single_row(const device T *in, device T *out, constant int &amp;axis_size, uint gid, uint _lid, uint simd_lane_id, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> softmax.h:11</div></div>
 <div class="ttc" id="akernels_2softmax_8h_html_a8c47b0924ebfeebcca25f3dd17373276"><div class="ttname"><a href="kernels_2softmax_8h.html#a8c47b0924ebfeebcca25f3dd17373276">softmax_looped</a></div><div class="ttdeci">void softmax_looped(const device T *in, device T *out, constant int &amp;axis_size, uint gid, uint lid, uint lsize, uint simd_lane_id, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> softmax.h:101</div></div>
 <div class="ttc" id="aquantized_8h_html_a62969a218d93680f5e35d0c61b160b99"><div class="ttname"><a href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a></div><div class="ttdeci">static constant constexpr const int SIMD_SIZE</div><div class="ttdef"><b>Definition</b> quantized.h:10</div></div>
-<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:17</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/kernels_2steel_2conv_2kernels_2steel__conv_8h_source.html b/docs/build/html/kernels_2steel_2conv_2kernels_2steel__conv_8h_source.html
index 1e6218fbd..a7d1db87e 100644
--- a/docs/build/html/kernels_2steel_2conv_2kernels_2steel__conv_8h_source.html
+++ b/docs/build/html/kernels_2steel_2conv_2kernels_2steel__conv_8h_source.html
@@ -270,9 +270,9 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>}</div>
 </div>
 <div class="ttc" id="akernels_2steel_2conv_2kernels_2steel__conv_8h_html_a5728711d1c2ee4038457babb7ac12888"><div class="ttname"><a href="kernels_2steel_2conv_2kernels_2steel__conv_8h.html#a5728711d1c2ee4038457babb7ac12888">implicit_gemm_conv_2d</a></div><div class="ttdeci">void implicit_gemm_conv_2d(const device T *A, const device T *B, device T *C, const constant MLXConvParams&lt; 2 &gt; *params, const constant ImplicitGemmConv2DParams *gemm_params, uint3 tid, uint3 lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> steel_conv.h:17</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> loader_channel_l.h:14</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> attn.h:19</div></div>
 <div class="ttc" id="astruct_m_l_x_conv_params_html"><div class="ttname"><a href="struct_m_l_x_conv_params.html">MLXConvParams</a></div><div class="ttdef"><b>Definition</b> params.h:6</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_m_m_a_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a></div><div class="ttdef"><b>Definition</b> mma.h:377</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html">mlx::steel::Conv2DInputBlockLoaderLargeFilter</a></div><div class="ttdef"><b>Definition</b> loader_channel_l.h:23</div></div>
diff --git a/docs/build/html/kernels_8h.html b/docs/build/html/kernels_8h.html
index 4a5c35d1c..d86e3bd63 100644
--- a/docs/build/html/kernels_8h.html
+++ b/docs/build/html/kernels_8h.html
@@ -129,10 +129,10 @@ Functions</h2></td></tr>
 <tr class="separator:a84ebe6275218070f0ea320f126f64e22"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:afb57825bb763050cc9a9d194aa41ac36" id="r_afb57825bb763050cc9a9d194aa41ac36"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#afb57825bb763050cc9a9d194aa41ac36">mlx::core::get_mb_sort_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;idx, int bn, int tn)</td></tr>
 <tr class="separator:afb57825bb763050cc9a9d194aa41ac36"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3bd386cb6db09f636963ce66ceaf8647" id="r_a3bd386cb6db09f636963ce66ceaf8647"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647">mlx::core::get_reduce_init_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out)</td></tr>
-<tr class="separator:a3bd386cb6db09f636963ce66ceaf8647"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7aa91fcfe8b9caa42d60a957f11bfe6b" id="r_a7aa91fcfe8b9caa42d60a957f11bfe6b"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b">mlx::core::get_reduce_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, int ndim=-1, int bm=-1, int bn=-1)</td></tr>
-<tr class="separator:a7aa91fcfe8b9caa42d60a957f11bfe6b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae0470605dc819efeb6510183619f0299" id="r_ae0470605dc819efeb6510183619f0299"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299">mlx::core::get_reduce_init_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;out_type)</td></tr>
+<tr class="separator:ae0470605dc819efeb6510183619f0299"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1be32ba7d67137dde7ac191dfe83ff49" id="r_a1be32ba7d67137dde7ac191dfe83ff49"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49">mlx::core::get_reduce_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;in_type, const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;out_type, const std::string &amp;idx_t, int ndim=-1, int bm=-1, int bn=-1)</td></tr>
+<tr class="separator:a1be32ba7d67137dde7ac191dfe83ff49"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a84fa8e0aee321a9d614433a0b933103b" id="r_a84fa8e0aee321a9d614433a0b933103b"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b">mlx::core::get_steel_gemm_fused_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;hash_name, const <a class="el" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">metal::MTLFCList</a> &amp;func_consts, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn)</td></tr>
 <tr class="separator:a84fa8e0aee321a9d614433a0b933103b"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:af48c6f2f72b61dbd6766e4f5fea85df5" id="r_af48c6f2f72b61dbd6766e4f5fea85df5"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5">mlx::core::get_steel_gemm_splitk_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn, bool mn_aligned, bool k_aligned)</td></tr>
@@ -143,7 +143,7 @@ Functions</h2></td></tr>
 <tr class="separator:ab5f60614e965144b451930fdf935e08d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:adce79d220672f5f3c65cc31d145ca9c4" id="r_adce79d220672f5f3c65cc31d145ca9c4"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4">mlx::core::get_steel_conv_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, int bm, int bn, int bk, int wm, int wn, int n_channel_specialization, bool small_filter)</td></tr>
 <tr class="separator:adce79d220672f5f3c65cc31d145ca9c4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a90c24e0d0b99b68fad9deefcf4d3e818" id="r_a90c24e0d0b99b68fad9deefcf4d3e818"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818">mlx::core::get_gemv_masked_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_op, bool transpose_mat, int bm, int bn, int sm, int sn, int tm, int tn, bool contiguous)</td></tr>
+<tr class="memitem:a90c24e0d0b99b68fad9deefcf4d3e818" id="r_a90c24e0d0b99b68fad9deefcf4d3e818"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818">mlx::core::get_gemv_masked_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_op, bool transpose_mat, int bm, int bn, int sm, int sn, int tm, int tn, bool <a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a>)</td></tr>
 <tr class="separator:a90c24e0d0b99b68fad9deefcf4d3e818"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:abce2b67044ee06a7bbe7a91ec7c8c48d" id="r_abce2b67044ee06a7bbe7a91ec7c8c48d"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d">mlx::core::get_steel_conv_general_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, int bm, int bn, int bk, int wm, int wn)</td></tr>
 <tr class="separator:abce2b67044ee06a7bbe7a91ec7c8c48d"><td class="memSeparator" colspan="2">&#160;</td></tr>
diff --git a/docs/build/html/kernels_8h_source.html b/docs/build/html/kernels_8h_source.html
index de9608168..6d859b4a4 100644
--- a/docs/build/html/kernels_8h_source.html
+++ b/docs/build/html/kernels_8h_source.html
@@ -169,175 +169,177 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    <span class="keywordtype">int</span> bn,</div>
 <div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    <span class="keywordtype">int</span> tn);</div>
 <div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span> </div>
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647">   79</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647">get_reduce_init_kernel</a>(</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299">   79</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299">get_reduce_init_kernel</a>(</div>
 <div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
 <div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
 <div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    <span class="keyword">const</span> std::string&amp; func_name,</div>
 <div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    <span class="keyword">const</span> std::string&amp; op_name,</div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    <span class="keyword">const</span> <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a>&amp; out_type);</div>
 <div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span> </div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b">   86</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b">get_reduce_kernel</a>(</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49">   86</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49">get_reduce_kernel</a>(</div>
 <div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
 <div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
 <div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>    <span class="keyword">const</span> std::string&amp; func_name,</div>
 <div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>    <span class="keyword">const</span> std::string&amp; op_name,</div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>    <span class="keywordtype">int</span> ndim = -1,</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>    <span class="keywordtype">int</span> bm = -1,</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>    <span class="keywordtype">int</span> bn = -1);</div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span> </div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b">   97</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b">get_steel_gemm_fused_kernel</a>(</div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    <span class="keyword">const</span> std::string&amp; hash_name,</div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>    <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">metal::MTLFCList</a>&amp; func_consts,</div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    <span class="keywordtype">bool</span> transpose_a,</div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    <span class="keywordtype">bool</span> transpose_b,</div>
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    <span class="keywordtype">int</span> bm,</div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    <span class="keywordtype">int</span> bn,</div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>    <span class="keywordtype">int</span> bk,</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>    <span class="keywordtype">int</span> wm,</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>    <span class="keywordtype">int</span> wn);</div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span> </div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5">  111</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5">get_steel_gemm_splitk_kernel</a>(</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>    <span class="keywordtype">bool</span> transpose_a,</div>
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    <span class="keywordtype">bool</span> transpose_b,</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    <span class="keywordtype">int</span> bm,</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    <span class="keywordtype">int</span> bn,</div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>    <span class="keywordtype">int</span> bk,</div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    <span class="keywordtype">int</span> wm,</div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    <span class="keywordtype">int</span> wn,</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    <span class="keywordtype">bool</span> mn_aligned,</div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    <span class="keywordtype">bool</span> k_aligned);</div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span> </div>
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a195b86cad5bb99aa1bcd23952305af6b">  126</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a195b86cad5bb99aa1bcd23952305af6b">get_steel_gemm_splitk_accum_kernel</a>(</div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    <span class="keywordtype">bool</span> axbpy);</div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span> </div>
-<div class="line"><a id="l00133" name="l00133"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ab5f60614e965144b451930fdf935e08d">  133</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#ab5f60614e965144b451930fdf935e08d">get_steel_gemm_masked_kernel</a>(</div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_out,</div>
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_op,</div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    <span class="keywordtype">bool</span> transpose_a,</div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>    <span class="keywordtype">bool</span> transpose_b,</div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>    <span class="keywordtype">int</span> bm,</div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>    <span class="keywordtype">int</span> bn,</div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>    <span class="keywordtype">int</span> bk,</div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>    <span class="keywordtype">int</span> wm,</div>
-<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>    <span class="keywordtype">int</span> wn,</div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>    <span class="keywordtype">bool</span> mn_aligned,</div>
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>    <span class="keywordtype">bool</span> k_aligned);</div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span> </div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4">  149</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4">get_steel_conv_kernel</a>(</div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    <span class="keywordtype">int</span> bm,</div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>    <span class="keywordtype">int</span> bn,</div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    <span class="keywordtype">int</span> bk,</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    <span class="keywordtype">int</span> wm,</div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    <span class="keywordtype">int</span> wn,</div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    <span class="keywordtype">int</span> n_channel_specialization,</div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    <span class="keywordtype">bool</span> small_filter);</div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span> </div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818">  161</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818">get_gemv_masked_kernel</a>(</div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_out,</div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_op,</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    <span class="keywordtype">bool</span> transpose_mat,</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    <span class="keywordtype">int</span> bm,</div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    <span class="keywordtype">int</span> bn,</div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>    <span class="keywordtype">int</span> sm,</div>
-<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>    <span class="keywordtype">int</span> sn,</div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>    <span class="keywordtype">int</span> tm,</div>
-<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    <span class="keywordtype">int</span> tn,</div>
-<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>    <span class="keywordtype">bool</span> contiguous);</div>
-<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span> </div>
-<div class="line"><a id="l00176" name="l00176"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d">  176</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d">get_steel_conv_general_kernel</a>(</div>
-<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>    <span class="keywordtype">int</span> bm,</div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>    <span class="keywordtype">int</span> bn,</div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>    <span class="keywordtype">int</span> bk,</div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>    <span class="keywordtype">int</span> wm,</div>
-<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>    <span class="keywordtype">int</span> wn);</div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span> </div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f">  186</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f">get_fft_kernel</a>(</div>
-<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>    <span class="keyword">const</span> std::string&amp; hash_name,</div>
-<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>    <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">metal::MTLFCList</a>&amp; func_consts,</div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>    <span class="keyword">const</span> std::string&amp; template_def);</div>
-<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span> </div>
-<div class="line"><a id="l00193" name="l00193"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e">  193</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e">get_quantized_kernel</a>(</div>
-<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
-<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>    <span class="keyword">const</span> std::string&amp; template_def);</div>
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span> </div>
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span><span class="comment">// Create a GPU kernel template definition for JIT compilation</span></div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span>... Args&gt;</div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>std::string</div>
-<div class="foldopen" id="foldopen00201" data-start="{" data-end="}">
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032">  201</a></span><a class="code hl_function" href="namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032">get_template_definition</a>(std::string name, std::string func, Args... args) {</div>
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>  std::ostringstream s;</div>
-<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>  s &lt;&lt; func &lt;&lt; <span class="stringliteral">&quot;&lt;&quot;</span>;</div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>  <span class="keywordtype">bool</span> first = <span class="keyword">true</span>;</div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>  <span class="keyword">auto</span> add_arg = [&amp;s, &amp;first](<span class="keyword">const</span> <span class="keyword">auto</span>&amp; arg) {</div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>    <span class="keywordflow">if</span> (!first) {</div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>      s &lt;&lt; <span class="stringliteral">&quot;, &quot;</span>;</div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>    }</div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>    first = <span class="keyword">false</span>;</div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>    s &lt;&lt; arg;</div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>  };</div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>  (add_arg(args), ...);</div>
-<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>  s &lt;&lt; <span class="stringliteral">&quot;&gt;&quot;</span>;</div>
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>  <span class="keywordflow">return</span> fmt::format(</div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>      <span class="stringliteral">&quot;\ntemplate [[host_name(\&quot;{0}\&quot;)]] [[kernel]] decltype({1}) {1};\n&quot;</span>,</div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>      name,</div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>      s.str());</div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>}</div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>    <span class="keyword">const</span> <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a>&amp; in_type,</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>    <span class="keyword">const</span> <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a>&amp; out_type,</div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>    <span class="keyword">const</span> std::string&amp; idx_t,</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>    <span class="keywordtype">int</span> ndim = -1,</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>    <span class="keywordtype">int</span> bm = -1,</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>    <span class="keywordtype">int</span> bn = -1);</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span> </div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b">   98</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b">get_steel_gemm_fused_kernel</a>(</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>    <span class="keyword">const</span> std::string&amp; hash_name,</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">metal::MTLFCList</a>&amp; func_consts,</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    <span class="keywordtype">bool</span> transpose_a,</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    <span class="keywordtype">bool</span> transpose_b,</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    <span class="keywordtype">int</span> bm,</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>    <span class="keywordtype">int</span> bn,</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>    <span class="keywordtype">int</span> bk,</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>    <span class="keywordtype">int</span> wm,</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>    <span class="keywordtype">int</span> wn);</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span> </div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5">  112</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5">get_steel_gemm_splitk_kernel</a>(</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    <span class="keywordtype">bool</span> transpose_a,</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    <span class="keywordtype">bool</span> transpose_b,</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    <span class="keywordtype">int</span> bm,</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>    <span class="keywordtype">int</span> bn,</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    <span class="keywordtype">int</span> bk,</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    <span class="keywordtype">int</span> wm,</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    <span class="keywordtype">int</span> wn,</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    <span class="keywordtype">bool</span> mn_aligned,</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>    <span class="keywordtype">bool</span> k_aligned);</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span> </div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a195b86cad5bb99aa1bcd23952305af6b">  127</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a195b86cad5bb99aa1bcd23952305af6b">get_steel_gemm_splitk_accum_kernel</a>(</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>    <span class="keywordtype">bool</span> axbpy);</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span> </div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ab5f60614e965144b451930fdf935e08d">  134</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#ab5f60614e965144b451930fdf935e08d">get_steel_gemm_masked_kernel</a>(</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_out,</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_op,</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>    <span class="keywordtype">bool</span> transpose_a,</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>    <span class="keywordtype">bool</span> transpose_b,</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>    <span class="keywordtype">int</span> bm,</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>    <span class="keywordtype">int</span> bn,</div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>    <span class="keywordtype">int</span> bk,</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>    <span class="keywordtype">int</span> wm,</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>    <span class="keywordtype">int</span> wn,</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>    <span class="keywordtype">bool</span> mn_aligned,</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>    <span class="keywordtype">bool</span> k_aligned);</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span> </div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4">  150</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4">get_steel_conv_kernel</a>(</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>    <span class="keywordtype">int</span> bm,</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    <span class="keywordtype">int</span> bn,</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    <span class="keywordtype">int</span> bk,</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    <span class="keywordtype">int</span> wm,</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    <span class="keywordtype">int</span> wn,</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    <span class="keywordtype">int</span> n_channel_specialization,</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>    <span class="keywordtype">bool</span> small_filter);</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span> </div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818">  162</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818">get_gemv_masked_kernel</a>(</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_out,</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; mask_op,</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    <span class="keywordtype">bool</span> transpose_mat,</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    <span class="keywordtype">int</span> bm,</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>    <span class="keywordtype">int</span> bn,</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>    <span class="keywordtype">int</span> sm,</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>    <span class="keywordtype">int</span> sn,</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    <span class="keywordtype">int</span> tm,</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>    <span class="keywordtype">int</span> tn,</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>    <span class="keywordtype">bool</span> <a class="code hl_function" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a>);</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span> </div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d">  177</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d">get_steel_conv_general_kernel</a>(</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out,</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>    <span class="keywordtype">int</span> bm,</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>    <span class="keywordtype">int</span> bn,</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>    <span class="keywordtype">int</span> bk,</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>    <span class="keywordtype">int</span> wm,</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>    <span class="keywordtype">int</span> wn);</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span> </div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f">  187</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f">get_fft_kernel</a>(</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>    <span class="keyword">const</span> std::string&amp; hash_name,</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>    <span class="keyword">const</span> <a class="code hl_typedef" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">metal::MTLFCList</a>&amp; func_consts,</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>    <span class="keyword">const</span> std::string&amp; template_def);</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span> </div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e">  194</a></span>MTL::ComputePipelineState* <a class="code hl_function" href="namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e">get_quantized_kernel</a>(</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    <a class="code hl_class" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a>&amp; d,</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>    <span class="keyword">const</span> std::string&amp; kernel_name,</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>    <span class="keyword">const</span> std::string&amp; template_def);</div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span> </div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span><span class="comment">// Create a GPU kernel template definition for JIT compilation</span></div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span>... Args&gt;</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>std::string</div>
+<div class="foldopen" id="foldopen00202" data-start="{" data-end="}">
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032">  202</a></span><a class="code hl_function" href="namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032">get_template_definition</a>(std::string name, std::string func, Args... args) {</div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>  std::ostringstream s;</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>  s &lt;&lt; func &lt;&lt; <span class="stringliteral">&quot;&lt;&quot;</span>;</div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>  <span class="keywordtype">bool</span> first = <span class="keyword">true</span>;</div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>  <span class="keyword">auto</span> add_arg = [&amp;s, &amp;first](<span class="keyword">const</span> <span class="keyword">auto</span>&amp; arg) {</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>    <span class="keywordflow">if</span> (!first) {</div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>      s &lt;&lt; <span class="stringliteral">&quot;, &quot;</span>;</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>    }</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>    first = <span class="keyword">false</span>;</div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>    s &lt;&lt; arg;</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>  };</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>  (add_arg(args), ...);</div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>  s &lt;&lt; <span class="stringliteral">&quot;&gt;&quot;</span>;</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>  <span class="keywordflow">return</span> fmt::format(</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>      <span class="stringliteral">&quot;\ntemplate [[host_name(\&quot;{0}\&quot;)]] [[kernel]] decltype({1}) {1};\n&quot;</span>,</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>      name,</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>      s.str());</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>}</div>
 </div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span> </div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>} <span class="comment">// namespace mlx::core</span></div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span> </div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="abackend_2metal_2device_8h_html"><div class="ttname"><a href="backend_2metal_2device_8h.html">device.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:131</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:158</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
+<div class="ttc" id="agroup__ops_html_ga8ab10aa6c41416d739791164a52b25d5"><div class="ttname"><a href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">mlx::core::contiguous</a></div><div class="ttdeci">array contiguous(const array &amp;a, bool allow_col_major=false, StreamOrDevice s={})</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1metal_html_a616e09a1ef321d527770721cef264c54"><div class="ttname"><a href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">mlx::core::metal::MTLFCList</a></div><div class="ttdeci">std::vector&lt; std::tuple&lt; const void *, MTL::DataType, NS::UInteger &gt; &gt; MTLFCList</div><div class="ttdef"><b>Definition</b> device.h:38</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a05a220cff45f12439fde775983c6df78"><div class="ttname"><a href="namespacemlx_1_1core.html#a05a220cff45f12439fde775983c6df78">mlx::core::get_copy_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_copy_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;in, const array &amp;out)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a195b86cad5bb99aa1bcd23952305af6b"><div class="ttname"><a href="namespacemlx_1_1core.html#a195b86cad5bb99aa1bcd23952305af6b">mlx::core::get_steel_gemm_splitk_accum_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_steel_gemm_splitk_accum_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;in, const array &amp;out, bool axbpy)</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_a1be32ba7d67137dde7ac191dfe83ff49"><div class="ttname"><a href="namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49">mlx::core::get_reduce_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_reduce_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const Dtype &amp;in_type, const Dtype &amp;out_type, const std::string &amp;idx_t, int ndim=-1, int bm=-1, int bn=-1)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a1d4cffc3c78067b3d9a62d64f3fb686f"><div class="ttname"><a href="namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f">mlx::core::get_fft_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_fft_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const std::string &amp;hash_name, const metal::MTLFCList &amp;func_consts, const std::string &amp;template_def)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a35a412f688d79eb47e42d20a7c8650ee"><div class="ttname"><a href="namespacemlx_1_1core.html#a35a412f688d79eb47e42d20a7c8650ee">mlx::core::get_softmax_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_softmax_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, bool precise, const array &amp;out)</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_a3bd386cb6db09f636963ce66ceaf8647"><div class="ttname"><a href="namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647">mlx::core::get_reduce_init_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_reduce_init_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const array &amp;out)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a4decd4a07d91487e6903f6e3c8b7513a"><div class="ttname"><a href="namespacemlx_1_1core.html#a4decd4a07d91487e6903f6e3c8b7513a">mlx::core::get_binary_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_binary_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, Dtype in_type, Dtype out_type, const std::string op)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a4e809746f48e5dcf7fa63215d3f5e33e"><div class="ttname"><a href="namespacemlx_1_1core.html#a4e809746f48e5dcf7fa63215d3f5e33e">mlx::core::get_binary_two_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_binary_two_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, Dtype in_type, Dtype out_type, const std::string op)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a54eb3b65375022428aab5f810e40624b"><div class="ttname"><a href="namespacemlx_1_1core.html#a54eb3b65375022428aab5f810e40624b">mlx::core::get_ternary_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_ternary_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, Dtype type, const std::string op)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a76f614e9956a6ca05a9be4db5a483446"><div class="ttname"><a href="namespacemlx_1_1core.html#a76f614e9956a6ca05a9be4db5a483446">mlx::core::get_arange_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_arange_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;out)</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_a7aa91fcfe8b9caa42d60a957f11bfe6b"><div class="ttname"><a href="namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b">mlx::core::get_reduce_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_reduce_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const array &amp;in, const array &amp;out, int ndim=-1, int bm=-1, int bn=-1)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a84ebe6275218070f0ea320f126f64e22"><div class="ttname"><a href="namespacemlx_1_1core.html#a84ebe6275218070f0ea320f126f64e22">mlx::core::get_sort_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_sort_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;in, const array &amp;out, int bn, int tn)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a84fa8e0aee321a9d614433a0b933103b"><div class="ttname"><a href="namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b">mlx::core::get_steel_gemm_fused_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_steel_gemm_fused_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const std::string &amp;hash_name, const metal::MTLFCList &amp;func_consts, const array &amp;out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a90c24e0d0b99b68fad9deefcf4d3e818"><div class="ttname"><a href="namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818">mlx::core::get_gemv_masked_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_gemv_masked_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;out, const std::optional&lt; array &gt; &amp;mask_out, const std::optional&lt; array &gt; &amp;mask_op, bool transpose_mat, int bm, int bn, int sm, int sn, int tm, int tn, bool contiguous)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_aa3faeae5378bfaafe3ce3432a051e43e"><div class="ttname"><a href="namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e">mlx::core::get_quantized_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_quantized_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const std::string &amp;template_def)</div></div>
-<div class="ttc" id="anamespacemlx_1_1core_html_aae0d19f0acdef2accd2428fb84c8a032"><div class="ttname"><a href="namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032">mlx::core::get_template_definition</a></div><div class="ttdeci">std::string get_template_definition(std::string name, std::string func, Args... args)</div><div class="ttdef"><b>Definition</b> kernels.h:201</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_aae0d19f0acdef2accd2428fb84c8a032"><div class="ttname"><a href="namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032">mlx::core::get_template_definition</a></div><div class="ttdeci">std::string get_template_definition(std::string name, std::string func, Args... args)</div><div class="ttdef"><b>Definition</b> kernels.h:202</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_ab5f60614e965144b451930fdf935e08d"><div class="ttname"><a href="namespacemlx_1_1core.html#ab5f60614e965144b451930fdf935e08d">mlx::core::get_steel_gemm_masked_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_steel_gemm_masked_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;out, const std::optional&lt; array &gt; &amp;mask_out, const std::optional&lt; array &gt; &amp;mask_op, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn, bool mn_aligned, bool k_aligned)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_abce2b67044ee06a7bbe7a91ec7c8c48d"><div class="ttname"><a href="namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d">mlx::core::get_steel_conv_general_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_steel_conv_general_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;out, int bm, int bn, int bk, int wm, int wn)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_adce79d220672f5f3c65cc31d145ca9c4"><div class="ttname"><a href="namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4">mlx::core::get_steel_conv_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_steel_conv_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;out, int bm, int bn, int bk, int wm, int wn, int n_channel_specialization, bool small_filter)</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_ae0470605dc819efeb6510183619f0299"><div class="ttname"><a href="namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299">mlx::core::get_reduce_init_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_reduce_init_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const Dtype &amp;out_type)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_aeefaff208444d3fa61ecc0946fe1de5f"><div class="ttname"><a href="namespacemlx_1_1core.html#aeefaff208444d3fa61ecc0946fe1de5f">mlx::core::get_scan_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_scan_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, bool reverse, bool inclusive, const std::string &amp;reduce_type, const array &amp;in, const array &amp;out)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_af48c6f2f72b61dbd6766e4f5fea85df5"><div class="ttname"><a href="namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5">mlx::core::get_steel_gemm_splitk_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_steel_gemm_splitk_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;in, const array &amp;out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn, bool mn_aligned, bool k_aligned)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_afb57825bb763050cc9a9d194aa41ac36"><div class="ttname"><a href="namespacemlx_1_1core.html#afb57825bb763050cc9a9d194aa41ac36">mlx::core::get_mb_sort_kernel</a></div><div class="ttdeci">MTL::ComputePipelineState * get_mb_sort_kernel(metal::Device &amp;d, const std::string &amp;kernel_name, const array &amp;in, const array &amp;idx, int bn, int tn)</div></div>
diff --git a/docs/build/html/loader__channel__l_8h_source.html b/docs/build/html/loader__channel__l_8h_source.html
index 8e19a8cb1..d6238e5b1 100644
--- a/docs/build/html/loader__channel__l_8h_source.html
+++ b/docs/build/html/loader__channel__l_8h_source.html
@@ -102,8 +102,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span><span class="comment">// Loading helper</span></div>
 <div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span> </div>
 <div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span><span class="keyword">namespace </span><a class="code hl_namespace" href="namespacemlx.html">mlx</a> {</div>
-<div class="foldopen" id="foldopen00014" data-start="{" data-end="}">
-<div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="namespacemlx_1_1steel.html">   14</a></span><span class="keyword">namespace </span>steel {</div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span><span class="keyword">namespace </span>steel {</div>
 <div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span> </div>
 <div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span><span class="keyword">template</span> &lt;</div>
 <div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    <span class="keyword">typename</span> T,</div>
@@ -562,7 +561,6 @@ $(function(){ initResizable(false); });
 </div>
 <div class="line"><a id="l00447" name="l00447"></a><span class="lineno">  447</span> </div>
 <div class="line"><a id="l00448" name="l00448"></a><span class="lineno">  448</span>} <span class="comment">// namespace steel</span></div>
-</div>
 <div class="line"><a id="l00449" name="l00449"></a><span class="lineno">  449</span>} <span class="comment">// namespace mlx</span></div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html">utils.h</a></div></div>
 <div class="ttc" id="aconv_2params_8h_html"><div class="ttname"><a href="conv_2params_8h.html">params.h</a></div></div>
diff --git a/docs/build/html/matmul_8h_source.html b/docs/build/html/matmul_8h_source.html
index ce4ccfe53..5b8ed1f8e 100644
--- a/docs/build/html/matmul_8h_source.html
+++ b/docs/build/html/matmul_8h_source.html
@@ -143,7 +143,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="abackend_2metal_2device_8h_html"><div class="ttname"><a href="backend_2metal_2device_8h.html">device.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:131</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:158</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a227588758ccc9ee869dba147e830bb74"><div class="ttname"><a href="namespacemlx_1_1core.html#a227588758ccc9ee869dba147e830bb74">mlx::core::steel_matmul_regular</a></div><div class="ttdeci">void steel_matmul_regular(const Stream &amp;s, metal::Device &amp;d, const array &amp;a, const array &amp;b, array &amp;out, int M, int N, int K, int batch_size_out, int lda, int ldb, int ldd, bool transpose_a, bool transpose_b, std::vector&lt; int &gt; batch_shape, std::vector&lt; size_t &gt; batch_strides, size_t A_batch_stride, size_t B_batch_stride, size_t matrix_stride_out, std::vector&lt; array &gt; &amp;copies)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_ab43a7633794498e1c6775cca829eb886"><div class="ttname"><a href="namespacemlx_1_1core.html#ab43a7633794498e1c6775cca829eb886">mlx::core::steel_matmul</a></div><div class="ttdeci">void steel_matmul(const Stream &amp;s, metal::Device &amp;d, const array &amp;a, const array &amp;b, array &amp;out, int M, int N, int K, int batch_size_out, int lda, int ldb, bool transpose_a, bool transpose_b, std::vector&lt; array &gt; &amp;copies, std::vector&lt; int &gt; batch_shape={}, std::vector&lt; size_t &gt; A_batch_stride={}, std::vector&lt; size_t &gt; B_batch_stride={})</div></div>
diff --git a/docs/build/html/menudata.js b/docs/build/html/menudata.js
index 0e4ed6500..11c2fff8b 100644
--- a/docs/build/html/menudata.js
+++ b/docs/build/html/menudata.js
@@ -177,6 +177,7 @@ var menudata={children:[
 {text:"Typedefs",url:"functions_type.html",children:[
 {text:"a",url:"functions_type.html#index_a"},
 {text:"b",url:"functions_type.html#index_b"},
+{text:"c",url:"functions_type.html#index_c"},
 {text:"d",url:"functions_type.html#index_d"},
 {text:"e",url:"functions_type.html#index_e"},
 {text:"f",url:"functions_type.html#index_f"},
@@ -215,6 +216,7 @@ var menudata={children:[
 {text:"g",url:"globals_g.html#index_g"},
 {text:"h",url:"globals_h.html#index_h"},
 {text:"i",url:"globals_i.html#index_i"},
+{text:"j",url:"globals_j.html#index_j"},
 {text:"l",url:"globals_l.html#index_l"},
 {text:"m",url:"globals_m.html#index_m"},
 {text:"n",url:"globals_n.html#index_n"},
@@ -276,8 +278,8 @@ var menudata={children:[
 {text:"f",url:"globals_defs.html#index_f"},
 {text:"h",url:"globals_defs.html#index_h"},
 {text:"i",url:"globals_defs.html#index_i"},
+{text:"j",url:"globals_defs.html#index_j"},
 {text:"m",url:"globals_defs.html#index_m"},
 {text:"p",url:"globals_defs.html#index_p"},
 {text:"r",url:"globals_defs.html#index_r"},
-{text:"s",url:"globals_defs.html#index_s"},
-{text:"u",url:"globals_defs.html#index_u"}]}]}]}]}
+{text:"s",url:"globals_defs.html#index_s"}]}]}]}]}
diff --git a/docs/build/html/metal_2kernels_2binary_8h.html b/docs/build/html/metal_2kernels_2binary_8h.html
index 51b19e243..d217e3b2a 100644
--- a/docs/build/html/metal_2kernels_2binary_8h.html
+++ b/docs/build/html/metal_2kernels_2binary_8h.html
@@ -122,24 +122,24 @@ Functions</h2></td></tr>
 <tr class="memitem:a6808bfb006cb5473da087a2758d0d867" id="r_a6808bfb006cb5473da087a2758d0d867"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op &gt; </td></tr>
 <tr class="memitem:a6808bfb006cb5473da087a2758d0d867"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a6808bfb006cb5473da087a2758d0d867">binary_g_nd1</a> (device const T *a, device const T *b, device U *c, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index)</td></tr>
 <tr class="separator:a6808bfb006cb5473da087a2758d0d867"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8cd5989852ec704c6fd132ae28f4fc14" id="r_a8cd5989852ec704c6fd132ae28f4fc14"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op &gt; </td></tr>
-<tr class="memitem:a8cd5989852ec704c6fd132ae28f4fc14"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a8cd5989852ec704c6fd132ae28f4fc14">binary_g_nd2</a> (device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</td></tr>
-<tr class="separator:a8cd5989852ec704c6fd132ae28f4fc14"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac4979e60b993f7ffb602bcb91cd68bc9" id="r_ac4979e60b993f7ffb602bcb91cd68bc9"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op &gt; </td></tr>
-<tr class="memitem:ac4979e60b993f7ffb602bcb91cd68bc9"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac4979e60b993f7ffb602bcb91cd68bc9">binary_g_nd3</a> (device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:ac4979e60b993f7ffb602bcb91cd68bc9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1f3f5d6bfbf3914f365790dd1434c10b" id="r_a1f3f5d6bfbf3914f365790dd1434c10b"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N = 1&gt; </td></tr>
-<tr class="memitem:a1f3f5d6bfbf3914f365790dd1434c10b"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1f3f5d6bfbf3914f365790dd1434c10b">binary_g</a> (device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:a1f3f5d6bfbf3914f365790dd1434c10b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6cefcfee68bd62f3a6924df0cd53dd49" id="r_a6cefcfee68bd62f3a6924df0cd53dd49"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:a6cefcfee68bd62f3a6924df0cd53dd49"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a6cefcfee68bd62f3a6924df0cd53dd49">binary_g_nd2</a> (device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</td></tr>
+<tr class="separator:a6cefcfee68bd62f3a6924df0cd53dd49"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abb15de8250f9a259de80618c6de46dfa" id="r_abb15de8250f9a259de80618c6de46dfa"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:abb15de8250f9a259de80618c6de46dfa"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#abb15de8250f9a259de80618c6de46dfa">binary_g_nd3</a> (device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:abb15de8250f9a259de80618c6de46dfa"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ab1b49438a70f6c707c18afd5bce12bb3" id="r_ab1b49438a70f6c707c18afd5bce12bb3"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N = 1, typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:ab1b49438a70f6c707c18afd5bce12bb3"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ab1b49438a70f6c707c18afd5bce12bb3">binary_g</a> (device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:ab1b49438a70f6c707c18afd5bce12bb3"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="a1f3f5d6bfbf3914f365790dd1434c10b" name="a1f3f5d6bfbf3914f365790dd1434c10b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1f3f5d6bfbf3914f365790dd1434c10b">&#9670;&#160;</a></span>binary_g()</h2>
+<a id="ab1b49438a70f6c707c18afd5bce12bb3" name="ab1b49438a70f6c707c18afd5bce12bb3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ab1b49438a70f6c707c18afd5bce12bb3">&#9670;&#160;</a></span>binary_g()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int N = 1&gt; </div>
+template&lt;typename T , typename U , typename Op , int N = 1, typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void binary_g </td>
@@ -234,13 +234,13 @@ template&lt;typename T , typename U , typename Op &gt; </div>
 
 </div>
 </div>
-<a id="a8cd5989852ec704c6fd132ae28f4fc14" name="a8cd5989852ec704c6fd132ae28f4fc14"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a8cd5989852ec704c6fd132ae28f4fc14">&#9670;&#160;</a></span>binary_g_nd2()</h2>
+<a id="a6cefcfee68bd62f3a6924df0cd53dd49" name="a6cefcfee68bd62f3a6924df0cd53dd49"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6cefcfee68bd62f3a6924df0cd53dd49">&#9670;&#160;</a></span>binary_g_nd2()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op &gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void binary_g_nd2 </td>
@@ -282,13 +282,13 @@ template&lt;typename T , typename U , typename Op &gt; </div>
 
 </div>
 </div>
-<a id="ac4979e60b993f7ffb602bcb91cd68bc9" name="ac4979e60b993f7ffb602bcb91cd68bc9"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ac4979e60b993f7ffb602bcb91cd68bc9">&#9670;&#160;</a></span>binary_g_nd3()</h2>
+<a id="abb15de8250f9a259de80618c6de46dfa" name="abb15de8250f9a259de80618c6de46dfa"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abb15de8250f9a259de80618c6de46dfa">&#9670;&#160;</a></span>binary_g_nd3()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op &gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void binary_g_nd3 </td>
diff --git a/docs/build/html/metal_2kernels_2binary_8h_source.html b/docs/build/html/metal_2kernels_2binary_8h_source.html
index 228fde6e6..f843c6da1 100644
--- a/docs/build/html/metal_2kernels_2binary_8h_source.html
+++ b/docs/build/html/metal_2kernels_2binary_8h_source.html
@@ -185,15 +185,15 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>&amp; a_stride,</div>
 <div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>&amp; b_stride,</div>
 <div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    uint index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, a_stride);</div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, b_stride);</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;size_t, uint&gt;</a>(index, a_stride);</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;size_t, uint&gt;</a>(index, b_stride);</div>
 <div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>  c[index] = Op()(a[a_idx], b[b_idx]);</div>
 <div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>}</div>
 </div>
 <div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span> </div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op&gt;</div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
 <div class="foldopen" id="foldopen00086" data-start="{" data-end="}">
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14">   86</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14">binary_g_nd2</a>(</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49">   86</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49">binary_g_nd2</a>(</div>
 <div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>    device <span class="keyword">const</span> T* a,</div>
 <div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>    device <span class="keyword">const</span> T* b,</div>
 <div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>    device U* c,</div>
@@ -201,16 +201,16 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span> b_strides[2],</div>
 <div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>    uint2 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, a_strides);</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, b_strides);</div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>  <span class="keywordtype">size_t</span> out_idx = index.x + size_t(grid_dim.x) * index.y;</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;size_t, IdxT&gt;</a>(index, a_strides);</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;size_t, IdxT&gt;</a>(index, b_strides);</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>  IdxT out_idx = index.x + IdxT(grid_dim.x) * index.y;</div>
 <div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>  c[out_idx] = Op()(a[a_idx], b[b_idx]);</div>
 <div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>}</div>
 </div>
 <div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span> </div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op&gt;</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
 <div class="foldopen" id="foldopen00101" data-start="{" data-end="}">
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9">  101</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9">binary_g_nd3</a>(</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa">  101</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa">binary_g_nd3</a>(</div>
 <div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    device <span class="keyword">const</span> T* a,</div>
 <div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    device <span class="keyword">const</span> T* b,</div>
 <div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    device U* c,</div>
@@ -218,54 +218,57 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span> b_strides[3],</div>
 <div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>    uint3 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, a_strides);</div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, b_strides);</div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  <span class="keywordtype">size_t</span> out_idx =</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>      index.x + grid_dim.x * (index.y + size_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>  c[out_idx] = Op()(a[a_idx], b[b_idx]);</div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>}</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;size_t, IdxT&gt;</a>(index, a_strides);</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;size_t, IdxT&gt;</a>(index, b_strides);</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  IdxT out_idx = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>  c[out_idx] = Op()(a[a_idx], b[b_idx]);</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>}</div>
 </div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span> </div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> N = 1&gt;</div>
-<div class="foldopen" id="foldopen00117" data-start="{" data-end="}">
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b">  117</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b">binary_g</a>(</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    device <span class="keyword">const</span> T* a,</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    device <span class="keyword">const</span> T* b,</div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>    device U* c,</div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>    uint3 index [[thread_position_in_grid]],</div>
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a>(</div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);</div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>  <span class="keyword">auto</span> xshape = shape[ndim - 1];</div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  <span class="keywordtype">size_t</span> out_idx =</div>
-<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>      N * index.x + xshape * (index.y + size_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>  <span class="keyword">auto</span> a_xstride = a_strides[ndim - 1];</div>
-<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  <span class="keyword">auto</span> b_xstride = b_strides[ndim - 1];</div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>    c[out_idx++] = Op()(a[idx.x], b[idx.y]);</div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>    idx.x += a_xstride;</div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>    idx.y += b_xstride;</div>
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  }</div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>}</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span> </div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    <span class="keywordtype">int</span> N = 1,</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>    <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
+<div class="foldopen" id="foldopen00121" data-start="{" data-end="}">
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3">  121</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3">binary_g</a>(</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    device <span class="keyword">const</span> T* a,</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    device <span class="keyword">const</span> T* b,</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    device U* c,</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    uint3 index [[thread_position_in_grid]],</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd&lt;size_t, IdxT&gt;</a>(</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  <span class="keyword">auto</span> xshape = shape[ndim - 1];</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>  IdxT out_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>  IdxT a_xstride = a_strides[ndim - 1];</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>  IdxT b_xstride = b_strides[ndim - 1];</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>    c[out_idx++] = Op()(a[idx.x], b[idx.y]);</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    idx.x += a_xstride;</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>    idx.y += b_xstride;</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  }</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a01c9309978a6c12f79b6e4108728a953"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const stride_t *a_strides, constant const stride_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:153</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a196a07022b812b241d4c06192c0fa83d"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_1(uint elem, constant const stride_t &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:133</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a2c34ed54714c69e6e1b44344f9e6e330"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_3(uint3 elem, constant const stride_t strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ad6c45cacca97899cd362df49c06fea79"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_2(uint2 elem, constant const stride_t strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:139</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a43f33efc000962d6de881a3aab7458de"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const StrideT strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a650f8ea8cf9f9519da9e301aad0308dc"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const StrideT strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:150</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a66a2d7eec0262b12db16cd6c781ccf9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC vec&lt; IdxT, 2 &gt; elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const StrideT *a_strides, constant const StrideT *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:159</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ac612d0ae30b8257198339debe04916a3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const StrideT &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:140</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_a19dbbf8fea68b64bdd25dc8d36865171"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a19dbbf8fea68b64bdd25dc8d36865171">binary_vv2</a></div><div class="ttdeci">void binary_vv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:62</div></div>
-<div class="ttc" id="ametal_2kernels_2binary_8h_html_a1f3f5d6bfbf3914f365790dd1434c10b"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b">binary_g</a></div><div class="ttdeci">void binary_g(device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:117</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_a242b8b29a852c255467e50628c6dccf5"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5">binary_ss</a></div><div class="ttdeci">void binary_ss(device const T *a, device const T *b, device U *c, uint index)</div><div class="ttdef"><b>Definition</b> binary.h:4</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_a4116c35f2e4632366d1611d5a95ba141"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141">binary_sv</a></div><div class="ttdeci">void binary_sv(device const T *a, device const T *b, device U *c, uint index)</div><div class="ttdef"><b>Definition</b> binary.h:13</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_a48bd82eb10f9c623ce7d28daec4fa512"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a48bd82eb10f9c623ce7d28daec4fa512">binary_vs2</a></div><div class="ttdeci">void binary_vs2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:51</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_a649851d133358dd5832a73b1061b3313"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a649851d133358dd5832a73b1061b3313">binary_vs</a></div><div class="ttdeci">void binary_vs(device const T *a, device const T *b, device U *c, uint index)</div><div class="ttdef"><b>Definition</b> binary.h:22</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_a6808bfb006cb5473da087a2758d0d867"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867">binary_g_nd1</a></div><div class="ttdeci">void binary_g_nd1(device const T *a, device const T *b, device U *c, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index)</div><div class="ttdef"><b>Definition</b> binary.h:73</div></div>
-<div class="ttc" id="ametal_2kernels_2binary_8h_html_a8cd5989852ec704c6fd132ae28f4fc14"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14">binary_g_nd2</a></div><div class="ttdeci">void binary_g_nd2(device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:86</div></div>
+<div class="ttc" id="ametal_2kernels_2binary_8h_html_a6cefcfee68bd62f3a6924df0cd53dd49"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49">binary_g_nd2</a></div><div class="ttdeci">void binary_g_nd2(device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:86</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_aa8c48b1b21d8f5a181f5443de2346589"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589">binary_sv2</a></div><div class="ttdeci">void binary_sv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:40</div></div>
-<div class="ttc" id="ametal_2kernels_2binary_8h_html_ac4979e60b993f7ffb602bcb91cd68bc9"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9">binary_g_nd3</a></div><div class="ttdeci">void binary_g_nd3(device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:101</div></div>
+<div class="ttc" id="ametal_2kernels_2binary_8h_html_ab1b49438a70f6c707c18afd5bce12bb3"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3">binary_g</a></div><div class="ttdeci">void binary_g(device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:121</div></div>
+<div class="ttc" id="ametal_2kernels_2binary_8h_html_abb15de8250f9a259de80618c6de46dfa"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa">binary_g_nd3</a></div><div class="ttdeci">void binary_g_nd3(device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary.h:101</div></div>
 <div class="ttc" id="ametal_2kernels_2binary_8h_html_add6a9aeee3cb0ba909574f27fa9ecd5b"><div class="ttname"><a href="metal_2kernels_2binary_8h.html#add6a9aeee3cb0ba909574f27fa9ecd5b">binary_vv</a></div><div class="ttdeci">void binary_vv(device const T *a, device const T *b, device U *c, uint index)</div><div class="ttdef"><b>Definition</b> binary.h:31</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/metal_2kernels_2binary__two_8h.html b/docs/build/html/metal_2kernels_2binary__two_8h.html
index f433d7743..a56b56861 100644
--- a/docs/build/html/metal_2kernels_2binary__two_8h.html
+++ b/docs/build/html/metal_2kernels_2binary__two_8h.html
@@ -122,24 +122,24 @@ Functions</h2></td></tr>
 <tr class="memitem:ad1fad37c168192b212a4294f4cf78133" id="r_ad1fad37c168192b212a4294f4cf78133"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op &gt; </td></tr>
 <tr class="memitem:ad1fad37c168192b212a4294f4cf78133"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad1fad37c168192b212a4294f4cf78133">binary_g_nd1</a> (device const T *a, device const T *b, device U *c, device U *d, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index)</td></tr>
 <tr class="separator:ad1fad37c168192b212a4294f4cf78133"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a03f7c15a1607576755abb65c542ae347" id="r_a03f7c15a1607576755abb65c542ae347"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op &gt; </td></tr>
-<tr class="memitem:a03f7c15a1607576755abb65c542ae347"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a03f7c15a1607576755abb65c542ae347">binary_g_nd2</a> (device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</td></tr>
-<tr class="separator:a03f7c15a1607576755abb65c542ae347"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a43e5943460996c43060d1f3aa1309ba6" id="r_a43e5943460996c43060d1f3aa1309ba6"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op &gt; </td></tr>
-<tr class="memitem:a43e5943460996c43060d1f3aa1309ba6"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a43e5943460996c43060d1f3aa1309ba6">binary_g_nd3</a> (device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:a43e5943460996c43060d1f3aa1309ba6"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6297badf47dece518bb4e67f02cffea8" id="r_a6297badf47dece518bb4e67f02cffea8"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N = 1&gt; </td></tr>
-<tr class="memitem:a6297badf47dece518bb4e67f02cffea8"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a6297badf47dece518bb4e67f02cffea8">binary_g</a> (device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:a6297badf47dece518bb4e67f02cffea8"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a97b5613aff654d32c49225209a19bb95" id="r_a97b5613aff654d32c49225209a19bb95"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:a97b5613aff654d32c49225209a19bb95"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a97b5613aff654d32c49225209a19bb95">binary_g_nd2</a> (device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</td></tr>
+<tr class="separator:a97b5613aff654d32c49225209a19bb95"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aae07014f8dffa3649a5c7f4671e1268e" id="r_aae07014f8dffa3649a5c7f4671e1268e"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:aae07014f8dffa3649a5c7f4671e1268e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aae07014f8dffa3649a5c7f4671e1268e">binary_g_nd3</a> (device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:aae07014f8dffa3649a5c7f4671e1268e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aaf6edb734cea627bca4f6540dc338fbd" id="r_aaf6edb734cea627bca4f6540dc338fbd"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N = 1, typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:aaf6edb734cea627bca4f6540dc338fbd"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aaf6edb734cea627bca4f6540dc338fbd">binary_g</a> (device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:aaf6edb734cea627bca4f6540dc338fbd"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="a6297badf47dece518bb4e67f02cffea8" name="a6297badf47dece518bb4e67f02cffea8"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a6297badf47dece518bb4e67f02cffea8">&#9670;&#160;</a></span>binary_g()</h2>
+<a id="aaf6edb734cea627bca4f6540dc338fbd" name="aaf6edb734cea627bca4f6540dc338fbd"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aaf6edb734cea627bca4f6540dc338fbd">&#9670;&#160;</a></span>binary_g()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int N = 1&gt; </div>
+template&lt;typename T , typename U , typename Op , int N = 1, typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void binary_g </td>
@@ -244,13 +244,13 @@ template&lt;typename T , typename U , typename Op &gt; </div>
 
 </div>
 </div>
-<a id="a03f7c15a1607576755abb65c542ae347" name="a03f7c15a1607576755abb65c542ae347"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a03f7c15a1607576755abb65c542ae347">&#9670;&#160;</a></span>binary_g_nd2()</h2>
+<a id="a97b5613aff654d32c49225209a19bb95" name="a97b5613aff654d32c49225209a19bb95"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a97b5613aff654d32c49225209a19bb95">&#9670;&#160;</a></span>binary_g_nd2()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op &gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void binary_g_nd2 </td>
@@ -297,13 +297,13 @@ template&lt;typename T , typename U , typename Op &gt; </div>
 
 </div>
 </div>
-<a id="a43e5943460996c43060d1f3aa1309ba6" name="a43e5943460996c43060d1f3aa1309ba6"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a43e5943460996c43060d1f3aa1309ba6">&#9670;&#160;</a></span>binary_g_nd3()</h2>
+<a id="aae07014f8dffa3649a5c7f4671e1268e" name="aae07014f8dffa3649a5c7f4671e1268e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aae07014f8dffa3649a5c7f4671e1268e">&#9670;&#160;</a></span>binary_g_nd3()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op &gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void binary_g_nd3 </td>
diff --git a/docs/build/html/metal_2kernels_2binary__two_8h_source.html b/docs/build/html/metal_2kernels_2binary__two_8h_source.html
index 3a8b9ba8e..216aca599 100644
--- a/docs/build/html/metal_2kernels_2binary__two_8h_source.html
+++ b/docs/build/html/metal_2kernels_2binary__two_8h_source.html
@@ -207,17 +207,17 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>&amp; a_stride,</div>
 <div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>&amp; b_stride,</div>
 <div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>    uint index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, a_stride);</div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, b_stride);</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;size_t, uint&gt;</a>(index, a_stride);</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;size_t, uint&gt;</a>(index, b_stride);</div>
 <div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>  <span class="keyword">auto</span> out = Op()(a[a_idx], b[b_idx]);</div>
 <div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>  c[index] = out[0];</div>
 <div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>  d[index] = out[1];</div>
 <div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>}</div>
 </div>
 <div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span> </div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op&gt;</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
 <div class="foldopen" id="foldopen00110" data-start="{" data-end="}">
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347">  110</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347">binary_g_nd2</a>(</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95">  110</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95">binary_g_nd2</a>(</div>
 <div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>    device <span class="keyword">const</span> T* a,</div>
 <div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>    device <span class="keyword">const</span> T* b,</div>
 <div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>    device U* c,</div>
@@ -226,18 +226,18 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span> b_strides[2],</div>
 <div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    uint2 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, a_strides);</div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, b_strides);</div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  <span class="keywordtype">size_t</span> out_idx = index.x + size_t(grid_dim.x) * index.y;</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;size_t, IdxT&gt;</a>(index, a_strides);</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;size_t, IdxT&gt;</a>(index, b_strides);</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  IdxT out_idx = index.x + IdxT(grid_dim.x) * index.y;</div>
 <div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>  <span class="keyword">auto</span> out = Op()(a[a_idx], b[b_idx]);</div>
 <div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>  c[out_idx] = out[0];</div>
 <div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>  d[out_idx] = out[1];</div>
 <div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>}</div>
 </div>
 <div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span> </div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op&gt;</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
 <div class="foldopen" id="foldopen00128" data-start="{" data-end="}">
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6">  128</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6">binary_g_nd3</a>(</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e">  128</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e">binary_g_nd3</a>(</div>
 <div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    device <span class="keyword">const</span> T* a,</div>
 <div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    device <span class="keyword">const</span> T* b,</div>
 <div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    device U* c,</div>
@@ -246,56 +246,59 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span> b_strides[3],</div>
 <div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>    uint3 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, a_strides);</div>
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, b_strides);</div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>  <span class="keywordtype">size_t</span> out_idx =</div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>      index.x + grid_dim.x * (index.y + size_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  <span class="keyword">auto</span> out = Op()(a[a_idx], b[b_idx]);</div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>  c[out_idx] = out[0];</div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>  d[out_idx] = out[1];</div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>}</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;size_t, IdxT&gt;</a>(index, a_strides);</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;size_t, IdxT&gt;</a>(index, b_strides);</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>  IdxT out_idx = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>  <span class="keyword">auto</span> out = Op()(a[a_idx], b[b_idx]);</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  c[out_idx] = out[0];</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>  d[out_idx] = out[1];</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>}</div>
 </div>
-<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span> </div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> N = 1&gt;</div>
-<div class="foldopen" id="foldopen00147" data-start="{" data-end="}">
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8">  147</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8">binary_g</a>(</div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>    device <span class="keyword">const</span> T* a,</div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>    device <span class="keyword">const</span> T* b,</div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    device U* c,</div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>    device U* d,</div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    uint3 index [[thread_position_in_grid]],</div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a>(</div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);</div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>  <span class="keyword">auto</span> xshape = shape[ndim - 1];</div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>  <span class="keywordtype">size_t</span> out_idx =</div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>      N * index.x + xshape * (index.y + size_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>  <span class="keyword">auto</span> a_xstride = a_strides[ndim - 1];</div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>  <span class="keyword">auto</span> b_xstride = b_strides[ndim - 1];</div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keyword">auto</span> out = Op()(a[idx.x], b[idx.y]);</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    c[out_idx] = out[0];</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    d[out_idx++] = out[1];</div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    idx.x += a_xstride;</div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>    idx.y += b_xstride;</div>
-<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>  }</div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>}</div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span> </div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>    <span class="keywordtype">int</span> N = 1,</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
+<div class="foldopen" id="foldopen00151" data-start="{" data-end="}">
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno"><a class="line" href="metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd">  151</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd">binary_g</a>(</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    device <span class="keyword">const</span> T* a,</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    device <span class="keyword">const</span> T* b,</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>    device U* c,</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>    device U* d,</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>    uint3 index [[thread_position_in_grid]],</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd&lt;size_t, IdxT&gt;</a>(</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>      {N * index.x, index.y, index.z}, shape, a_strides, b_strides, ndim);</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>  <span class="keyword">auto</span> xshape = shape[ndim - 1];</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>  IdxT out_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>  IdxT a_xstride = a_strides[ndim - 1];</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>  IdxT b_xstride = b_strides[ndim - 1];</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    <span class="keyword">auto</span> out = Op()(a[idx.x], b[idx.y]);</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>    c[out_idx] = out[0];</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>    d[out_idx++] = out[1];</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>    idx.x += a_xstride;</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    idx.y += b_xstride;</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>  }</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a01c9309978a6c12f79b6e4108728a953"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const stride_t *a_strides, constant const stride_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:153</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a196a07022b812b241d4c06192c0fa83d"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_1(uint elem, constant const stride_t &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:133</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a2c34ed54714c69e6e1b44344f9e6e330"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_3(uint3 elem, constant const stride_t strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ad6c45cacca97899cd362df49c06fea79"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_2(uint2 elem, constant const stride_t strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:139</div></div>
-<div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a03f7c15a1607576755abb65c542ae347"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347">binary_g_nd2</a></div><div class="ttdeci">void binary_g_nd2(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:110</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a43f33efc000962d6de881a3aab7458de"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const StrideT strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a650f8ea8cf9f9519da9e301aad0308dc"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const StrideT strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:150</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a66a2d7eec0262b12db16cd6c781ccf9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC vec&lt; IdxT, 2 &gt; elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const StrideT *a_strides, constant const StrideT *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:159</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ac612d0ae30b8257198339debe04916a3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const StrideT &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:140</div></div>
 <div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a08822ff98ea6f61a98b49a9e9a38b891"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891">binary_sv2</a></div><div class="ttdeci">void binary_sv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:52</div></div>
 <div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a12dbda74fa460812177ccb9aeee6e1ca"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a12dbda74fa460812177ccb9aeee6e1ca">binary_vs</a></div><div class="ttdeci">void binary_vs(device const T *a, device const T *b, device U *c, device U *d, uint index)</div><div class="ttdef"><b>Definition</b> binary_two.h:28</div></div>
 <div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a12e80730e43dfaa4c79ce8d5f99edc50"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a12e80730e43dfaa4c79ce8d5f99edc50">binary_vv2</a></div><div class="ttdeci">void binary_vv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:80</div></div>
 <div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a273d2f31691f2c64623c2a97eab344be"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a273d2f31691f2c64623c2a97eab344be">binary_vs2</a></div><div class="ttdeci">void binary_vs2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:66</div></div>
-<div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a43e5943460996c43060d1f3aa1309ba6"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6">binary_g_nd3</a></div><div class="ttdeci">void binary_g_nd3(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:128</div></div>
-<div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a6297badf47dece518bb4e67f02cffea8"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8">binary_g</a></div><div class="ttdeci">void binary_g(device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:147</div></div>
+<div class="ttc" id="ametal_2kernels_2binary__two_8h_html_a97b5613aff654d32c49225209a19bb95"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95">binary_g_nd2</a></div><div class="ttdeci">void binary_g_nd2(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:110</div></div>
+<div class="ttc" id="ametal_2kernels_2binary__two_8h_html_aae07014f8dffa3649a5c7f4671e1268e"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e">binary_g_nd3</a></div><div class="ttdeci">void binary_g_nd3(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:128</div></div>
+<div class="ttc" id="ametal_2kernels_2binary__two_8h_html_aaf6edb734cea627bca4f6540dc338fbd"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd">binary_g</a></div><div class="ttdeci">void binary_g(device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> binary_two.h:151</div></div>
 <div class="ttc" id="ametal_2kernels_2binary__two_8h_html_ab18c6ecf5065275c93701efd095c916c"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c">binary_sv</a></div><div class="ttdeci">void binary_sv(device const T *a, device const T *b, device U *c, device U *d, uint index)</div><div class="ttdef"><b>Definition</b> binary_two.h:16</div></div>
 <div class="ttc" id="ametal_2kernels_2binary__two_8h_html_ab4324f594c007a6895540b77ad5d89d9"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#ab4324f594c007a6895540b77ad5d89d9">binary_vv</a></div><div class="ttdeci">void binary_vv(device const T *a, device const T *b, device U *c, device U *d, uint index)</div><div class="ttdef"><b>Definition</b> binary_two.h:40</div></div>
 <div class="ttc" id="ametal_2kernels_2binary__two_8h_html_ad1fad37c168192b212a4294f4cf78133"><div class="ttname"><a href="metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133">binary_g_nd1</a></div><div class="ttdeci">void binary_g_nd1(device const T *a, device const T *b, device U *c, device U *d, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index)</div><div class="ttdef"><b>Definition</b> binary_two.h:94</div></div>
diff --git a/docs/build/html/metal_2kernels_2copy_8h.html b/docs/build/html/metal_2kernels_2copy_8h.html
index 908663fd3..5274e6311 100644
--- a/docs/build/html/metal_2kernels_2copy_8h.html
+++ b/docs/build/html/metal_2kernels_2copy_8h.html
@@ -113,36 +113,36 @@ Functions</h2></td></tr>
 <tr class="memitem:aba4530a7db6a61ca36f50e4f5e58fb77" id="r_aba4530a7db6a61ca36f50e4f5e58fb77"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U &gt; </td></tr>
 <tr class="memitem:aba4530a7db6a61ca36f50e4f5e58fb77"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aba4530a7db6a61ca36f50e4f5e58fb77">copy_g_nd1</a> (device const T *src, device U *dst, constant const int64_t &amp;src_stride, uint index)</td></tr>
 <tr class="separator:aba4530a7db6a61ca36f50e4f5e58fb77"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aee678c7c31119f3e609685589f37490c" id="r_aee678c7c31119f3e609685589f37490c"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U &gt; </td></tr>
-<tr class="memitem:aee678c7c31119f3e609685589f37490c"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aee678c7c31119f3e609685589f37490c">copy_g_nd2</a> (device const T *src, device U *dst, constant const int64_t *src_strides, uint2 index, uint2 grid_dim)</td></tr>
-<tr class="separator:aee678c7c31119f3e609685589f37490c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a821f8f3f3891159a295c66fc25aed1ff" id="r_a821f8f3f3891159a295c66fc25aed1ff"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U &gt; </td></tr>
-<tr class="memitem:a821f8f3f3891159a295c66fc25aed1ff"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a821f8f3f3891159a295c66fc25aed1ff">copy_g_nd3</a> (device const T *src, device U *dst, constant const int64_t *src_strides, uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:a821f8f3f3891159a295c66fc25aed1ff"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a778ce2dbfbaa23b24bd5efbe68448c36" id="r_a778ce2dbfbaa23b24bd5efbe68448c36"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , int N = 1&gt; </td></tr>
-<tr class="memitem:a778ce2dbfbaa23b24bd5efbe68448c36"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a778ce2dbfbaa23b24bd5efbe68448c36">copy_g</a> (device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:a778ce2dbfbaa23b24bd5efbe68448c36"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a39ec5b7b8351e4332b842982a2ee6260" id="r_a39ec5b7b8351e4332b842982a2ee6260"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </td></tr>
+<tr class="memitem:a39ec5b7b8351e4332b842982a2ee6260"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a39ec5b7b8351e4332b842982a2ee6260">copy_g_nd2</a> (device const T *src, device U *dst, constant const int64_t *src_strides, uint2 index, uint2 grid_dim)</td></tr>
+<tr class="separator:a39ec5b7b8351e4332b842982a2ee6260"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aab82689380897ff4716b5eafd6ef3ecc" id="r_aab82689380897ff4716b5eafd6ef3ecc"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </td></tr>
+<tr class="memitem:aab82689380897ff4716b5eafd6ef3ecc"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aab82689380897ff4716b5eafd6ef3ecc">copy_g_nd3</a> (device const T *src, device U *dst, constant const int64_t *src_strides, uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:aab82689380897ff4716b5eafd6ef3ecc"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a71e4103db4689d90ef6f9d5ba93604cf" id="r_a71e4103db4689d90ef6f9d5ba93604cf"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , int N = 1, typename IdxT  = int64_t&gt; </td></tr>
+<tr class="memitem:a71e4103db4689d90ef6f9d5ba93604cf"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a71e4103db4689d90ef6f9d5ba93604cf">copy_g</a> (device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:a71e4103db4689d90ef6f9d5ba93604cf"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a3278d9c999718bee3ccbe2922f501bf1" id="r_a3278d9c999718bee3ccbe2922f501bf1"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U &gt; </td></tr>
 <tr class="memitem:a3278d9c999718bee3ccbe2922f501bf1"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a3278d9c999718bee3ccbe2922f501bf1">copy_gg_nd1</a> (device const T *src, device U *dst, constant const int64_t &amp;src_stride, constant const int64_t &amp;dst_stride, uint index)</td></tr>
 <tr class="separator:a3278d9c999718bee3ccbe2922f501bf1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3e2d3cc7f34f56170409b6735f51a950" id="r_a3e2d3cc7f34f56170409b6735f51a950"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U &gt; </td></tr>
-<tr class="memitem:a3e2d3cc7f34f56170409b6735f51a950"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a3e2d3cc7f34f56170409b6735f51a950">copy_gg_nd2</a> (device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint2 index)</td></tr>
-<tr class="separator:a3e2d3cc7f34f56170409b6735f51a950"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a59f43b5bffed936d7559ceb06a10aabd" id="r_a59f43b5bffed936d7559ceb06a10aabd"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U &gt; </td></tr>
-<tr class="memitem:a59f43b5bffed936d7559ceb06a10aabd"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a59f43b5bffed936d7559ceb06a10aabd">copy_gg_nd3</a> (device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint3 index)</td></tr>
-<tr class="separator:a59f43b5bffed936d7559ceb06a10aabd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1e39c2683eeaf05955e7619fbd34aea5" id="r_a1e39c2683eeaf05955e7619fbd34aea5"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , int N = 1&gt; </td></tr>
-<tr class="memitem:a1e39c2683eeaf05955e7619fbd34aea5"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1e39c2683eeaf05955e7619fbd34aea5">copy_gg</a> (device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int64_t *dst_strides, constant const int &amp;ndim, uint3 index)</td></tr>
-<tr class="separator:a1e39c2683eeaf05955e7619fbd34aea5"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af0b06ac3a96852a64fa4274a94b58301" id="r_af0b06ac3a96852a64fa4274a94b58301"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </td></tr>
+<tr class="memitem:af0b06ac3a96852a64fa4274a94b58301"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#af0b06ac3a96852a64fa4274a94b58301">copy_gg_nd2</a> (device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint2 index)</td></tr>
+<tr class="separator:af0b06ac3a96852a64fa4274a94b58301"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3f3836ad0b6545ec9b9e1864224f7a13" id="r_a3f3836ad0b6545ec9b9e1864224f7a13"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </td></tr>
+<tr class="memitem:a3f3836ad0b6545ec9b9e1864224f7a13"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a3f3836ad0b6545ec9b9e1864224f7a13">copy_gg_nd3</a> (device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint3 index)</td></tr>
+<tr class="separator:a3f3836ad0b6545ec9b9e1864224f7a13"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ade9a9eea9b8262a854a11721fe2bb9fa" id="r_ade9a9eea9b8262a854a11721fe2bb9fa"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , int N = 1, typename IdxT  = int64_t&gt; </td></tr>
+<tr class="memitem:ade9a9eea9b8262a854a11721fe2bb9fa"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ade9a9eea9b8262a854a11721fe2bb9fa">copy_gg</a> (device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int64_t *dst_strides, constant const int &amp;ndim, uint3 index)</td></tr>
+<tr class="separator:ade9a9eea9b8262a854a11721fe2bb9fa"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="a778ce2dbfbaa23b24bd5efbe68448c36" name="a778ce2dbfbaa23b24bd5efbe68448c36"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a778ce2dbfbaa23b24bd5efbe68448c36">&#9670;&#160;</a></span>copy_g()</h2>
+<a id="a71e4103db4689d90ef6f9d5ba93604cf" name="a71e4103db4689d90ef6f9d5ba93604cf"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a71e4103db4689d90ef6f9d5ba93604cf">&#9670;&#160;</a></span>copy_g()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , int N = 1&gt; </div>
+template&lt;typename T , typename U , int N = 1, typename IdxT  = int64_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void copy_g </td>
@@ -217,13 +217,13 @@ template&lt;typename T , typename U &gt; </div>
 
 </div>
 </div>
-<a id="aee678c7c31119f3e609685589f37490c" name="aee678c7c31119f3e609685589f37490c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aee678c7c31119f3e609685589f37490c">&#9670;&#160;</a></span>copy_g_nd2()</h2>
+<a id="a39ec5b7b8351e4332b842982a2ee6260" name="a39ec5b7b8351e4332b842982a2ee6260"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a39ec5b7b8351e4332b842982a2ee6260">&#9670;&#160;</a></span>copy_g_nd2()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U &gt; </div>
+template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void copy_g_nd2 </td>
@@ -255,13 +255,13 @@ template&lt;typename T , typename U &gt; </div>
 
 </div>
 </div>
-<a id="a821f8f3f3891159a295c66fc25aed1ff" name="a821f8f3f3891159a295c66fc25aed1ff"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a821f8f3f3891159a295c66fc25aed1ff">&#9670;&#160;</a></span>copy_g_nd3()</h2>
+<a id="aab82689380897ff4716b5eafd6ef3ecc" name="aab82689380897ff4716b5eafd6ef3ecc"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aab82689380897ff4716b5eafd6ef3ecc">&#9670;&#160;</a></span>copy_g_nd3()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U &gt; </div>
+template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void copy_g_nd3 </td>
@@ -293,13 +293,13 @@ template&lt;typename T , typename U &gt; </div>
 
 </div>
 </div>
-<a id="a1e39c2683eeaf05955e7619fbd34aea5" name="a1e39c2683eeaf05955e7619fbd34aea5"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1e39c2683eeaf05955e7619fbd34aea5">&#9670;&#160;</a></span>copy_gg()</h2>
+<a id="ade9a9eea9b8262a854a11721fe2bb9fa" name="ade9a9eea9b8262a854a11721fe2bb9fa"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ade9a9eea9b8262a854a11721fe2bb9fa">&#9670;&#160;</a></span>copy_gg()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , int N = 1&gt; </div>
+template&lt;typename T , typename U , int N = 1, typename IdxT  = int64_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void copy_gg </td>
@@ -379,13 +379,13 @@ template&lt;typename T , typename U &gt; </div>
 
 </div>
 </div>
-<a id="a3e2d3cc7f34f56170409b6735f51a950" name="a3e2d3cc7f34f56170409b6735f51a950"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a3e2d3cc7f34f56170409b6735f51a950">&#9670;&#160;</a></span>copy_gg_nd2()</h2>
+<a id="af0b06ac3a96852a64fa4274a94b58301" name="af0b06ac3a96852a64fa4274a94b58301"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af0b06ac3a96852a64fa4274a94b58301">&#9670;&#160;</a></span>copy_gg_nd2()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U &gt; </div>
+template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void copy_gg_nd2 </td>
@@ -417,13 +417,13 @@ template&lt;typename T , typename U &gt; </div>
 
 </div>
 </div>
-<a id="a59f43b5bffed936d7559ceb06a10aabd" name="a59f43b5bffed936d7559ceb06a10aabd"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a59f43b5bffed936d7559ceb06a10aabd">&#9670;&#160;</a></span>copy_gg_nd3()</h2>
+<a id="a3f3836ad0b6545ec9b9e1864224f7a13" name="a3f3836ad0b6545ec9b9e1864224f7a13"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a3f3836ad0b6545ec9b9e1864224f7a13">&#9670;&#160;</a></span>copy_gg_nd3()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U &gt; </div>
+template&lt;typename T , typename U , typename IdxT  = int64_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void copy_gg_nd3 </td>
diff --git a/docs/build/html/metal_2kernels_2copy_8h_source.html b/docs/build/html/metal_2kernels_2copy_8h_source.html
index 882e64744..1716e9bc9 100644
--- a/docs/build/html/metal_2kernels_2copy_8h_source.html
+++ b/docs/build/html/metal_2kernels_2copy_8h_source.html
@@ -144,43 +144,43 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>    device U* dst [[buffer(1)]],</div>
 <div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>    constant <span class="keyword">const</span> int64_t&amp; src_stride [[buffer(3)]],</div>
 <div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>    uint index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, src_stride);</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;int64_t, int&gt;</a>(index, src_stride);</div>
 <div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  dst[index] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
 <div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>}</div>
 </div>
 <div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span> </div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> IdxT = <span class="keywordtype">int</span>64_t&gt;</div>
 <div class="foldopen" id="foldopen00050" data-start="{" data-end="}">
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c">   50</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c">copy_g_nd2</a>(</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260">   50</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260">copy_g_nd2</a>(</div>
 <div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
 <div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>    device U* dst [[buffer(1)]],</div>
 <div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
 <div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>    uint2 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, src_strides);</div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>  int64_t dst_idx = index.x + (int64_t)grid_dim.x * index.y;</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;int64_t, IdxT&gt;</a>(index, src_strides);</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>  IdxT dst_idx = index.x + IdxT(grid_dim.x) * index.y;</div>
 <div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
 <div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>}</div>
 </div>
 <div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span> </div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> IdxT = <span class="keywordtype">int</span>64_t&gt;</div>
 <div class="foldopen" id="foldopen00062" data-start="{" data-end="}">
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff">   62</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff">copy_g_nd3</a>(</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc">   62</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc">copy_g_nd3</a>(</div>
 <div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
 <div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    device U* dst [[buffer(1)]],</div>
 <div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
 <div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>    uint3 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, src_strides);</div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>  int64_t dst_idx =</div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>      index.x + (int64_t)grid_dim.x * (index.y + (int64_t)grid_dim.y * index.z);</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;int64_t, IdxT&gt;</a>(index, src_strides);</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>  IdxT dst_idx =</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>      index.x + IdxT(grid_dim.x) * (index.y + IdxT(grid_dim.y) * index.z);</div>
 <div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
 <div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>}</div>
 </div>
 <div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span> </div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> N = 1&gt;</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> N = 1, <span class="keyword">typename</span> IdxT = <span class="keywordtype">int</span>64_t&gt;</div>
 <div class="foldopen" id="foldopen00075" data-start="{" data-end="}">
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36">   75</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36">copy_g</a>(</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf">   75</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf">copy_g</a>(</div>
 <div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
 <div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    device U* dst [[buffer(1)]],</div>
 <div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* src_shape [[buffer(2)]],</div>
@@ -188,114 +188,113 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim [[buffer(5)]],</div>
 <div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    uint3 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;int64_t, IdxT&gt;</a>(</div>
 <div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>      {N * index.x, index.y, index.z}, src_shape, src_strides, ndim);</div>
 <div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>  <span class="keywordflow">if</span> (N == 1) {</div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>    int64_t dst_idx =</div>
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>        index.x + grid_dim.x * (index.y + int64_t(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>    IdxT dst_idx =</div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>        index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);</div>
 <div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>    dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
 <div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>    <span class="keywordflow">return</span>;</div>
 <div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>  }</div>
 <div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>  <span class="keyword">auto</span> xshape = src_shape[ndim - 1];</div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>  int64_t dst_idx =</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>      N * index.x + xshape * (index.y + int64_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  <span class="keyword">auto</span> src_xstride = src_strides[ndim - 1];</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>    dst[dst_idx + i] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>    src_idx += src_xstride;</div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  }</div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>}</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>  IdxT dst_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>  <span class="keyword">auto</span> src_xstride = src_strides[ndim - 1];</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>    dst[dst_idx + i] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>    src_idx += src_xstride;</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>  }</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>}</div>
 </div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span> </div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
-<div class="foldopen" id="foldopen00102" data-start="{" data-end="}">
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">  102</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">copy_gg_nd1</a>(</div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    device U* dst [[buffer(1)]],</div>
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    constant <span class="keyword">const</span> int64_t&amp; src_stride [[buffer(3)]],</div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    constant <span class="keyword">const</span> int64_t&amp; dst_stride [[buffer(4)]],</div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>    uint index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, src_stride);</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>  <span class="keyword">auto</span> dst_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, dst_stride);</div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>}</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span> </div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
+<div class="foldopen" id="foldopen00101" data-start="{" data-end="}">
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">  101</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">copy_gg_nd1</a>(</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    device U* dst [[buffer(1)]],</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    constant <span class="keyword">const</span> int64_t&amp; src_stride [[buffer(3)]],</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    constant <span class="keyword">const</span> int64_t&amp; dst_stride [[buffer(4)]],</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    uint index [[thread_position_in_grid]]) {</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;int64_t, int&gt;</a>(index, src_stride);</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  <span class="keyword">auto</span> dst_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;int64_t, int&gt;</a>(index, dst_stride);</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>}</div>
 </div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span> </div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
-<div class="foldopen" id="foldopen00114" data-start="{" data-end="}">
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950">  114</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950">copy_gg_nd2</a>(</div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>    device U* dst [[buffer(1)]],</div>
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    constant <span class="keyword">const</span> int64_t* dst_strides [[buffer(4)]],</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    uint2 index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, src_strides);</div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  <span class="keyword">auto</span> dst_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, dst_strides);</div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>}</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span> </div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> IdxT = <span class="keywordtype">int</span>64_t&gt;</div>
+<div class="foldopen" id="foldopen00113" data-start="{" data-end="}">
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301">  113</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301">copy_gg_nd2</a>(</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>    device U* dst [[buffer(1)]],</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    constant <span class="keyword">const</span> int64_t* dst_strides [[buffer(4)]],</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    uint2 index [[thread_position_in_grid]]) {</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;int64_t, IdxT&gt;</a>(index, src_strides);</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  <span class="keyword">auto</span> dst_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;int64_t, IdxT&gt;</a>(index, dst_strides);</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>}</div>
 </div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span> </div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U&gt;</div>
-<div class="foldopen" id="foldopen00126" data-start="{" data-end="}">
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd">  126</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd">copy_gg_nd3</a>(</div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    device U* dst [[buffer(1)]],</div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    constant <span class="keyword">const</span> int64_t* dst_strides [[buffer(4)]],</div>
-<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    uint3 index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, src_strides);</div>
-<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  <span class="keyword">auto</span> dst_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, dst_strides);</div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>}</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span> </div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> IdxT = <span class="keywordtype">int</span>64_t&gt;</div>
+<div class="foldopen" id="foldopen00125" data-start="{" data-end="}">
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13">  125</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13">copy_gg_nd3</a>(</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    device U* dst [[buffer(1)]],</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    constant <span class="keyword">const</span> int64_t* dst_strides [[buffer(4)]],</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    uint3 index [[thread_position_in_grid]]) {</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>  <span class="keyword">auto</span> src_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;int64_t, IdxT&gt;</a>(index, src_strides);</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>  <span class="keyword">auto</span> dst_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;int64_t, IdxT&gt;</a>(index, dst_strides);</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  dst[dst_idx] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[src_idx]);</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>}</div>
 </div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span> </div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> N = 1&gt;</div>
-<div class="foldopen" id="foldopen00138" data-start="{" data-end="}">
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5">  138</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5">copy_gg</a>(</div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>    device U* dst [[buffer(1)]],</div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* src_shape [[buffer(2)]],</div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>    constant <span class="keyword">const</span> int64_t* dst_strides [[buffer(4)]],</div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim [[buffer(5)]],</div>
-<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>    uint3 index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a>(</div>
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>      {N * index.x, index.y, index.z},</div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>      src_shape,</div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>      src_strides,</div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>      dst_strides,</div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>      ndim);</div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>  <span class="keywordflow">if</span> (N == 1) {</div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    dst[idx.y] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[idx.x]);</div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>    <span class="keywordflow">return</span>;</div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>  }</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>  <span class="keyword">auto</span> src_xstride = src_strides[ndim - 1];</div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>  <span class="keyword">auto</span> dst_xstride = dst_strides[ndim - 1];</div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>  <span class="keyword">auto</span> xshape = src_shape[ndim - 1];</div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>    dst[idx.y] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[idx.x]);</div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    idx.x += src_xstride;</div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>    idx.y += dst_xstride;</div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>  }</div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>}</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span> </div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> N = 1, <span class="keyword">typename</span> IdxT = <span class="keywordtype">int</span>64_t&gt;</div>
+<div class="foldopen" id="foldopen00137" data-start="{" data-end="}">
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno"><a class="line" href="metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa">  137</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa">copy_gg</a>(</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>    device <span class="keyword">const</span> T* src [[buffer(0)]],</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    device U* dst [[buffer(1)]],</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* src_shape [[buffer(2)]],</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>    constant <span class="keyword">const</span> int64_t* src_strides [[buffer(3)]],</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>    constant <span class="keyword">const</span> int64_t* dst_strides [[buffer(4)]],</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim [[buffer(5)]],</div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>    uint3 index [[thread_position_in_grid]]) {</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd&lt;int64_t, IdxT&gt;</a>(</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>      {N * index.x, index.y, index.z},</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>      src_shape,</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>      src_strides,</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>      dst_strides,</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>      ndim);</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>  <span class="keywordflow">if</span> (N == 1) {</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>    dst[idx.y] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[idx.x]);</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>    <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>  }</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>  IdxT src_xstride = src_strides[ndim - 1];</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>  IdxT dst_xstride = dst_strides[ndim - 1];</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>  <span class="keyword">auto</span> xshape = src_shape[ndim - 1];</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    dst[idx.y] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(src[idx.x]);</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>    idx.x += src_xstride;</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    idx.y += dst_xstride;</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>  }</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a01c9309978a6c12f79b6e4108728a953"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const stride_t *a_strides, constant const stride_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:153</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a196a07022b812b241d4c06192c0fa83d"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_1(uint elem, constant const stride_t &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:133</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a2c34ed54714c69e6e1b44344f9e6e330"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_3(uint3 elem, constant const stride_t strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ad6c45cacca97899cd362df49c06fea79"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_2(uint2 elem, constant const stride_t strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:139</div></div>
-<div class="ttc" id="ametal_2kernels_2copy_8h_html_a1e39c2683eeaf05955e7619fbd34aea5"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5">copy_gg</a></div><div class="ttdeci">void copy_gg(device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int64_t *dst_strides, constant const int &amp;ndim, uint3 index)</div><div class="ttdef"><b>Definition</b> copy.h:138</div></div>
-<div class="ttc" id="ametal_2kernels_2copy_8h_html_a3278d9c999718bee3ccbe2922f501bf1"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">copy_gg_nd1</a></div><div class="ttdeci">void copy_gg_nd1(device const T *src, device U *dst, constant const int64_t &amp;src_stride, constant const int64_t &amp;dst_stride, uint index)</div><div class="ttdef"><b>Definition</b> copy.h:102</div></div>
-<div class="ttc" id="ametal_2kernels_2copy_8h_html_a3e2d3cc7f34f56170409b6735f51a950"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950">copy_gg_nd2</a></div><div class="ttdeci">void copy_gg_nd2(device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint2 index)</div><div class="ttdef"><b>Definition</b> copy.h:114</div></div>
-<div class="ttc" id="ametal_2kernels_2copy_8h_html_a59f43b5bffed936d7559ceb06a10aabd"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd">copy_gg_nd3</a></div><div class="ttdeci">void copy_gg_nd3(device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint3 index)</div><div class="ttdef"><b>Definition</b> copy.h:126</div></div>
-<div class="ttc" id="ametal_2kernels_2copy_8h_html_a778ce2dbfbaa23b24bd5efbe68448c36"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36">copy_g</a></div><div class="ttdeci">void copy_g(device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:75</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a43f33efc000962d6de881a3aab7458de"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const StrideT strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a650f8ea8cf9f9519da9e301aad0308dc"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const StrideT strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:150</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a66a2d7eec0262b12db16cd6c781ccf9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a">elem_to_loc_2_nd</a></div><div class="ttdeci">METAL_FUNC vec&lt; IdxT, 2 &gt; elem_to_loc_2_nd(uint3 elem, constant const int *shape, constant const StrideT *a_strides, constant const StrideT *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:159</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ac612d0ae30b8257198339debe04916a3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const StrideT &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:140</div></div>
+<div class="ttc" id="ametal_2kernels_2copy_8h_html_a3278d9c999718bee3ccbe2922f501bf1"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1">copy_gg_nd1</a></div><div class="ttdeci">void copy_gg_nd1(device const T *src, device U *dst, constant const int64_t &amp;src_stride, constant const int64_t &amp;dst_stride, uint index)</div><div class="ttdef"><b>Definition</b> copy.h:101</div></div>
+<div class="ttc" id="ametal_2kernels_2copy_8h_html_a39ec5b7b8351e4332b842982a2ee6260"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260">copy_g_nd2</a></div><div class="ttdeci">void copy_g_nd2(device const T *src, device U *dst, constant const int64_t *src_strides, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:50</div></div>
+<div class="ttc" id="ametal_2kernels_2copy_8h_html_a3f3836ad0b6545ec9b9e1864224f7a13"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13">copy_gg_nd3</a></div><div class="ttdeci">void copy_gg_nd3(device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint3 index)</div><div class="ttdef"><b>Definition</b> copy.h:125</div></div>
+<div class="ttc" id="ametal_2kernels_2copy_8h_html_a71e4103db4689d90ef6f9d5ba93604cf"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf">copy_g</a></div><div class="ttdeci">void copy_g(device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:75</div></div>
 <div class="ttc" id="ametal_2kernels_2copy_8h_html_a8023e9335cc5334847a8d315042be3a3"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3">copy_s2</a></div><div class="ttdeci">void copy_s2(device const T *src, device U *dst, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:20</div></div>
-<div class="ttc" id="ametal_2kernels_2copy_8h_html_a821f8f3f3891159a295c66fc25aed1ff"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff">copy_g_nd3</a></div><div class="ttdeci">void copy_g_nd3(device const T *src, device U *dst, constant const int64_t *src_strides, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:62</div></div>
+<div class="ttc" id="ametal_2kernels_2copy_8h_html_aab82689380897ff4716b5eafd6ef3ecc"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc">copy_g_nd3</a></div><div class="ttdeci">void copy_g_nd3(device const T *src, device U *dst, constant const int64_t *src_strides, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:62</div></div>
 <div class="ttc" id="ametal_2kernels_2copy_8h_html_aba4530a7db6a61ca36f50e4f5e58fb77"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77">copy_g_nd1</a></div><div class="ttdeci">void copy_g_nd1(device const T *src, device U *dst, constant const int64_t &amp;src_stride, uint index)</div><div class="ttdef"><b>Definition</b> copy.h:40</div></div>
+<div class="ttc" id="ametal_2kernels_2copy_8h_html_ade9a9eea9b8262a854a11721fe2bb9fa"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa">copy_gg</a></div><div class="ttdeci">void copy_gg(device const T *src, device U *dst, constant const int *src_shape, constant const int64_t *src_strides, constant const int64_t *dst_strides, constant const int &amp;ndim, uint3 index)</div><div class="ttdef"><b>Definition</b> copy.h:137</div></div>
 <div class="ttc" id="ametal_2kernels_2copy_8h_html_ae26a13e0c8e6c15f7b10078e65970659"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659">copy_v</a></div><div class="ttdeci">void copy_v(device const T *src, device U *dst, uint index)</div><div class="ttdef"><b>Definition</b> copy.h:12</div></div>
 <div class="ttc" id="ametal_2kernels_2copy_8h_html_aee14a5326f53d9b30b0b38e27d180ef3"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#aee14a5326f53d9b30b0b38e27d180ef3">copy_v2</a></div><div class="ttdeci">void copy_v2(device const T *src, device U *dst, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:30</div></div>
-<div class="ttc" id="ametal_2kernels_2copy_8h_html_aee678c7c31119f3e609685589f37490c"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c">copy_g_nd2</a></div><div class="ttdeci">void copy_g_nd2(device const T *src, device U *dst, constant const int64_t *src_strides, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> copy.h:50</div></div>
 <div class="ttc" id="ametal_2kernels_2copy_8h_html_aef09f9b9475345b1bba121d037d222ea"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea">copy_s</a></div><div class="ttdeci">void copy_s(device const T *src, device U *dst, uint index)</div><div class="ttdef"><b>Definition</b> copy.h:4</div></div>
+<div class="ttc" id="ametal_2kernels_2copy_8h_html_af0b06ac3a96852a64fa4274a94b58301"><div class="ttname"><a href="metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301">copy_gg_nd2</a></div><div class="ttdeci">void copy_gg_nd2(device const T *src, device U *dst, constant const int64_t *src_strides, constant const int64_t *dst_strides, uint2 index)</div><div class="ttdef"><b>Definition</b> copy.h:113</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/metal_2kernels_2hadamard_8h_source.html b/docs/build/html/metal_2kernels_2hadamard_8h_source.html
index c9c27c0fa..76b2c5e62 100644
--- a/docs/build/html/metal_2kernels_2hadamard_8h_source.html
+++ b/docs/build/html/metal_2kernels_2hadamard_8h_source.html
@@ -268,7 +268,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="ametal_2kernels_2hadamard_8h_html_a590e5366adc78bab4fe44e37885d413f"><div class="ttname"><a href="metal_2kernels_2hadamard_8h.html#a590e5366adc78bab4fe44e37885d413f">radix_func</a></div><div class="ttdeci">METAL_FUNC void radix_func(thread float *x)</div><div class="ttdef"><b>Definition</b> hadamard.h:11</div></div>
 <div class="ttc" id="ametal_2kernels_2hadamard_8h_html_a63c0e8510e555cd065e1f0ddfb33ce18"><div class="ttname"><a href="metal_2kernels_2hadamard_8h.html#a63c0e8510e555cd065e1f0ddfb33ce18">hadamard_n</a></div><div class="ttdeci">void hadamard_n(const device T *in, device T *out, constant const float &amp;scale, uint3 elem, uint3 grid)</div><div class="ttdef"><b>Definition</b> hadamard.h:30</div></div>
 <div class="ttc" id="ametal_2kernels_2hadamard_8h_html_ab0bd478f2051af35aed1869005e3370a"><div class="ttname"><a href="metal_2kernels_2hadamard_8h.html#ab0bd478f2051af35aed1869005e3370a">hadamard_m</a></div><div class="ttdeci">void hadamard_m(const device T *in, device T *out, constant const float &amp;scale, uint3 elem, uint3 grid)</div><div class="ttdef"><b>Definition</b> hadamard.h:127</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
 <div class="ttc" id="asteel_2defines_8h_html"><div class="ttname"><a href="steel_2defines_8h.html">defines.h</a></div></div>
 <div class="ttc" id="asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6"><div class="ttname"><a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define STEEL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> defines.h:4</div></div>
 </div><!-- fragment --></div><!-- contents -->
diff --git a/docs/build/html/metal_2kernels_2ternary_8h.html b/docs/build/html/metal_2kernels_2ternary_8h.html
index 5aed46f56..2a8ce1056 100644
--- a/docs/build/html/metal_2kernels_2ternary_8h.html
+++ b/docs/build/html/metal_2kernels_2ternary_8h.html
@@ -107,24 +107,24 @@ Functions</h2></td></tr>
 <tr class="memitem:a1bd5918559850f3f80e3adee2391fe6a" id="r_a1bd5918559850f3f80e3adee2391fe6a"><td class="memTemplParams" colspan="2">template&lt;typename T , typename Op &gt; </td></tr>
 <tr class="memitem:a1bd5918559850f3f80e3adee2391fe6a"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1bd5918559850f3f80e3adee2391fe6a">ternary_g_nd1</a> (device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t &amp;a_strides, constant const size_t &amp;b_strides, constant const size_t &amp;c_strides, uint index)</td></tr>
 <tr class="separator:a1bd5918559850f3f80e3adee2391fe6a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afdf0d9d0cb21fcb3f176500785076af8" id="r_afdf0d9d0cb21fcb3f176500785076af8"><td class="memTemplParams" colspan="2">template&lt;typename T , typename Op &gt; </td></tr>
-<tr class="memitem:afdf0d9d0cb21fcb3f176500785076af8"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#afdf0d9d0cb21fcb3f176500785076af8">ternary_g_nd2</a> (device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[2], constant const size_t b_strides[2], constant const size_t c_strides[2], uint2 index, uint2 grid_dim)</td></tr>
-<tr class="separator:afdf0d9d0cb21fcb3f176500785076af8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a113df0c8a841b0e986900d580644e047" id="r_a113df0c8a841b0e986900d580644e047"><td class="memTemplParams" colspan="2">template&lt;typename T , typename Op &gt; </td></tr>
-<tr class="memitem:a113df0c8a841b0e986900d580644e047"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a113df0c8a841b0e986900d580644e047">ternary_g_nd3</a> (device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[3], constant const size_t b_strides[3], constant const size_t c_strides[3], uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:a113df0c8a841b0e986900d580644e047"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adf8b5989de971e43829875dc0097cdfb" id="r_adf8b5989de971e43829875dc0097cdfb"><td class="memTemplParams" colspan="2">template&lt;typename T , typename Op , int N = 1&gt; </td></tr>
-<tr class="memitem:adf8b5989de971e43829875dc0097cdfb"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#adf8b5989de971e43829875dc0097cdfb">ternary_g</a> (device const bool *a, device const T *b, device const T *c, device T *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:adf8b5989de971e43829875dc0097cdfb"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:adec9ca8a8bf527cb15d70da5857af15d" id="r_adec9ca8a8bf527cb15d70da5857af15d"><td class="memTemplParams" colspan="2">template&lt;typename T , typename Op , typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:adec9ca8a8bf527cb15d70da5857af15d"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#adec9ca8a8bf527cb15d70da5857af15d">ternary_g_nd2</a> (device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[2], constant const size_t b_strides[2], constant const size_t c_strides[2], uint2 index, uint2 grid_dim)</td></tr>
+<tr class="separator:adec9ca8a8bf527cb15d70da5857af15d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a046dcbf67cd2318d45355dc7516e3ff4" id="r_a046dcbf67cd2318d45355dc7516e3ff4"><td class="memTemplParams" colspan="2">template&lt;typename T , typename Op , typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:a046dcbf67cd2318d45355dc7516e3ff4"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a046dcbf67cd2318d45355dc7516e3ff4">ternary_g_nd3</a> (device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[3], constant const size_t b_strides[3], constant const size_t c_strides[3], uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:a046dcbf67cd2318d45355dc7516e3ff4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ab2051fd944c2e24c57d5b4af54894d72" id="r_ab2051fd944c2e24c57d5b4af54894d72"><td class="memTemplParams" colspan="2">template&lt;typename T , typename Op , int N = 1, typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:ab2051fd944c2e24c57d5b4af54894d72"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ab2051fd944c2e24c57d5b4af54894d72">ternary_g</a> (device const bool *a, device const T *b, device const T *c, device T *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:ab2051fd944c2e24c57d5b4af54894d72"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="adf8b5989de971e43829875dc0097cdfb" name="adf8b5989de971e43829875dc0097cdfb"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#adf8b5989de971e43829875dc0097cdfb">&#9670;&#160;</a></span>ternary_g()</h2>
+<a id="ab2051fd944c2e24c57d5b4af54894d72" name="ab2051fd944c2e24c57d5b4af54894d72"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ab2051fd944c2e24c57d5b4af54894d72">&#9670;&#160;</a></span>ternary_g()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename Op , int N = 1&gt; </div>
+template&lt;typename T , typename Op , int N = 1, typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void ternary_g </td>
@@ -239,13 +239,13 @@ template&lt;typename T , typename Op &gt; </div>
 
 </div>
 </div>
-<a id="afdf0d9d0cb21fcb3f176500785076af8" name="afdf0d9d0cb21fcb3f176500785076af8"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#afdf0d9d0cb21fcb3f176500785076af8">&#9670;&#160;</a></span>ternary_g_nd2()</h2>
+<a id="adec9ca8a8bf527cb15d70da5857af15d" name="adec9ca8a8bf527cb15d70da5857af15d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#adec9ca8a8bf527cb15d70da5857af15d">&#9670;&#160;</a></span>ternary_g_nd2()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename Op &gt; </div>
+template&lt;typename T , typename Op , typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void ternary_g_nd2 </td>
@@ -297,13 +297,13 @@ template&lt;typename T , typename Op &gt; </div>
 
 </div>
 </div>
-<a id="a113df0c8a841b0e986900d580644e047" name="a113df0c8a841b0e986900d580644e047"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a113df0c8a841b0e986900d580644e047">&#9670;&#160;</a></span>ternary_g_nd3()</h2>
+<a id="a046dcbf67cd2318d45355dc7516e3ff4" name="a046dcbf67cd2318d45355dc7516e3ff4"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a046dcbf67cd2318d45355dc7516e3ff4">&#9670;&#160;</a></span>ternary_g_nd3()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename Op &gt; </div>
+template&lt;typename T , typename Op , typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void ternary_g_nd3 </td>
diff --git a/docs/build/html/metal_2kernels_2ternary_8h_source.html b/docs/build/html/metal_2kernels_2ternary_8h_source.html
index 08b491196..9083ba23f 100644
--- a/docs/build/html/metal_2kernels_2ternary_8h_source.html
+++ b/docs/build/html/metal_2kernels_2ternary_8h_source.html
@@ -130,16 +130,16 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>&amp; b_strides,</div>
 <div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>&amp; c_strides,</div>
 <div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    uint index [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, a_strides);</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, b_strides);</div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keyword">auto</span> c_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a>(index, c_strides);</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;size_t, uint&gt;</a>(index, a_strides);</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;size_t, uint&gt;</a>(index, b_strides);</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keyword">auto</span> c_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1&lt;size_t, uint&gt;</a>(index, c_strides);</div>
 <div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  d[index] = Op()(a[a_idx], b[b_idx], c[c_idx]);</div>
 <div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>}</div>
 </div>
 <div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span> </div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> Op&gt;</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
 <div class="foldopen" id="foldopen00042" data-start="{" data-end="}">
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno"><a class="line" href="metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8">   42</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8">ternary_g_nd2</a>(</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno"><a class="line" href="metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d">   42</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d">ternary_g_nd2</a>(</div>
 <div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>    device <span class="keyword">const</span> <span class="keywordtype">bool</span>* a,</div>
 <div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>    device <span class="keyword">const</span> T* b,</div>
 <div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>    device <span class="keyword">const</span> T* c,</div>
@@ -149,17 +149,17 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span> c_strides[2],</div>
 <div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>    uint2 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, a_strides);</div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, b_strides);</div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>  <span class="keyword">auto</span> c_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a>(index, c_strides);</div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>  <span class="keywordtype">size_t</span> out_idx = index.x + size_t(grid_dim.x) * index.y;</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;size_t, IdxT&gt;</a>(index, a_strides);</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;size_t, IdxT&gt;</a>(index, b_strides);</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>  <span class="keyword">auto</span> c_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2&lt;size_t, IdxT&gt;</a>(index, c_strides);</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>  IdxT out_idx = index.x + IdxT(grid_dim.x) * index.y;</div>
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  d[out_idx] = Op()(a[a_idx], b[b_idx], c[c_idx]);</div>
 <div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>}</div>
 </div>
 <div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span> </div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> Op&gt;</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
 <div class="foldopen" id="foldopen00060" data-start="{" data-end="}">
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno"><a class="line" href="metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047">   60</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047">ternary_g_nd3</a>(</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno"><a class="line" href="metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4">   60</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4">ternary_g_nd3</a>(</div>
 <div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>    device <span class="keyword">const</span> <span class="keywordtype">bool</span>* a,</div>
 <div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>    device <span class="keyword">const</span> T* b,</div>
 <div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    device <span class="keyword">const</span> T* c,</div>
@@ -169,60 +169,58 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span> c_strides[3],</div>
 <div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    uint3 index [[thread_position_in_grid]],</div>
 <div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, a_strides);</div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, b_strides);</div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>  <span class="keyword">auto</span> c_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a>(index, c_strides);</div>
-<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  <span class="keywordtype">size_t</span> out_idx =</div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>      index.x + grid_dim.x * (index.y + size_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>  d[out_idx] = Op()(a[a_idx], b[b_idx], c[c_idx]);</div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>}</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>  <span class="keyword">auto</span> a_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;size_t, IdxT&gt;</a>(index, a_strides);</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  <span class="keyword">auto</span> b_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;size_t, IdxT&gt;</a>(index, b_strides);</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>  <span class="keyword">auto</span> c_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3&lt;size_t, IdxT&gt;</a>(index, c_strides);</div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  IdxT out_idx = index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>  d[out_idx] = Op()(a[a_idx], b[b_idx], c[c_idx]);</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>}</div>
 </div>
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span> </div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> N = 1&gt;</div>
-<div class="foldopen" id="foldopen00079" data-start="{" data-end="}">
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno"><a class="line" href="metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb">   79</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb">ternary_g</a>(</div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>    device <span class="keyword">const</span> <span class="keywordtype">bool</span>* a,</div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    device <span class="keyword">const</span> T* b,</div>
-<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    device <span class="keyword">const</span> T* c,</div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    device T* d,</div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* c_strides,</div>
-<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>    uint3 index [[thread_position_in_grid]],</div>
-<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b">elem_to_loc_3_nd</a>(</div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>      {N * index.x, index.y, index.z},</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>      shape,</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>      a_strides,</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>      b_strides,</div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>      c_strides,</div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>      ndim);</div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  <span class="keyword">auto</span> xshape = shape[ndim - 1];</div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>  <span class="keywordtype">size_t</span> out_idx =</div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>      N * index.x + xshape * (index.y + size_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>  <span class="keyword">auto</span> a_xstride = a_strides[ndim - 1];</div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>  <span class="keyword">auto</span> b_xstride = b_strides[ndim - 1];</div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>  <span class="keyword">auto</span> c_xstride = c_strides[ndim - 1];</div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    d[out_idx++] = Op()(a[idx.x], b[idx.y], c[idx.z]);</div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    idx.x += a_xstride;</div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>    idx.y += b_xstride;</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>    idx.z += c_xstride;</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>  }</div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>}</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span> </div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> N = 1, <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
+<div class="foldopen" id="foldopen00078" data-start="{" data-end="}">
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno"><a class="line" href="metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72">   78</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72">ternary_g</a>(</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    device <span class="keyword">const</span> <span class="keywordtype">bool</span>* a,</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>    device <span class="keyword">const</span> T* b,</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    device <span class="keyword">const</span> T* c,</div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    device T* d,</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* shape,</div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* a_strides,</div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* b_strides,</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* c_strides,</div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>    uint3 index [[thread_position_in_grid]],</div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733">elem_to_loc_3_nd&lt;IdxT&gt;</a>(</div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>      {N * index.x, index.y, index.z},</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>      shape,</div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>      a_strides,</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>      b_strides,</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>      c_strides,</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>      ndim);</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>  <span class="keyword">auto</span> xshape = shape[ndim - 1];</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  IdxT out_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>  IdxT a_xstride = a_strides[ndim - 1];</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>  IdxT b_xstride = b_strides[ndim - 1];</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>  IdxT c_xstride = c_strides[ndim - 1];</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    d[out_idx++] = Op()(a[idx.x], b[idx.y], c[idx.z]);</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    idx.x += a_xstride;</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    idx.y += b_xstride;</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    idx.z += c_xstride;</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>  }</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a196a07022b812b241d4c06192c0fa83d"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_1(uint elem, constant const stride_t &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:133</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a2c34ed54714c69e6e1b44344f9e6e330"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_3(uint3 elem, constant const stride_t strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a66940b1cc3d64651d24634bc696d528b"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b">elem_to_loc_3_nd</a></div><div class="ttdeci">METAL_FUNC ulong3 elem_to_loc_3_nd(uint3 elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:171</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ad6c45cacca97899cd362df49c06fea79"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc_2(uint2 elem, constant const stride_t strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:139</div></div>
-<div class="ttc" id="ametal_2kernels_2ternary_8h_html_a113df0c8a841b0e986900d580644e047"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047">ternary_g_nd3</a></div><div class="ttdeci">void ternary_g_nd3(device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[3], constant const size_t b_strides[3], constant const size_t c_strides[3], uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> ternary.h:60</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a43f33efc000962d6de881a3aab7458de"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de">elem_to_loc_2</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_2(uint2 elem, constant const StrideT strides[2])</div><div class="ttdef"><b>Definition</b> utils.h:145</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a650f8ea8cf9f9519da9e301aad0308dc"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc">elem_to_loc_3</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_3(uint3 elem, constant const StrideT strides[3])</div><div class="ttdef"><b>Definition</b> utils.h:150</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a65d87b425e1f8ca19df97c15049f8733"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733">elem_to_loc_3_nd</a></div><div class="ttdeci">METAL_FUNC vec&lt; IdxT, 3 &gt; elem_to_loc_3_nd(uint3 elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:182</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_ac612d0ae30b8257198339debe04916a3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3">elem_to_loc_1</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc_1(uint elem, constant const StrideT &amp;stride)</div><div class="ttdef"><b>Definition</b> utils.h:140</div></div>
+<div class="ttc" id="ametal_2kernels_2ternary_8h_html_a046dcbf67cd2318d45355dc7516e3ff4"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4">ternary_g_nd3</a></div><div class="ttdeci">void ternary_g_nd3(device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[3], constant const size_t b_strides[3], constant const size_t c_strides[3], uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> ternary.h:60</div></div>
 <div class="ttc" id="ametal_2kernels_2ternary_8h_html_a1bd5918559850f3f80e3adee2391fe6a"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#a1bd5918559850f3f80e3adee2391fe6a">ternary_g_nd1</a></div><div class="ttdeci">void ternary_g_nd1(device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t &amp;a_strides, constant const size_t &amp;b_strides, constant const size_t &amp;c_strides, uint index)</div><div class="ttdef"><b>Definition</b> ternary.h:26</div></div>
 <div class="ttc" id="ametal_2kernels_2ternary_8h_html_a3e610f3b01966bdbf23fdfebe5d2c508"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#a3e610f3b01966bdbf23fdfebe5d2c508">ternary_v2</a></div><div class="ttdeci">void ternary_v2(device const bool *a, device const T *b, device const T *c, device T *d, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> ternary.h:14</div></div>
 <div class="ttc" id="ametal_2kernels_2ternary_8h_html_a83f93644d21ee774e06e8190d0725ccb"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#a83f93644d21ee774e06e8190d0725ccb">ternary_v</a></div><div class="ttdeci">void ternary_v(device const bool *a, device const T *b, device const T *c, device T *d, uint index)</div><div class="ttdef"><b>Definition</b> ternary.h:4</div></div>
-<div class="ttc" id="ametal_2kernels_2ternary_8h_html_adf8b5989de971e43829875dc0097cdfb"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb">ternary_g</a></div><div class="ttdeci">void ternary_g(device const bool *a, device const T *b, device const T *c, device T *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> ternary.h:79</div></div>
-<div class="ttc" id="ametal_2kernels_2ternary_8h_html_afdf0d9d0cb21fcb3f176500785076af8"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8">ternary_g_nd2</a></div><div class="ttdeci">void ternary_g_nd2(device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[2], constant const size_t b_strides[2], constant const size_t c_strides[2], uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> ternary.h:42</div></div>
+<div class="ttc" id="ametal_2kernels_2ternary_8h_html_ab2051fd944c2e24c57d5b4af54894d72"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72">ternary_g</a></div><div class="ttdeci">void ternary_g(device const bool *a, device const T *b, device const T *c, device T *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> ternary.h:78</div></div>
+<div class="ttc" id="ametal_2kernels_2ternary_8h_html_adec9ca8a8bf527cb15d70da5857af15d"><div class="ttname"><a href="metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d">ternary_g_nd2</a></div><div class="ttdeci">void ternary_g_nd2(device const bool *a, device const T *b, device const T *c, device T *d, constant const size_t a_strides[2], constant const size_t b_strides[2], constant const size_t c_strides[2], uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> ternary.h:42</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/metal_2kernels_2unary_8h.html b/docs/build/html/metal_2kernels_2unary_8h.html
index 40ff138c8..6b97691e9 100644
--- a/docs/build/html/metal_2kernels_2unary_8h.html
+++ b/docs/build/html/metal_2kernels_2unary_8h.html
@@ -104,18 +104,18 @@ Functions</h2></td></tr>
 <tr class="memitem:a7c7690f0df9d2acc60b63be58d9c7777" id="r_a7c7690f0df9d2acc60b63be58d9c7777"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op &gt; </td></tr>
 <tr class="memitem:a7c7690f0df9d2acc60b63be58d9c7777"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a7c7690f0df9d2acc60b63be58d9c7777">unary_v2</a> (device const T *in, device U *out, uint2 index, uint2 grid_dim)</td></tr>
 <tr class="separator:a7c7690f0df9d2acc60b63be58d9c7777"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac965f8d3ed62f8580dbfb645e83d4ae5" id="r_ac965f8d3ed62f8580dbfb645e83d4ae5"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N = 1&gt; </td></tr>
-<tr class="memitem:ac965f8d3ed62f8580dbfb645e83d4ae5"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac965f8d3ed62f8580dbfb645e83d4ae5">unary_g</a> (device const T *in, device U *out, constant const int *in_shape, constant const size_t *in_strides, device const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
-<tr class="separator:ac965f8d3ed62f8580dbfb645e83d4ae5"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac2a85fee50af49620ff62c1a71e2575d" id="r_ac2a85fee50af49620ff62c1a71e2575d"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N = 1, typename IdxT  = size_t&gt; </td></tr>
+<tr class="memitem:ac2a85fee50af49620ff62c1a71e2575d"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac2a85fee50af49620ff62c1a71e2575d">unary_g</a> (device const T *in, device U *out, constant const int *in_shape, constant const size_t *in_strides, device const int &amp;ndim, uint3 index, uint3 grid_dim)</td></tr>
+<tr class="separator:ac2a85fee50af49620ff62c1a71e2575d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="ac965f8d3ed62f8580dbfb645e83d4ae5" name="ac965f8d3ed62f8580dbfb645e83d4ae5"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ac965f8d3ed62f8580dbfb645e83d4ae5">&#9670;&#160;</a></span>unary_g()</h2>
+<a id="ac2a85fee50af49620ff62c1a71e2575d" name="ac2a85fee50af49620ff62c1a71e2575d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ac2a85fee50af49620ff62c1a71e2575d">&#9670;&#160;</a></span>unary_g()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int N = 1&gt; </div>
+template&lt;typename T , typename U , typename Op , int N = 1, typename IdxT  = size_t&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void unary_g </td>
diff --git a/docs/build/html/metal_2kernels_2unary_8h_source.html b/docs/build/html/metal_2kernels_2unary_8h_source.html
index 57d71ace4..96e7ea964 100644
--- a/docs/build/html/metal_2kernels_2unary_8h_source.html
+++ b/docs/build/html/metal_2kernels_2unary_8h_source.html
@@ -115,32 +115,36 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>}</div>
 </div>
 <div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span> </div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> N = 1&gt;</div>
-<div class="foldopen" id="foldopen00022" data-start="{" data-end="}">
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno"><a class="line" href="metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5">   22</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5">unary_g</a>(</div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>    device <span class="keyword">const</span> T* in,</div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>    device U* out,</div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* in_shape,</div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* in_strides,</div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>    device <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>    uint3 index [[thread_position_in_grid]],</div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  <span class="keyword">auto</span> idx =</div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>      <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>({N * index.x, index.y, index.z}, in_shape, in_strides, ndim);</div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>  <span class="keyword">auto</span> xshape = in_shape[ndim - 1];</div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  <span class="keyword">auto</span> xstride = in_strides[ndim - 1];</div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>  <span class="keywordtype">size_t</span> out_idx =</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>      N * index.x + xshape * (index.y + size_t(grid_dim.y) * index.z);</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>    out[out_idx++] = Op()(in[idx]);</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>    idx += xstride;</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  }</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>}</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>    <span class="keywordtype">int</span> N = 1,</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>    <span class="keyword">typename</span> IdxT = <span class="keywordtype">size_t</span>&gt;</div>
+<div class="foldopen" id="foldopen00027" data-start="{" data-end="}">
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d">   27</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d">unary_g</a>(</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>    device <span class="keyword">const</span> T* in,</div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>    device U* out,</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>    constant <span class="keyword">const</span> <span class="keywordtype">int</span>* in_shape,</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>    constant <span class="keyword">const</span> <span class="keywordtype">size_t</span>* in_strides,</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    device <span class="keyword">const</span> <span class="keywordtype">int</span>&amp; ndim,</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>    uint3 index [[thread_position_in_grid]],</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    uint3 grid_dim [[threads_per_grid]]) {</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  <span class="keyword">auto</span> idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>      {N * index.x, index.y, index.z}, in_shape, in_strides, ndim);</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keyword">auto</span> xshape = in_shape[ndim - 1];</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  IdxT xstride = in_strides[ndim - 1];</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  IdxT out_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N &amp;&amp; (int(N * index.x) + i) &lt; xshape; ++i) {</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>    out[out_idx++] = Op()(in[idx]);</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>    idx += xstride;</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>  }</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
 <div class="ttc" id="ametal_2kernels_2unary_8h_html_a64e4f6737edddb72122e262977ee3014"><div class="ttname"><a href="metal_2kernels_2unary_8h.html#a64e4f6737edddb72122e262977ee3014">unary_v</a></div><div class="ttdeci">void unary_v(device const T *in, device U *out, uint index)</div><div class="ttdef"><b>Definition</b> unary.h:4</div></div>
 <div class="ttc" id="ametal_2kernels_2unary_8h_html_a7c7690f0df9d2acc60b63be58d9c7777"><div class="ttname"><a href="metal_2kernels_2unary_8h.html#a7c7690f0df9d2acc60b63be58d9c7777">unary_v2</a></div><div class="ttdeci">void unary_v2(device const T *in, device U *out, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> unary.h:12</div></div>
-<div class="ttc" id="ametal_2kernels_2unary_8h_html_ac965f8d3ed62f8580dbfb645e83d4ae5"><div class="ttname"><a href="metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5">unary_g</a></div><div class="ttdeci">void unary_g(device const T *in, device U *out, constant const int *in_shape, constant const size_t *in_strides, device const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> unary.h:22</div></div>
+<div class="ttc" id="ametal_2kernels_2unary_8h_html_ac2a85fee50af49620ff62c1a71e2575d"><div class="ttname"><a href="metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d">unary_g</a></div><div class="ttdeci">void unary_g(device const T *in, device U *out, constant const int *in_shape, constant const size_t *in_strides, device const int &amp;ndim, uint3 index, uint3 grid_dim)</div><div class="ttdef"><b>Definition</b> unary.h:27</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/metal_2reduce_8h_source.html b/docs/build/html/metal_2reduce_8h_source.html
index 79e30d6a2..375474ec7 100644
--- a/docs/build/html/metal_2reduce_8h_source.html
+++ b/docs/build/html/metal_2reduce_8h_source.html
@@ -134,7 +134,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="abackend_2metal_2device_8h_html"><div class="ttname"><a href="backend_2metal_2device_8h.html">device.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:131</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1metal_1_1_device_html"><div class="ttname"><a href="classmlx_1_1core_1_1metal_1_1_device.html">mlx::core::metal::Device</a></div><div class="ttdef"><b>Definition</b> device.h:158</div></div>
 <div class="ttc" id="acommon_2reduce_8h_html"><div class="ttname"><a href="common_2reduce_8h.html">reduce.h</a></div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a3ab0fd997d9a35782106ff083a72e098"><div class="ttname"><a href="namespacemlx_1_1core.html#a3ab0fd997d9a35782106ff083a72e098">mlx::core::all_reduce_dispatch</a></div><div class="ttdeci">void all_reduce_dispatch(const array &amp;in, array &amp;out, const std::string &amp;op_name, CommandEncoder &amp;compute_encoder, metal::Device &amp;d, const Stream &amp;s)</div></div>
diff --git a/docs/build/html/namespacemembers_b.html b/docs/build/html/namespacemembers_b.html
index a25b68434..3ad450677 100644
--- a/docs/build/html/namespacemembers_b.html
+++ b/docs/build/html/namespacemembers_b.html
@@ -90,6 +90,7 @@ $(function(){ initResizable(false); });
 <li>BACKWARD&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail.html#a9d1eaa7469c018c39e745733eab9a9c3">pocketfft::detail</a></li>
 <li>bernoulli()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1random.html#acb3f278fea2c4f06dea947d3bac2e9b7">mlx::core::random</a></li>
 <li>bfloat16&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a514cf8b4e6f0a6af3a867e752f4338f7">mlx::core</a></li>
+<li>bfs_max_width()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2">mlx::core::env</a></li>
 <li>binary()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#a269d591ec02e2f7c0f7a718fbfa37f73">mlx::core::metal</a></li>
 <li>binary_op_gpu()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ad884f4a36308b5b4f8a5d990d2e086df">mlx::core</a></li>
 <li>binary_op_gpu_inplace()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a8616c0b7b0fc118a75400bc86404c367">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_c.html b/docs/build/html/namespacemembers_c.html
index ec3175b69..97ce6623a 100644
--- a/docs/build/html/namespacemembers_c.html
+++ b/docs/build/html/namespacemembers_c.html
@@ -111,10 +111,11 @@ $(function(){ initResizable(false); });
 <li>CompileMode&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#adb15ff2b1ca5207fd4f6e631e2c3bcb4">mlx::core</a></li>
 <li>complex64&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#af99db87e0078bfcdb383f5689bc874d4">mlx::core</a></li>
 <li>complexfloating&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a70b8e88c9df750af984757105af33423">mlx::core</a></li>
-<li>concatenate()&#160;:&#160;<a class="el" href="group__ops.html#gabdc36fa65697d0361c8d67495de77129">mlx::core</a></li>
+<li>concatenate()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">mlx::core</a></li>
 <li>concatenate_gpu()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a050299d0d366ca5c9d09d1004dcc3e7d">mlx::core</a></li>
 <li>conj()&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail.html#a66d79051d502046a9b9f103e744dbad3">pocketfft::detail</a></li>
 <li>conjugate()&#160;:&#160;<a class="el" href="group__ops.html#ga5b596906bf8cdc8d97ed6ddc9aeb4c23">mlx::core</a></li>
+<li>contiguous()&#160;:&#160;<a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">mlx::core</a></li>
 <li>ContiguousAllReduce&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ae4e34c7154eb8dc47aa8503209730424">mlx::core</a></li>
 <li>ContiguousReduce&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ad2547f25dffe8d8936dbec25601cfc84">mlx::core</a></li>
 <li>ContiguousStridedReduce&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ab48dac7508a2c790de1bdc33f29177ed">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_func_b.html b/docs/build/html/namespacemembers_func_b.html
index b6a92df32..e32661a98 100644
--- a/docs/build/html/namespacemembers_func_b.html
+++ b/docs/build/html/namespacemembers_func_b.html
@@ -88,6 +88,7 @@ $(function(){ initResizable(false); });
 
 <h3><a id="index_b" name="index_b"></a>- b -</h3><ul>
 <li>bernoulli()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1random.html#acb3f278fea2c4f06dea947d3bac2e9b7">mlx::core::random</a></li>
+<li>bfs_max_width()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2">mlx::core::env</a></li>
 <li>binary()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#a269d591ec02e2f7c0f7a718fbfa37f73">mlx::core::metal</a></li>
 <li>binary_op_gpu()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ad884f4a36308b5b4f8a5d990d2e086df">mlx::core</a></li>
 <li>binary_op_gpu_inplace()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a8616c0b7b0fc118a75400bc86404c367">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_func_c.html b/docs/build/html/namespacemembers_func_c.html
index 6acac05f6..09372abba 100644
--- a/docs/build/html/namespacemembers_func_c.html
+++ b/docs/build/html/namespacemembers_func_c.html
@@ -106,10 +106,11 @@ $(function(){ initResizable(false); });
 <li>compile_erase()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1detail.html#a69eb76a14f845ca000f1ccb2edda0175">mlx::core::detail</a></li>
 <li>compiled_allocate_outputs()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ab8c3c4fc05745f586de922c8266f4fce">mlx::core</a></li>
 <li>compiled_check_contiguity()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a3b900ab319948c5a01a3ecd30a709027">mlx::core</a></li>
-<li>concatenate()&#160;:&#160;<a class="el" href="group__ops.html#gabdc36fa65697d0361c8d67495de77129">mlx::core</a></li>
+<li>concatenate()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">mlx::core</a></li>
 <li>concatenate_gpu()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a050299d0d366ca5c9d09d1004dcc3e7d">mlx::core</a></li>
 <li>conj()&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail.html#a66d79051d502046a9b9f103e744dbad3">pocketfft::detail</a></li>
 <li>conjugate()&#160;:&#160;<a class="el" href="group__ops.html#ga5b596906bf8cdc8d97ed6ddc9aeb4c23">mlx::core</a></li>
+<li>contiguous()&#160;:&#160;<a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">mlx::core</a></li>
 <li>conv()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#ab1704e853394c725668c06752ebb5c24">mlx::core::metal</a></li>
 <li>conv1d()&#160;:&#160;<a class="el" href="group__ops.html#ga30d47e08093c03a3676f235f9f559411">mlx::core</a></li>
 <li>conv2d()&#160;:&#160;<a class="el" href="group__ops.html#ga73b02833229678786e7f302d458d5a83">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_func_g.html b/docs/build/html/namespacemembers_func_g.html
index 1c777baeb..589a5f8b1 100644
--- a/docs/build/html/namespacemembers_func_g.html
+++ b/docs/build/html/namespacemembers_func_g.html
@@ -112,8 +112,8 @@ $(function(){ initResizable(false); });
 <li>get_pool()&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html#a7ec2b3f99232bd0f15f7b022c59d139a">pocketfft::detail::threading</a></li>
 <li>get_primitive_string()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">mlx::core</a></li>
 <li>get_quantized_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e">mlx::core</a></li>
-<li>get_reduce_init_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647">mlx::core</a></li>
-<li>get_reduce_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b">mlx::core</a></li>
+<li>get_reduce_init_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299">mlx::core</a></li>
+<li>get_reduce_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49">mlx::core</a></li>
 <li>get_reduction_plan()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ac97b5a6f009ca3d99854ce9512c20dba">mlx::core</a></li>
 <li>get_scan_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aeefaff208444d3fa61ecc0946fe1de5f">mlx::core</a></li>
 <li>get_shape()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aab0d8a256957984acc1e3615c65c898e">mlx::core</a></li>
@@ -129,6 +129,7 @@ $(function(){ initResizable(false); });
 <li>get_ternary_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a54eb3b65375022428aab5f810e40624b">mlx::core</a></li>
 <li>get_type_string()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#af776fd91dd60594dcfebbafd17f19068">mlx::core</a></li>
 <li>get_unary_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#afbb085188b563a54606d84f87a9bf5a6">mlx::core</a></li>
+<li>get_var()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">mlx::core::env</a></li>
 <li>gguf_load_quantized()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a65dd68163bdaef3631e3724327782498">mlx::core</a></li>
 <li>grad()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a3d2b2929ed4636e9e2b86e125b2e57d9">mlx::core</a></li>
 <li>greater()&#160;:&#160;<a class="el" href="group__ops.html#gaf4ec7bfc1ad13b891f1f3ef1772ef04d">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_func_m.html b/docs/build/html/namespacemembers_func_m.html
index 3f374fc03..87d8d0e00 100644
--- a/docs/build/html/namespacemembers_func_m.html
+++ b/docs/build/html/namespacemembers_func_m.html
@@ -96,6 +96,7 @@ $(function(){ initResizable(false); });
 <li>matmul()&#160;:&#160;<a class="el" href="group__ops.html#ga753d59f5a9f5f2362865ee83b4dced2a">mlx::core</a></li>
 <li>max()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a747e2e58092a27fb8b4dd3d16934fb52">metal::fast</a>, <a class="el" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#a6a954a4e4e3753303d1dc734855a185f">metal::precise</a>, <a class="el" href="group__ops.html#ga7fed87d96cc7741d8267f4eac83f5fe7">mlx::core</a></li>
 <li>max3()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a6fc2cf18ffa8149561864c86dba0f803">metal::fast</a>, <a class="el" href="namespacemetal.html#a00f9c0ad66d969794614f56912eed9c9">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#ac490e8614ebd2c9343af1ae6c0d4e82c">metal::precise</a></li>
+<li>max_ops_per_buffer()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa">mlx::core::env</a></li>
 <li>maximum()&#160;:&#160;<a class="el" href="group__ops.html#ga7ade2ea305e2e4219c3609443fb5db8d">mlx::core</a></li>
 <li>mean()&#160;:&#160;<a class="el" href="group__ops.html#gade46e768fd46b8b640eb16f26abeecef">mlx::core</a></li>
 <li>median3()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a742b55f1e4369921ee7f60d70185bfbc">metal::fast</a>, <a class="el" href="namespacemetal.html#aa3ff49457ce3c93fc1c0897fd1525157">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#a14555ff99c4388493fec48e070144ae2">metal::precise</a></li>
@@ -104,6 +105,7 @@ $(function(){ initResizable(false); });
 <li>min()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61">metal::fast</a>, <a class="el" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e">metal::precise</a>, <a class="el" href="group__ops.html#gab27599802617a4c8f9964ab5f4ffee12">mlx::core</a></li>
 <li>min3()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f">metal::fast</a>, <a class="el" href="namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231">metal::precise</a></li>
 <li>minimum()&#160;:&#160;<a class="el" href="group__ops.html#ga49ba00c090f81f331c91b0c97040bce0">mlx::core</a></li>
+<li>move_or_copy()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2">mlx::core</a></li>
 <li>moveaxis()&#160;:&#160;<a class="el" href="group__ops.html#ga24067d10a842db2c9d509ea48135a2c3">mlx::core</a></li>
 <li>MPINPLACE()&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail.html#af5eedf3cdfc83c0a30807092c39a9ce2">pocketfft::detail</a></li>
 <li>multiply()&#160;:&#160;<a class="el" href="group__ops.html#gaf57392e641640b5d06e4c99518391c38">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_func_s.html b/docs/build/html/namespacemembers_func_s.html
index df9da014d..e4f14791d 100644
--- a/docs/build/html/namespacemembers_func_s.html
+++ b/docs/build/html/namespacemembers_func_s.html
@@ -105,7 +105,6 @@ $(function(){ initResizable(false); });
 <li>set_default_device()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a312a2de41367fe52caeaf8c0f596a120">mlx::core</a></li>
 <li>set_default_stream()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#af35a2b06517d8bb7dbb469692b4f841c">mlx::core</a></li>
 <li>set_memory_limit()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#a3fb2c4a237fa4bfdff798156146c4937">mlx::core::metal</a></li>
-<li>set_vector_bytes()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">mlx::core</a></li>
 <li>set_wired_limit()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#a31eab4828d31d292bc84e07b0d961e1e">mlx::core::metal</a></li>
 <li>shapes_without_reduction_axes()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a44c3ea6db6553c3f6552b9ba64a69494">mlx::core</a></li>
 <li>shared_buffer_slice()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aea2a6a4eddfd4cfac89d20786059de2a">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_func_t.html b/docs/build/html/namespacemembers_func_t.html
index 1a7e5d089..2526b0acf 100644
--- a/docs/build/html/namespacemembers_func_t.html
+++ b/docs/build/html/namespacemembers_func_t.html
@@ -114,7 +114,7 @@ $(function(){ initResizable(false); });
 <li>triu()&#160;:&#160;<a class="el" href="group__ops.html#gaa9df5917876eeb0cb28b7fa81f880412">mlx::core</a></li>
 <li>trunc()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415">metal::fast</a>, <a class="el" href="namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27">metal::precise</a>, <a class="el" href="namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887">metal</a></li>
 <li>truncated_normal()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1random.html#a00aa5746bac6d729d2ba9465153bb279">mlx::core::random</a></li>
-<li>type_to_name()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae">mlx::core</a></li>
+<li>type_to_name()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164">mlx::core</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/namespacemembers_g.html b/docs/build/html/namespacemembers_g.html
index 0bbdf9db4..996a985c3 100644
--- a/docs/build/html/namespacemembers_g.html
+++ b/docs/build/html/namespacemembers_g.html
@@ -116,8 +116,8 @@ $(function(){ initResizable(false); });
 <li>get_pool()&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html#a7ec2b3f99232bd0f15f7b022c59d139a">pocketfft::detail::threading</a></li>
 <li>get_primitive_string()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60">mlx::core</a></li>
 <li>get_quantized_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e">mlx::core</a></li>
-<li>get_reduce_init_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647">mlx::core</a></li>
-<li>get_reduce_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b">mlx::core</a></li>
+<li>get_reduce_init_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299">mlx::core</a></li>
+<li>get_reduce_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49">mlx::core</a></li>
 <li>get_reduction_plan()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#ac97b5a6f009ca3d99854ce9512c20dba">mlx::core</a></li>
 <li>get_scan_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aeefaff208444d3fa61ecc0946fe1de5f">mlx::core</a></li>
 <li>get_shape()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aab0d8a256957984acc1e3615c65c898e">mlx::core</a></li>
@@ -133,6 +133,7 @@ $(function(){ initResizable(false); });
 <li>get_ternary_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a54eb3b65375022428aab5f810e40624b">mlx::core</a></li>
 <li>get_type_string()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#af776fd91dd60594dcfebbafd17f19068">mlx::core</a></li>
 <li>get_unary_kernel()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#afbb085188b563a54606d84f87a9bf5a6">mlx::core</a></li>
+<li>get_var()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">mlx::core::env</a></li>
 <li>gguf_load_quantized()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a65dd68163bdaef3631e3724327782498">mlx::core</a></li>
 <li>GGUFLoad&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aa5b0f7f13a941e1f41c411194e9033c7">mlx::core</a></li>
 <li>GGUFMetaData&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a8c2c1b9a37aadfb48f4c3a7e806e32e3">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_m.html b/docs/build/html/namespacemembers_m.html
index 299c717bb..3794a31ca 100644
--- a/docs/build/html/namespacemembers_m.html
+++ b/docs/build/html/namespacemembers_m.html
@@ -96,6 +96,7 @@ $(function(){ initResizable(false); });
 <li>matmul()&#160;:&#160;<a class="el" href="group__ops.html#ga753d59f5a9f5f2362865ee83b4dced2a">mlx::core</a></li>
 <li>max()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a747e2e58092a27fb8b4dd3d16934fb52">metal::fast</a>, <a class="el" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#a6a954a4e4e3753303d1dc734855a185f">metal::precise</a>, <a class="el" href="group__ops.html#ga7fed87d96cc7741d8267f4eac83f5fe7">mlx::core</a></li>
 <li>max3()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a6fc2cf18ffa8149561864c86dba0f803">metal::fast</a>, <a class="el" href="namespacemetal.html#a00f9c0ad66d969794614f56912eed9c9">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#ac490e8614ebd2c9343af1ae6c0d4e82c">metal::precise</a></li>
+<li>max_ops_per_buffer()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa">mlx::core::env</a></li>
 <li>max_threads&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail_1_1threading.html#a2d5c0729f0b66cf061918baea4337d70">pocketfft::detail::threading</a></li>
 <li>maximum()&#160;:&#160;<a class="el" href="group__ops.html#ga7ade2ea305e2e4219c3609443fb5db8d">mlx::core</a></li>
 <li>mean()&#160;:&#160;<a class="el" href="group__ops.html#gade46e768fd46b8b640eb16f26abeecef">mlx::core</a></li>
@@ -106,6 +107,7 @@ $(function(){ initResizable(false); });
 <li>min()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61">metal::fast</a>, <a class="el" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e">metal::precise</a>, <a class="el" href="group__ops.html#gab27599802617a4c8f9964ab5f4ffee12">mlx::core</a></li>
 <li>min3()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f">metal::fast</a>, <a class="el" href="namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f">metal</a>, <a class="el" href="namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231">metal::precise</a></li>
 <li>minimum()&#160;:&#160;<a class="el" href="group__ops.html#ga49ba00c090f81f331c91b0c97040bce0">mlx::core</a></li>
+<li>move_or_copy()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2">mlx::core</a></li>
 <li>moveaxis()&#160;:&#160;<a class="el" href="group__ops.html#ga24067d10a842db2c9d509ea48135a2c3">mlx::core</a></li>
 <li>MPINPLACE()&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail.html#af5eedf3cdfc83c0a30807092c39a9ce2">pocketfft::detail</a></li>
 <li>MTLFCList&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">mlx::core::metal</a></li>
diff --git a/docs/build/html/namespacemembers_s.html b/docs/build/html/namespacemembers_s.html
index d4f255a4f..8b5c4cc4e 100644
--- a/docs/build/html/namespacemembers_s.html
+++ b/docs/build/html/namespacemembers_s.html
@@ -106,7 +106,6 @@ $(function(){ initResizable(false); });
 <li>set_default_device()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a312a2de41367fe52caeaf8c0f596a120">mlx::core</a></li>
 <li>set_default_stream()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#af35a2b06517d8bb7dbb469692b4f841c">mlx::core</a></li>
 <li>set_memory_limit()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#a3fb2c4a237fa4bfdff798156146c4937">mlx::core::metal</a></li>
-<li>set_vector_bytes()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf">mlx::core</a></li>
 <li>set_wired_limit()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1metal.html#a31eab4828d31d292bc84e07b0d961e1e">mlx::core::metal</a></li>
 <li>shape_t&#160;:&#160;<a class="el" href="namespacepocketfft_1_1detail.html#a885ee37fcf564a268a5c8ca9ea8603e1">pocketfft::detail</a></li>
 <li>shapes_without_reduction_axes()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#a44c3ea6db6553c3f6552b9ba64a69494">mlx::core</a></li>
diff --git a/docs/build/html/namespacemembers_t.html b/docs/build/html/namespacemembers_t.html
index 299d396fd..fe0368c3b 100644
--- a/docs/build/html/namespacemembers_t.html
+++ b/docs/build/html/namespacemembers_t.html
@@ -116,7 +116,7 @@ $(function(){ initResizable(false); });
 <li>true_type&#160;:&#160;<a class="el" href="namespacemlx_1_1steel.html#a594a6ccb75b38b5ae4ddd0d9ad047b3a">mlx::steel</a></li>
 <li>trunc()&#160;:&#160;<a class="el" href="namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415">metal::fast</a>, <a class="el" href="namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27">metal::precise</a>, <a class="el" href="namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887">metal</a></li>
 <li>truncated_normal()&#160;:&#160;<a class="el" href="namespacemlx_1_1core_1_1random.html#a00aa5746bac6d729d2ba9465153bb279">mlx::core::random</a></li>
-<li>type_to_name()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae">mlx::core</a></li>
+<li>type_to_name()&#160;:&#160;<a class="el" href="namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164">mlx::core</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/namespacemetal.html b/docs/build/html/namespacemetal.html
index bfee13a68..484309721 100644
--- a/docs/build/html/namespacemetal.html
+++ b/docs/build/html/namespacemetal.html
@@ -132,150 +132,150 @@ Typedefs</h2></td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
+<tr class="memitem:a87c5122c60f9a12afceb9925a5b78ffb" id="r_a87c5122c60f9a12afceb9925a5b78ffb"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a87c5122c60f9a12afceb9925a5b78ffb">abs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a87c5122c60f9a12afceb9925a5b78ffb"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad4537748b3c832b6569ff7ccb209fcb2" id="r_ad4537748b3c832b6569ff7ccb209fcb2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad4537748b3c832b6569ff7ccb209fcb2">acos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:ad4537748b3c832b6569ff7ccb209fcb2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a2d0efb92b7f61eff342d776bd6c5f3a0" id="r_a2d0efb92b7f61eff342d776bd6c5f3a0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2d0efb92b7f61eff342d776bd6c5f3a0">acosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a2d0efb92b7f61eff342d776bd6c5f3a0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a16e843194df3fd136404bf80ba5ac95c" id="r_a16e843194df3fd136404bf80ba5ac95c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a16e843194df3fd136404bf80ba5ac95c">asin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a16e843194df3fd136404bf80ba5ac95c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abcc3251866930cfe880f89e7473d0e63" id="r_abcc3251866930cfe880f89e7473d0e63"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abcc3251866930cfe880f89e7473d0e63">asinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:abcc3251866930cfe880f89e7473d0e63"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a80a771553d9a0012b93620d19c48b00f" id="r_a80a771553d9a0012b93620d19c48b00f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a80a771553d9a0012b93620d19c48b00f">atan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
+<tr class="separator:a80a771553d9a0012b93620d19c48b00f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1d430793eaa38ccf0d07145e3fcd1e61" id="r_a1d430793eaa38ccf0d07145e3fcd1e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1d430793eaa38ccf0d07145e3fcd1e61">atan2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a1d430793eaa38ccf0d07145e3fcd1e61"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a57116427997ba71dd3863bfb15de33bf" id="r_a57116427997ba71dd3863bfb15de33bf"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a57116427997ba71dd3863bfb15de33bf">atanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a57116427997ba71dd3863bfb15de33bf"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad63204d38bc01df6ffc64583f7886b3c" id="r_ad63204d38bc01df6ffc64583f7886b3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad63204d38bc01df6ffc64583f7886b3c">ceil</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:ad63204d38bc01df6ffc64583f7886b3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a2fa4778a6fe2fa43253ea724e5a608a3" id="r_a2fa4778a6fe2fa43253ea724e5a608a3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2fa4778a6fe2fa43253ea724e5a608a3">cos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a2fa4778a6fe2fa43253ea724e5a608a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8a68a88cc110830d057dbd71431b93c0" id="r_a8a68a88cc110830d057dbd71431b93c0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8a68a88cc110830d057dbd71431b93c0">cosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a8a68a88cc110830d057dbd71431b93c0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a5c2f37939ad705ddea4409d3bedb8ce1" id="r_a5c2f37939ad705ddea4409d3bedb8ce1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5c2f37939ad705ddea4409d3bedb8ce1">cospi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a5c2f37939ad705ddea4409d3bedb8ce1"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a2aea493fc1a874970b77ed0031e965df" id="r_a2aea493fc1a874970b77ed0031e965df"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2aea493fc1a874970b77ed0031e965df">divide</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a2aea493fc1a874970b77ed0031e965df"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac2a0b3618d922ac014baac8189d44650" id="r_ac2a0b3618d922ac014baac8189d44650"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac2a0b3618d922ac014baac8189d44650">exp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:ac2a0b3618d922ac014baac8189d44650"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a4c63707d13c89364496a48906631c204" id="r_a4c63707d13c89364496a48906631c204"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4c63707d13c89364496a48906631c204">exp10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a4c63707d13c89364496a48906631c204"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a228201c20777848804a4d0589c1d33e7" id="r_a228201c20777848804a4d0589c1d33e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a228201c20777848804a4d0589c1d33e7">exp2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a228201c20777848804a4d0589c1d33e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a487eba718144be1325abcf66e109bb21" id="r_a487eba718144be1325abcf66e109bb21"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a487eba718144be1325abcf66e109bb21">fabs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a487eba718144be1325abcf66e109bb21"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a85a560794be56d8116889c1ee2d78761" id="r_a85a560794be56d8116889c1ee2d78761"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a85a560794be56d8116889c1ee2d78761">fdim</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a85a560794be56d8116889c1ee2d78761"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a020790f30c28a9982c4a83deaa258277" id="r_a020790f30c28a9982c4a83deaa258277"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a020790f30c28a9982c4a83deaa258277">floor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a020790f30c28a9982c4a83deaa258277"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6301a78d69ff14a06194ca85a0c7d326" id="r_a6301a78d69ff14a06194ca85a0c7d326"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6301a78d69ff14a06194ca85a0c7d326">fma</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="separator:a6301a78d69ff14a06194ca85a0c7d326"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0558e56fdb94b456deea6a4eb53964ed" id="r_a0558e56fdb94b456deea6a4eb53964ed"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0558e56fdb94b456deea6a4eb53964ed">fmax</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a0558e56fdb94b456deea6a4eb53964ed"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae0c1a7ba1a7449adc64d00b2a29e67f6" id="r_ae0c1a7ba1a7449adc64d00b2a29e67f6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae0c1a7ba1a7449adc64d00b2a29e67f6">fmax3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="separator:ae0c1a7ba1a7449adc64d00b2a29e67f6"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa35227450d943fb88cf43162aa9d8c49" id="r_aa35227450d943fb88cf43162aa9d8c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa35227450d943fb88cf43162aa9d8c49">fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="separator:aa35227450d943fb88cf43162aa9d8c49"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a66ac19825ea79b8294e243ae6d0b3d3c" id="r_a66ac19825ea79b8294e243ae6d0b3d3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a66ac19825ea79b8294e243ae6d0b3d3c">fmin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a66ac19825ea79b8294e243ae6d0b3d3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae2acd25f2241f00aaf89ff48f132a879" id="r_ae2acd25f2241f00aaf89ff48f132a879"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae2acd25f2241f00aaf89ff48f132a879">fmin3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="separator:ae2acd25f2241f00aaf89ff48f132a879"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a2ff952d4d596a7969b2a3035fc2fda58" id="r_a2ff952d4d596a7969b2a3035fc2fda58"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2ff952d4d596a7969b2a3035fc2fda58">fmod</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a2ff952d4d596a7969b2a3035fc2fda58"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6b1c15d251aeaacb1f4338a5e152ae78" id="r_a6b1c15d251aeaacb1f4338a5e152ae78"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6b1c15d251aeaacb1f4338a5e152ae78">fract</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a6b1c15d251aeaacb1f4338a5e152ae78"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac89d4ef524d21a301da6c37dbd95ff9f" id="r_ac89d4ef524d21a301da6c37dbd95ff9f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac89d4ef524d21a301da6c37dbd95ff9f">frexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="#ac2a0b3618d922ac014baac8189d44650">exp</a>)</td></tr>
+<tr class="separator:ac89d4ef524d21a301da6c37dbd95ff9f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3deed001738b6f03accd3c2195586c2b" id="r_a3deed001738b6f03accd3c2195586c2b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3deed001738b6f03accd3c2195586c2b">ldexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
+<tr class="separator:a3deed001738b6f03accd3c2195586c2b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a423a9f4f2fc7ef5ec7eda061277b51b6" id="r_a423a9f4f2fc7ef5ec7eda061277b51b6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a423a9f4f2fc7ef5ec7eda061277b51b6">log</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a423a9f4f2fc7ef5ec7eda061277b51b6"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a042b98827baa910e9d726227cec55a80" id="r_a042b98827baa910e9d726227cec55a80"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a042b98827baa910e9d726227cec55a80">log10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a042b98827baa910e9d726227cec55a80"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae894dd5fc13799f120b55cab6267c89c" id="r_ae894dd5fc13799f120b55cab6267c89c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae894dd5fc13799f120b55cab6267c89c">log2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:ae894dd5fc13799f120b55cab6267c89c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a853c80479ab2264d9c4587c7bcac767b" id="r_a853c80479ab2264d9c4587c7bcac767b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a853c80479ab2264d9c4587c7bcac767b">max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a853c80479ab2264d9c4587c7bcac767b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a00f9c0ad66d969794614f56912eed9c9" id="r_a00f9c0ad66d969794614f56912eed9c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a00f9c0ad66d969794614f56912eed9c9">max3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="separator:a00f9c0ad66d969794614f56912eed9c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa3ff49457ce3c93fc1c0897fd1525157" id="r_aa3ff49457ce3c93fc1c0897fd1525157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa3ff49457ce3c93fc1c0897fd1525157">median3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="separator:aa3ff49457ce3c93fc1c0897fd1525157"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6653b28c9473087141eddce39878d4d3" id="r_a6653b28c9473087141eddce39878d4d3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6653b28c9473087141eddce39878d4d3">min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a6653b28c9473087141eddce39878d4d3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a005510c8c0f964ce2b8aad3ba76a7a3f" id="r_a005510c8c0f964ce2b8aad3ba76a7a3f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a005510c8c0f964ce2b8aad3ba76a7a3f">min3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="separator:a005510c8c0f964ce2b8aad3ba76a7a3f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9547fd7b09164931986f6db4813bd72d" id="r_a9547fd7b09164931986f6db4813bd72d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9547fd7b09164931986f6db4813bd72d">nextafter</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:a9547fd7b09164931986f6db4813bd72d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:acd288d4552215bd10455584a214c57b8" id="r_acd288d4552215bd10455584a214c57b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acd288d4552215bd10455584a214c57b8">pow</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:acd288d4552215bd10455584a214c57b8"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae529e431f178bafedc18a889323c0bc2" id="r_ae529e431f178bafedc18a889323c0bc2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae529e431f178bafedc18a889323c0bc2">powr</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="separator:ae529e431f178bafedc18a889323c0bc2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a29ab6060527120eee745aec0daa06e01" id="r_a29ab6060527120eee745aec0daa06e01"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a29ab6060527120eee745aec0daa06e01">rint</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a29ab6060527120eee745aec0daa06e01"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a46c667e169ff9d51a9204a045305442f" id="r_a46c667e169ff9d51a9204a045305442f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a46c667e169ff9d51a9204a045305442f">round</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a46c667e169ff9d51a9204a045305442f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1cf4b605c0aa7ff5bfe5e979a16f5157" id="r_a1cf4b605c0aa7ff5bfe5e979a16f5157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1cf4b605c0aa7ff5bfe5e979a16f5157">rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a1cf4b605c0aa7ff5bfe5e979a16f5157"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a619a159ca5f2ddfe3647d3a6bb6e804c" id="r_a619a159ca5f2ddfe3647d3a6bb6e804c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a619a159ca5f2ddfe3647d3a6bb6e804c">sin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a619a159ca5f2ddfe3647d3a6bb6e804c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a83ba4235ae350ab8880a9df09158620b" id="r_a83ba4235ae350ab8880a9df09158620b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a83ba4235ae350ab8880a9df09158620b">sinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a83ba4235ae350ab8880a9df09158620b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae9655f7fa2ba6c0625ca25fbb278e269" id="r_ae9655f7fa2ba6c0625ca25fbb278e269"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae9655f7fa2ba6c0625ca25fbb278e269">sinpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:ae9655f7fa2ba6c0625ca25fbb278e269"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ab3f4d4852ca0e591104fbd8e5b50d31b" id="r_ab3f4d4852ca0e591104fbd8e5b50d31b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab3f4d4852ca0e591104fbd8e5b50d31b">sqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:ab3f4d4852ca0e591104fbd8e5b50d31b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a862215a8ddacb086296ba02567c9b158" id="r_a862215a8ddacb086296ba02567c9b158"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a862215a8ddacb086296ba02567c9b158">tan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a862215a8ddacb086296ba02567c9b158"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa97fc50bd6addfc6de0aae8570fe963d" id="r_aa97fc50bd6addfc6de0aae8570fe963d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa97fc50bd6addfc6de0aae8570fe963d">tanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:aa97fc50bd6addfc6de0aae8570fe963d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae2046d163a525fc1822a9ec8a0aeaeb3" id="r_ae2046d163a525fc1822a9ec8a0aeaeb3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae2046d163a525fc1822a9ec8a0aeaeb3">tanpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:ae2046d163a525fc1822a9ec8a0aeaeb3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a93cb75a11a362bfc8310ea19c554c887" id="r_a93cb75a11a362bfc8310ea19c554c887"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a93cb75a11a362bfc8310ea19c554c887">trunc</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="separator:a93cb75a11a362bfc8310ea19c554c887"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a498f1e85107eb5f01ba4435977f8efe0" id="r_a498f1e85107eb5f01ba4435977f8efe0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a498f1e85107eb5f01ba4435977f8efe0">simd_broadcast</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort broadcast_lane_id)</td></tr>
+<tr class="separator:a498f1e85107eb5f01ba4435977f8efe0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a259ed115bc3c58f88eb35830916b26d4" id="r_a259ed115bc3c58f88eb35830916b26d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort simd_lane_id)</td></tr>
+<tr class="separator:a259ed115bc3c58f88eb35830916b26d4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae29a06f0eac636ad7af21dea5b04938b" id="r_ae29a06f0eac636ad7af21dea5b04938b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae29a06f0eac636ad7af21dea5b04938b">simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
+<tr class="separator:ae29a06f0eac636ad7af21dea5b04938b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0ee6239fa29a5f9ee0201e0dc5ddc8e0" id="r_a0ee6239fa29a5f9ee0201e0dc5ddc8e0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0ee6239fa29a5f9ee0201e0dc5ddc8e0">simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
+<tr class="separator:a0ee6239fa29a5f9ee0201e0dc5ddc8e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1ca14116bf50639b214d8414b5bbaaa6" id="r_a1ca14116bf50639b214d8414b5bbaaa6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
+<tr class="separator:a1ca14116bf50639b214d8414b5bbaaa6"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a5138d5cdc18139e135707916a243cd8e" id="r_a5138d5cdc18139e135707916a243cd8e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5138d5cdc18139e135707916a243cd8e">simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
+<tr class="separator:a5138d5cdc18139e135707916a243cd8e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af6e2dd7ae087aba6abac4f0350b7611c" id="r_af6e2dd7ae087aba6abac4f0350b7611c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="separator:af6e2dd7ae087aba6abac4f0350b7611c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a4bb203647a421032db47e73cd649841b" id="r_a4bb203647a421032db47e73cd649841b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4bb203647a421032db47e73cd649841b">simd_shuffle_rotate_down</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="separator:a4bb203647a421032db47e73cd649841b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a729b22077d6c944491a6027c18ea80c9" id="r_a729b22077d6c944491a6027c18ea80c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a729b22077d6c944491a6027c18ea80c9">simd_shuffle_rotate_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="separator:a729b22077d6c944491a6027c18ea80c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:afe81c5fbde3f4890458b081909242c55" id="r_afe81c5fbde3f4890458b081909242c55"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
+<tr class="separator:afe81c5fbde3f4890458b081909242c55"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a5017efc9605e069cfb507137cd1a1852" id="r_a5017efc9605e069cfb507137cd1a1852"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5017efc9605e069cfb507137cd1a1852">simd_shuffle_xor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort mask)</td></tr>
+<tr class="separator:a5017efc9605e069cfb507137cd1a1852"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a048cad0aca52cb737ebf103e76bd1c49" id="r_a048cad0aca52cb737ebf103e76bd1c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:a048cad0aca52cb737ebf103e76bd1c49"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae9e2a23e00724ba2d7868bc4112b386b" id="r_ae9e2a23e00724ba2d7868bc4112b386b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae9e2a23e00724ba2d7868bc4112b386b">simd_min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:ae9e2a23e00724ba2d7868bc4112b386b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a5ca40242390b632f737e29636829b2e4" id="r_a5ca40242390b632f737e29636829b2e4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5ca40242390b632f737e29636829b2e4">simd_prefix_exclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:a5ca40242390b632f737e29636829b2e4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abfbb70c7471f28bf7ff36a612ad014b2" id="r_abfbb70c7471f28bf7ff36a612ad014b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abfbb70c7471f28bf7ff36a612ad014b2">simd_prefix_exclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:abfbb70c7471f28bf7ff36a612ad014b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6ca6a7e1996228fa536e969e9e45c446" id="r_a6ca6a7e1996228fa536e969e9e45c446"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6ca6a7e1996228fa536e969e9e45c446">simd_prefix_inclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:a6ca6a7e1996228fa536e969e9e45c446"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a567acb18199ac0107712eb8cb8aeb8e9" id="r_a567acb18199ac0107712eb8cb8aeb8e9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a567acb18199ac0107712eb8cb8aeb8e9">simd_prefix_inclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:a567acb18199ac0107712eb8cb8aeb8e9"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac6e883a04e2265a9790d7db76059e1b4" id="r_ac6e883a04e2265a9790d7db76059e1b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac6e883a04e2265a9790d7db76059e1b4">simd_product</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:ac6e883a04e2265a9790d7db76059e1b4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a85181e37a00cb4a4217f1bb25389bce5" id="r_a85181e37a00cb4a4217f1bb25389bce5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:a85181e37a00cb4a4217f1bb25389bce5"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1308decbf2d5c33d34d6be523ea1c30f" id="r_a1308decbf2d5c33d34d6be523ea1c30f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1308decbf2d5c33d34d6be523ea1c30f">simd_xor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
+<tr class="separator:a1308decbf2d5c33d34d6be523ea1c30f"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a83320ba983d90dd1fa5847b6940dc0bb" id="r_a83320ba983d90dd1fa5847b6940dc0bb"><td class="memItemLeft" align="right" valign="top">METAL_FUNC bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a83320ba983d90dd1fa5847b6940dc0bb">isnan</a> (<a class="el" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a> x)</td></tr>
 <tr class="separator:a83320ba983d90dd1fa5847b6940dc0bb"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a87c5122c60f9a12afceb9925a5b78ffb" id="r_a87c5122c60f9a12afceb9925a5b78ffb"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a87c5122c60f9a12afceb9925a5b78ffb">abs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a87c5122c60f9a12afceb9925a5b78ffb"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad4537748b3c832b6569ff7ccb209fcb2" id="r_ad4537748b3c832b6569ff7ccb209fcb2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad4537748b3c832b6569ff7ccb209fcb2">acos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:ad4537748b3c832b6569ff7ccb209fcb2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2d0efb92b7f61eff342d776bd6c5f3a0" id="r_a2d0efb92b7f61eff342d776bd6c5f3a0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2d0efb92b7f61eff342d776bd6c5f3a0">acosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a2d0efb92b7f61eff342d776bd6c5f3a0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a16e843194df3fd136404bf80ba5ac95c" id="r_a16e843194df3fd136404bf80ba5ac95c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a16e843194df3fd136404bf80ba5ac95c">asin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a16e843194df3fd136404bf80ba5ac95c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abcc3251866930cfe880f89e7473d0e63" id="r_abcc3251866930cfe880f89e7473d0e63"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abcc3251866930cfe880f89e7473d0e63">asinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:abcc3251866930cfe880f89e7473d0e63"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a80a771553d9a0012b93620d19c48b00f" id="r_a80a771553d9a0012b93620d19c48b00f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a80a771553d9a0012b93620d19c48b00f">atan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
-<tr class="separator:a80a771553d9a0012b93620d19c48b00f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1d430793eaa38ccf0d07145e3fcd1e61" id="r_a1d430793eaa38ccf0d07145e3fcd1e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1d430793eaa38ccf0d07145e3fcd1e61">atan2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a1d430793eaa38ccf0d07145e3fcd1e61"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a57116427997ba71dd3863bfb15de33bf" id="r_a57116427997ba71dd3863bfb15de33bf"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a57116427997ba71dd3863bfb15de33bf">atanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a57116427997ba71dd3863bfb15de33bf"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad63204d38bc01df6ffc64583f7886b3c" id="r_ad63204d38bc01df6ffc64583f7886b3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad63204d38bc01df6ffc64583f7886b3c">ceil</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:ad63204d38bc01df6ffc64583f7886b3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2fa4778a6fe2fa43253ea724e5a608a3" id="r_a2fa4778a6fe2fa43253ea724e5a608a3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2fa4778a6fe2fa43253ea724e5a608a3">cos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a2fa4778a6fe2fa43253ea724e5a608a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8a68a88cc110830d057dbd71431b93c0" id="r_a8a68a88cc110830d057dbd71431b93c0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8a68a88cc110830d057dbd71431b93c0">cosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a8a68a88cc110830d057dbd71431b93c0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5c2f37939ad705ddea4409d3bedb8ce1" id="r_a5c2f37939ad705ddea4409d3bedb8ce1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5c2f37939ad705ddea4409d3bedb8ce1">cospi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a5c2f37939ad705ddea4409d3bedb8ce1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2aea493fc1a874970b77ed0031e965df" id="r_a2aea493fc1a874970b77ed0031e965df"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2aea493fc1a874970b77ed0031e965df">divide</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a2aea493fc1a874970b77ed0031e965df"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac2a0b3618d922ac014baac8189d44650" id="r_ac2a0b3618d922ac014baac8189d44650"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac2a0b3618d922ac014baac8189d44650">exp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:ac2a0b3618d922ac014baac8189d44650"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4c63707d13c89364496a48906631c204" id="r_a4c63707d13c89364496a48906631c204"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4c63707d13c89364496a48906631c204">exp10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a4c63707d13c89364496a48906631c204"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a228201c20777848804a4d0589c1d33e7" id="r_a228201c20777848804a4d0589c1d33e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a228201c20777848804a4d0589c1d33e7">exp2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a228201c20777848804a4d0589c1d33e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a487eba718144be1325abcf66e109bb21" id="r_a487eba718144be1325abcf66e109bb21"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a487eba718144be1325abcf66e109bb21">fabs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a487eba718144be1325abcf66e109bb21"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a85a560794be56d8116889c1ee2d78761" id="r_a85a560794be56d8116889c1ee2d78761"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a85a560794be56d8116889c1ee2d78761">fdim</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a85a560794be56d8116889c1ee2d78761"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a020790f30c28a9982c4a83deaa258277" id="r_a020790f30c28a9982c4a83deaa258277"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a020790f30c28a9982c4a83deaa258277">floor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a020790f30c28a9982c4a83deaa258277"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6301a78d69ff14a06194ca85a0c7d326" id="r_a6301a78d69ff14a06194ca85a0c7d326"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6301a78d69ff14a06194ca85a0c7d326">fma</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
-<tr class="separator:a6301a78d69ff14a06194ca85a0c7d326"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0558e56fdb94b456deea6a4eb53964ed" id="r_a0558e56fdb94b456deea6a4eb53964ed"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0558e56fdb94b456deea6a4eb53964ed">fmax</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a0558e56fdb94b456deea6a4eb53964ed"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae0c1a7ba1a7449adc64d00b2a29e67f6" id="r_ae0c1a7ba1a7449adc64d00b2a29e67f6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae0c1a7ba1a7449adc64d00b2a29e67f6">fmax3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
-<tr class="separator:ae0c1a7ba1a7449adc64d00b2a29e67f6"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa35227450d943fb88cf43162aa9d8c49" id="r_aa35227450d943fb88cf43162aa9d8c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa35227450d943fb88cf43162aa9d8c49">fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
-<tr class="separator:aa35227450d943fb88cf43162aa9d8c49"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a66ac19825ea79b8294e243ae6d0b3d3c" id="r_a66ac19825ea79b8294e243ae6d0b3d3c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a66ac19825ea79b8294e243ae6d0b3d3c">fmin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a66ac19825ea79b8294e243ae6d0b3d3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae2acd25f2241f00aaf89ff48f132a879" id="r_ae2acd25f2241f00aaf89ff48f132a879"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae2acd25f2241f00aaf89ff48f132a879">fmin3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
-<tr class="separator:ae2acd25f2241f00aaf89ff48f132a879"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2ff952d4d596a7969b2a3035fc2fda58" id="r_a2ff952d4d596a7969b2a3035fc2fda58"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2ff952d4d596a7969b2a3035fc2fda58">fmod</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a2ff952d4d596a7969b2a3035fc2fda58"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6b1c15d251aeaacb1f4338a5e152ae78" id="r_a6b1c15d251aeaacb1f4338a5e152ae78"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6b1c15d251aeaacb1f4338a5e152ae78">fract</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a6b1c15d251aeaacb1f4338a5e152ae78"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac89d4ef524d21a301da6c37dbd95ff9f" id="r_ac89d4ef524d21a301da6c37dbd95ff9f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac89d4ef524d21a301da6c37dbd95ff9f">frexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="#ac2a0b3618d922ac014baac8189d44650">exp</a>)</td></tr>
-<tr class="separator:ac89d4ef524d21a301da6c37dbd95ff9f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3deed001738b6f03accd3c2195586c2b" id="r_a3deed001738b6f03accd3c2195586c2b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3deed001738b6f03accd3c2195586c2b">ldexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
-<tr class="separator:a3deed001738b6f03accd3c2195586c2b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a423a9f4f2fc7ef5ec7eda061277b51b6" id="r_a423a9f4f2fc7ef5ec7eda061277b51b6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a423a9f4f2fc7ef5ec7eda061277b51b6">log</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a423a9f4f2fc7ef5ec7eda061277b51b6"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a042b98827baa910e9d726227cec55a80" id="r_a042b98827baa910e9d726227cec55a80"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a042b98827baa910e9d726227cec55a80">log10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a042b98827baa910e9d726227cec55a80"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae894dd5fc13799f120b55cab6267c89c" id="r_ae894dd5fc13799f120b55cab6267c89c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae894dd5fc13799f120b55cab6267c89c">log2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:ae894dd5fc13799f120b55cab6267c89c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a853c80479ab2264d9c4587c7bcac767b" id="r_a853c80479ab2264d9c4587c7bcac767b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a853c80479ab2264d9c4587c7bcac767b">max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a853c80479ab2264d9c4587c7bcac767b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a00f9c0ad66d969794614f56912eed9c9" id="r_a00f9c0ad66d969794614f56912eed9c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a00f9c0ad66d969794614f56912eed9c9">max3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
-<tr class="separator:a00f9c0ad66d969794614f56912eed9c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa3ff49457ce3c93fc1c0897fd1525157" id="r_aa3ff49457ce3c93fc1c0897fd1525157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa3ff49457ce3c93fc1c0897fd1525157">median3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
-<tr class="separator:aa3ff49457ce3c93fc1c0897fd1525157"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6653b28c9473087141eddce39878d4d3" id="r_a6653b28c9473087141eddce39878d4d3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6653b28c9473087141eddce39878d4d3">min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a6653b28c9473087141eddce39878d4d3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a005510c8c0f964ce2b8aad3ba76a7a3f" id="r_a005510c8c0f964ce2b8aad3ba76a7a3f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a005510c8c0f964ce2b8aad3ba76a7a3f">min3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
-<tr class="separator:a005510c8c0f964ce2b8aad3ba76a7a3f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a9547fd7b09164931986f6db4813bd72d" id="r_a9547fd7b09164931986f6db4813bd72d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9547fd7b09164931986f6db4813bd72d">nextafter</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:a9547fd7b09164931986f6db4813bd72d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:acd288d4552215bd10455584a214c57b8" id="r_acd288d4552215bd10455584a214c57b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acd288d4552215bd10455584a214c57b8">pow</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:acd288d4552215bd10455584a214c57b8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae529e431f178bafedc18a889323c0bc2" id="r_ae529e431f178bafedc18a889323c0bc2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae529e431f178bafedc18a889323c0bc2">powr</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
-<tr class="separator:ae529e431f178bafedc18a889323c0bc2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a29ab6060527120eee745aec0daa06e01" id="r_a29ab6060527120eee745aec0daa06e01"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a29ab6060527120eee745aec0daa06e01">rint</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a29ab6060527120eee745aec0daa06e01"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a46c667e169ff9d51a9204a045305442f" id="r_a46c667e169ff9d51a9204a045305442f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a46c667e169ff9d51a9204a045305442f">round</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a46c667e169ff9d51a9204a045305442f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1cf4b605c0aa7ff5bfe5e979a16f5157" id="r_a1cf4b605c0aa7ff5bfe5e979a16f5157"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1cf4b605c0aa7ff5bfe5e979a16f5157">rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a1cf4b605c0aa7ff5bfe5e979a16f5157"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a619a159ca5f2ddfe3647d3a6bb6e804c" id="r_a619a159ca5f2ddfe3647d3a6bb6e804c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a619a159ca5f2ddfe3647d3a6bb6e804c">sin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a619a159ca5f2ddfe3647d3a6bb6e804c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a83ba4235ae350ab8880a9df09158620b" id="r_a83ba4235ae350ab8880a9df09158620b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a83ba4235ae350ab8880a9df09158620b">sinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a83ba4235ae350ab8880a9df09158620b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae9655f7fa2ba6c0625ca25fbb278e269" id="r_ae9655f7fa2ba6c0625ca25fbb278e269"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae9655f7fa2ba6c0625ca25fbb278e269">sinpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:ae9655f7fa2ba6c0625ca25fbb278e269"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab3f4d4852ca0e591104fbd8e5b50d31b" id="r_ab3f4d4852ca0e591104fbd8e5b50d31b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab3f4d4852ca0e591104fbd8e5b50d31b">sqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:ab3f4d4852ca0e591104fbd8e5b50d31b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a862215a8ddacb086296ba02567c9b158" id="r_a862215a8ddacb086296ba02567c9b158"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a862215a8ddacb086296ba02567c9b158">tan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a862215a8ddacb086296ba02567c9b158"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa97fc50bd6addfc6de0aae8570fe963d" id="r_aa97fc50bd6addfc6de0aae8570fe963d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa97fc50bd6addfc6de0aae8570fe963d">tanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:aa97fc50bd6addfc6de0aae8570fe963d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae2046d163a525fc1822a9ec8a0aeaeb3" id="r_ae2046d163a525fc1822a9ec8a0aeaeb3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae2046d163a525fc1822a9ec8a0aeaeb3">tanpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:ae2046d163a525fc1822a9ec8a0aeaeb3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a93cb75a11a362bfc8310ea19c554c887" id="r_a93cb75a11a362bfc8310ea19c554c887"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a93cb75a11a362bfc8310ea19c554c887">trunc</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
-<tr class="separator:a93cb75a11a362bfc8310ea19c554c887"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a498f1e85107eb5f01ba4435977f8efe0" id="r_a498f1e85107eb5f01ba4435977f8efe0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a498f1e85107eb5f01ba4435977f8efe0">simd_broadcast</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort broadcast_lane_id)</td></tr>
-<tr class="separator:a498f1e85107eb5f01ba4435977f8efe0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a259ed115bc3c58f88eb35830916b26d4" id="r_a259ed115bc3c58f88eb35830916b26d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a259ed115bc3c58f88eb35830916b26d4">simd_shuffle</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort simd_lane_id)</td></tr>
-<tr class="separator:a259ed115bc3c58f88eb35830916b26d4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae29a06f0eac636ad7af21dea5b04938b" id="r_ae29a06f0eac636ad7af21dea5b04938b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae29a06f0eac636ad7af21dea5b04938b">simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
-<tr class="separator:ae29a06f0eac636ad7af21dea5b04938b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0ee6239fa29a5f9ee0201e0dc5ddc8e0" id="r_a0ee6239fa29a5f9ee0201e0dc5ddc8e0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0ee6239fa29a5f9ee0201e0dc5ddc8e0">simd_shuffle_and_fill_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
-<tr class="separator:a0ee6239fa29a5f9ee0201e0dc5ddc8e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1ca14116bf50639b214d8414b5bbaaa6" id="r_a1ca14116bf50639b214d8414b5bbaaa6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1ca14116bf50639b214d8414b5bbaaa6">simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta, ushort modulo)</td></tr>
-<tr class="separator:a1ca14116bf50639b214d8414b5bbaaa6"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5138d5cdc18139e135707916a243cd8e" id="r_a5138d5cdc18139e135707916a243cd8e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5138d5cdc18139e135707916a243cd8e">simd_shuffle_and_fill_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> filling_data, ushort delta)</td></tr>
-<tr class="separator:a5138d5cdc18139e135707916a243cd8e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af6e2dd7ae087aba6abac4f0350b7611c" id="r_af6e2dd7ae087aba6abac4f0350b7611c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
-<tr class="separator:af6e2dd7ae087aba6abac4f0350b7611c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4bb203647a421032db47e73cd649841b" id="r_a4bb203647a421032db47e73cd649841b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4bb203647a421032db47e73cd649841b">simd_shuffle_rotate_down</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
-<tr class="separator:a4bb203647a421032db47e73cd649841b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a729b22077d6c944491a6027c18ea80c9" id="r_a729b22077d6c944491a6027c18ea80c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a729b22077d6c944491a6027c18ea80c9">simd_shuffle_rotate_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
-<tr class="separator:a729b22077d6c944491a6027c18ea80c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afe81c5fbde3f4890458b081909242c55" id="r_afe81c5fbde3f4890458b081909242c55"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afe81c5fbde3f4890458b081909242c55">simd_shuffle_up</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort delta)</td></tr>
-<tr class="separator:afe81c5fbde3f4890458b081909242c55"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5017efc9605e069cfb507137cd1a1852" id="r_a5017efc9605e069cfb507137cd1a1852"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5017efc9605e069cfb507137cd1a1852">simd_shuffle_xor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data, ushort mask)</td></tr>
-<tr class="separator:a5017efc9605e069cfb507137cd1a1852"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a048cad0aca52cb737ebf103e76bd1c49" id="r_a048cad0aca52cb737ebf103e76bd1c49"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:a048cad0aca52cb737ebf103e76bd1c49"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae9e2a23e00724ba2d7868bc4112b386b" id="r_ae9e2a23e00724ba2d7868bc4112b386b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae9e2a23e00724ba2d7868bc4112b386b">simd_min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:ae9e2a23e00724ba2d7868bc4112b386b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5ca40242390b632f737e29636829b2e4" id="r_a5ca40242390b632f737e29636829b2e4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5ca40242390b632f737e29636829b2e4">simd_prefix_exclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:a5ca40242390b632f737e29636829b2e4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abfbb70c7471f28bf7ff36a612ad014b2" id="r_abfbb70c7471f28bf7ff36a612ad014b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abfbb70c7471f28bf7ff36a612ad014b2">simd_prefix_exclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:abfbb70c7471f28bf7ff36a612ad014b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6ca6a7e1996228fa536e969e9e45c446" id="r_a6ca6a7e1996228fa536e969e9e45c446"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6ca6a7e1996228fa536e969e9e45c446">simd_prefix_inclusive_product</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:a6ca6a7e1996228fa536e969e9e45c446"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a567acb18199ac0107712eb8cb8aeb8e9" id="r_a567acb18199ac0107712eb8cb8aeb8e9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a567acb18199ac0107712eb8cb8aeb8e9">simd_prefix_inclusive_sum</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:a567acb18199ac0107712eb8cb8aeb8e9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac6e883a04e2265a9790d7db76059e1b4" id="r_ac6e883a04e2265a9790d7db76059e1b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac6e883a04e2265a9790d7db76059e1b4">simd_product</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:ac6e883a04e2265a9790d7db76059e1b4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a85181e37a00cb4a4217f1bb25389bce5" id="r_a85181e37a00cb4a4217f1bb25389bce5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:a85181e37a00cb4a4217f1bb25389bce5"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1308decbf2d5c33d34d6be523ea1c30f" id="r_a1308decbf2d5c33d34d6be523ea1c30f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1308decbf2d5c33d34d6be523ea1c30f">simd_xor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> data)</td></tr>
-<tr class="separator:a1308decbf2d5c33d34d6be523ea1c30f"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Typedef Documentation</h2>
 <a id="ac82ee6c3fbe9ec5c78c07329424aaec9" name="ac82ee6c3fbe9ec5c78c07329424aaec9"></a>
@@ -318,9 +318,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::abs </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::abs </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -335,9 +335,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::acos </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::acos </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -352,9 +352,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::acosh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::acosh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -369,9 +369,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::asin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::asin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -386,9 +386,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::asinh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::asinh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -403,9 +403,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::atan </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::atan </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y_over_x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y_over_x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -420,14 +420,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::atan2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::atan2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -441,9 +441,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::atanh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::atanh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -458,9 +458,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::ceil </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::ceil </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -475,9 +475,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::cos </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::cos </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -492,9 +492,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::cosh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::cosh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -509,9 +509,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::cospi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::cospi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -526,14 +526,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::divide </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::divide </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -547,9 +547,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::exp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::exp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -564,9 +564,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::exp10 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::exp10 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -581,9 +581,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::exp2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::exp2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -598,9 +598,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fabs </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fabs </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -615,14 +615,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fdim </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fdim </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -636,9 +636,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::floor </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::floor </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -653,19 +653,19 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fma </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fma </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -679,14 +679,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmax </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmax </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -700,19 +700,19 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmax3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmax3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -726,19 +726,19 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmedian3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmedian3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -752,14 +752,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -773,19 +773,19 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmin3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmin3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -799,14 +799,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmod </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fmod </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -820,9 +820,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fract </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fract </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -837,9 +837,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::frexp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::frexp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -875,9 +875,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::ldexp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::ldexp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -896,9 +896,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::log </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::log </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -913,9 +913,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::log10 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::log10 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -930,9 +930,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::log2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::log2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -947,14 +947,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::max </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::max </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -968,19 +968,19 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::max3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::max3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -994,19 +994,19 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::median3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::median3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1020,14 +1020,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::min </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::min </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1041,19 +1041,19 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::min3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::min3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1067,14 +1067,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::nextafter </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::nextafter </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1088,14 +1088,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::pow </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::pow </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1109,14 +1109,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::powr </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::powr </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1130,9 +1130,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::rint </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::rint </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1147,9 +1147,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::round </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::round </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1164,9 +1164,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::rsqrt </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::rsqrt </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1181,9 +1181,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_broadcast </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_broadcast </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1202,9 +1202,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_max </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_max </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1219,9 +1219,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_min </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_min </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1236,9 +1236,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_exclusive_product </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_exclusive_product </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1253,9 +1253,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_exclusive_sum </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_exclusive_sum </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1270,9 +1270,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_inclusive_product </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_inclusive_product </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1287,9 +1287,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_inclusive_sum </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_prefix_inclusive_sum </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1304,9 +1304,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_product </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_product </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1321,9 +1321,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1342,14 +1342,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_down </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_down </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1368,14 +1368,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_down </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_down </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1399,14 +1399,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_up </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_up </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1425,14 +1425,14 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_up </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_and_fill_up </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>filling_data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1456,9 +1456,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_down </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_down </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1477,9 +1477,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_rotate_down </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_rotate_down </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1498,9 +1498,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_rotate_up </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_rotate_up </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1519,9 +1519,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_up </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_up </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1540,9 +1540,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_xor </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_shuffle_xor </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -1561,9 +1561,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_sum </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_sum </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1578,9 +1578,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_xor </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::simd_xor </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>data</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1595,9 +1595,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1612,9 +1612,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sinh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sinh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1629,9 +1629,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sinpi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sinpi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1646,9 +1646,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sqrt </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::sqrt </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1663,9 +1663,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::tan </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::tan </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1680,9 +1680,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::tanh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::tanh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1697,9 +1697,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::tanpi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::tanpi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1714,9 +1714,9 @@ template&lt;typename... Ts&gt; </div>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::trunc </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::trunc </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
diff --git a/docs/build/html/namespacemetal_1_1fast.html b/docs/build/html/namespacemetal_1_1fast.html
index b5126ec13..6c6e053cb 100644
--- a/docs/build/html/namespacemetal_1_1fast.html
+++ b/docs/build/html/namespacemetal_1_1fast.html
@@ -96,107 +96,107 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a90d2973f71f83180e7f02e38d11c7a8f" id="r_a90d2973f71f83180e7f02e38d11c7a8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a90d2973f71f83180e7f02e38d11c7a8f">abs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a90d2973f71f83180e7f02e38d11c7a8f" id="r_a90d2973f71f83180e7f02e38d11c7a8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a90d2973f71f83180e7f02e38d11c7a8f">abs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a90d2973f71f83180e7f02e38d11c7a8f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a805ce5c3a94b618b7349d70bbb82f0b2" id="r_a805ce5c3a94b618b7349d70bbb82f0b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a805ce5c3a94b618b7349d70bbb82f0b2">acos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a805ce5c3a94b618b7349d70bbb82f0b2" id="r_a805ce5c3a94b618b7349d70bbb82f0b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a805ce5c3a94b618b7349d70bbb82f0b2">acos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a805ce5c3a94b618b7349d70bbb82f0b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afb656fc3406649a238b6f1e0509de751" id="r_afb656fc3406649a238b6f1e0509de751"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afb656fc3406649a238b6f1e0509de751">acosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:afb656fc3406649a238b6f1e0509de751" id="r_afb656fc3406649a238b6f1e0509de751"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afb656fc3406649a238b6f1e0509de751">acosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:afb656fc3406649a238b6f1e0509de751"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a769455a283da99654b6e42c3acf13eb1" id="r_a769455a283da99654b6e42c3acf13eb1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a769455a283da99654b6e42c3acf13eb1">asin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a769455a283da99654b6e42c3acf13eb1" id="r_a769455a283da99654b6e42c3acf13eb1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a769455a283da99654b6e42c3acf13eb1">asin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a769455a283da99654b6e42c3acf13eb1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4367034b7b3e14310803bb2be975a556" id="r_a4367034b7b3e14310803bb2be975a556"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4367034b7b3e14310803bb2be975a556">asinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a4367034b7b3e14310803bb2be975a556" id="r_a4367034b7b3e14310803bb2be975a556"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4367034b7b3e14310803bb2be975a556">asinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a4367034b7b3e14310803bb2be975a556"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a769503b4b7f89071d0983258c5a3ac5a" id="r_a769503b4b7f89071d0983258c5a3ac5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a769503b4b7f89071d0983258c5a3ac5a">atan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
+<tr class="memitem:a769503b4b7f89071d0983258c5a3ac5a" id="r_a769503b4b7f89071d0983258c5a3ac5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a769503b4b7f89071d0983258c5a3ac5a">atan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
 <tr class="separator:a769503b4b7f89071d0983258c5a3ac5a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a00e687ea46f5affe26e6aef8fd62b89a" id="r_a00e687ea46f5affe26e6aef8fd62b89a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a00e687ea46f5affe26e6aef8fd62b89a">atan2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a00e687ea46f5affe26e6aef8fd62b89a" id="r_a00e687ea46f5affe26e6aef8fd62b89a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a00e687ea46f5affe26e6aef8fd62b89a">atan2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a00e687ea46f5affe26e6aef8fd62b89a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af24608fc605db9a14427d37c36dc1c53" id="r_af24608fc605db9a14427d37c36dc1c53"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af24608fc605db9a14427d37c36dc1c53">atanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:af24608fc605db9a14427d37c36dc1c53" id="r_af24608fc605db9a14427d37c36dc1c53"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af24608fc605db9a14427d37c36dc1c53">atanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:af24608fc605db9a14427d37c36dc1c53"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a97b0bbd79f1f45d9d3104d712914e6b8" id="r_a97b0bbd79f1f45d9d3104d712914e6b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a97b0bbd79f1f45d9d3104d712914e6b8">ceil</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a97b0bbd79f1f45d9d3104d712914e6b8" id="r_a97b0bbd79f1f45d9d3104d712914e6b8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a97b0bbd79f1f45d9d3104d712914e6b8">ceil</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a97b0bbd79f1f45d9d3104d712914e6b8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a75b6bb32fa3870eda46a7bfc9f481f88" id="r_a75b6bb32fa3870eda46a7bfc9f481f88"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a75b6bb32fa3870eda46a7bfc9f481f88">cos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a75b6bb32fa3870eda46a7bfc9f481f88" id="r_a75b6bb32fa3870eda46a7bfc9f481f88"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a75b6bb32fa3870eda46a7bfc9f481f88">cos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a75b6bb32fa3870eda46a7bfc9f481f88"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a31544ad9de28012a4ddda86e3966a77e" id="r_a31544ad9de28012a4ddda86e3966a77e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a31544ad9de28012a4ddda86e3966a77e">cosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a31544ad9de28012a4ddda86e3966a77e" id="r_a31544ad9de28012a4ddda86e3966a77e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a31544ad9de28012a4ddda86e3966a77e">cosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a31544ad9de28012a4ddda86e3966a77e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a9906b41f75319b384ffb570cc94d67ce" id="r_a9906b41f75319b384ffb570cc94d67ce"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9906b41f75319b384ffb570cc94d67ce">cospi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a9906b41f75319b384ffb570cc94d67ce" id="r_a9906b41f75319b384ffb570cc94d67ce"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9906b41f75319b384ffb570cc94d67ce">cospi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a9906b41f75319b384ffb570cc94d67ce"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae70bc2185e4649369cf7b15f5e1d48be" id="r_ae70bc2185e4649369cf7b15f5e1d48be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae70bc2185e4649369cf7b15f5e1d48be">divide</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ae70bc2185e4649369cf7b15f5e1d48be" id="r_ae70bc2185e4649369cf7b15f5e1d48be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae70bc2185e4649369cf7b15f5e1d48be">divide</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ae70bc2185e4649369cf7b15f5e1d48be"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad3dbd387b63373c29e3449609f763ede" id="r_ad3dbd387b63373c29e3449609f763ede"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad3dbd387b63373c29e3449609f763ede">exp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ad3dbd387b63373c29e3449609f763ede" id="r_ad3dbd387b63373c29e3449609f763ede"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad3dbd387b63373c29e3449609f763ede">exp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ad3dbd387b63373c29e3449609f763ede"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a453122f982485cbb4e471b3ac282ee5e" id="r_a453122f982485cbb4e471b3ac282ee5e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a453122f982485cbb4e471b3ac282ee5e">exp10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a453122f982485cbb4e471b3ac282ee5e" id="r_a453122f982485cbb4e471b3ac282ee5e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a453122f982485cbb4e471b3ac282ee5e">exp10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a453122f982485cbb4e471b3ac282ee5e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac092b65a46720adaf22f6266671d2d71" id="r_ac092b65a46720adaf22f6266671d2d71"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac092b65a46720adaf22f6266671d2d71">exp2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ac092b65a46720adaf22f6266671d2d71" id="r_ac092b65a46720adaf22f6266671d2d71"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac092b65a46720adaf22f6266671d2d71">exp2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ac092b65a46720adaf22f6266671d2d71"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a129fbd68c9df1a437e8959a25187f554" id="r_a129fbd68c9df1a437e8959a25187f554"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a129fbd68c9df1a437e8959a25187f554">fabs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a129fbd68c9df1a437e8959a25187f554" id="r_a129fbd68c9df1a437e8959a25187f554"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a129fbd68c9df1a437e8959a25187f554">fabs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a129fbd68c9df1a437e8959a25187f554"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a667df76100d5ea0ce5860ddae3e5a00b" id="r_a667df76100d5ea0ce5860ddae3e5a00b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a667df76100d5ea0ce5860ddae3e5a00b">fdim</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a667df76100d5ea0ce5860ddae3e5a00b" id="r_a667df76100d5ea0ce5860ddae3e5a00b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a667df76100d5ea0ce5860ddae3e5a00b">fdim</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a667df76100d5ea0ce5860ddae3e5a00b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac012ce1701c2339914f15cce9f2c632f" id="r_ac012ce1701c2339914f15cce9f2c632f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac012ce1701c2339914f15cce9f2c632f">floor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ac012ce1701c2339914f15cce9f2c632f" id="r_ac012ce1701c2339914f15cce9f2c632f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac012ce1701c2339914f15cce9f2c632f">floor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ac012ce1701c2339914f15cce9f2c632f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aebcd6e951da6f7157ec219eb7a8f1ddd" id="r_aebcd6e951da6f7157ec219eb7a8f1ddd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aebcd6e951da6f7157ec219eb7a8f1ddd">fma</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:aebcd6e951da6f7157ec219eb7a8f1ddd" id="r_aebcd6e951da6f7157ec219eb7a8f1ddd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aebcd6e951da6f7157ec219eb7a8f1ddd">fma</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:aebcd6e951da6f7157ec219eb7a8f1ddd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a26e3257cf877154f8a0d434be0bdb034" id="r_a26e3257cf877154f8a0d434be0bdb034"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a26e3257cf877154f8a0d434be0bdb034">fmax</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a26e3257cf877154f8a0d434be0bdb034" id="r_a26e3257cf877154f8a0d434be0bdb034"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a26e3257cf877154f8a0d434be0bdb034">fmax</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a26e3257cf877154f8a0d434be0bdb034"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5c6a3a389f348e1f92e8392b765a32c7" id="r_a5c6a3a389f348e1f92e8392b765a32c7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5c6a3a389f348e1f92e8392b765a32c7">fmax3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a5c6a3a389f348e1f92e8392b765a32c7" id="r_a5c6a3a389f348e1f92e8392b765a32c7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5c6a3a389f348e1f92e8392b765a32c7">fmax3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a5c6a3a389f348e1f92e8392b765a32c7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a923869181c3f576f2d86fba5bfa85633" id="r_a923869181c3f576f2d86fba5bfa85633"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a923869181c3f576f2d86fba5bfa85633">fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a923869181c3f576f2d86fba5bfa85633" id="r_a923869181c3f576f2d86fba5bfa85633"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a923869181c3f576f2d86fba5bfa85633">fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a923869181c3f576f2d86fba5bfa85633"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7e202ec52bf12bfabdf2265b300acbfa" id="r_a7e202ec52bf12bfabdf2265b300acbfa"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7e202ec52bf12bfabdf2265b300acbfa">fmin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a7e202ec52bf12bfabdf2265b300acbfa" id="r_a7e202ec52bf12bfabdf2265b300acbfa"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7e202ec52bf12bfabdf2265b300acbfa">fmin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a7e202ec52bf12bfabdf2265b300acbfa"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a9531c6a4a520927523961e6eb6b94c1a" id="r_a9531c6a4a520927523961e6eb6b94c1a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9531c6a4a520927523961e6eb6b94c1a">fmin3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a9531c6a4a520927523961e6eb6b94c1a" id="r_a9531c6a4a520927523961e6eb6b94c1a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9531c6a4a520927523961e6eb6b94c1a">fmin3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a9531c6a4a520927523961e6eb6b94c1a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adbec09f18a89f773d7e368ef04a69526" id="r_adbec09f18a89f773d7e368ef04a69526"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adbec09f18a89f773d7e368ef04a69526">fmod</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:adbec09f18a89f773d7e368ef04a69526" id="r_adbec09f18a89f773d7e368ef04a69526"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adbec09f18a89f773d7e368ef04a69526">fmod</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:adbec09f18a89f773d7e368ef04a69526"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa8bb448827503e485eb649eb3edb2d4c" id="r_aa8bb448827503e485eb649eb3edb2d4c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa8bb448827503e485eb649eb3edb2d4c">fract</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa8bb448827503e485eb649eb3edb2d4c" id="r_aa8bb448827503e485eb649eb3edb2d4c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa8bb448827503e485eb649eb3edb2d4c">fract</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa8bb448827503e485eb649eb3edb2d4c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a23902df22aeaa859ef673a36381387c2" id="r_a23902df22aeaa859ef673a36381387c2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a23902df22aeaa859ef673a36381387c2">frexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="#ad3dbd387b63373c29e3449609f763ede">exp</a>)</td></tr>
+<tr class="memitem:a23902df22aeaa859ef673a36381387c2" id="r_a23902df22aeaa859ef673a36381387c2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a23902df22aeaa859ef673a36381387c2">frexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="#ad3dbd387b63373c29e3449609f763ede">exp</a>)</td></tr>
 <tr class="separator:a23902df22aeaa859ef673a36381387c2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adb045765987e76c7ad4b511fab0c867e" id="r_adb045765987e76c7ad4b511fab0c867e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adb045765987e76c7ad4b511fab0c867e">ldexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
+<tr class="memitem:adb045765987e76c7ad4b511fab0c867e" id="r_adb045765987e76c7ad4b511fab0c867e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adb045765987e76c7ad4b511fab0c867e">ldexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
 <tr class="separator:adb045765987e76c7ad4b511fab0c867e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aef942e7f9e5c2e58c58644ab1bdd58d1" id="r_aef942e7f9e5c2e58c58644ab1bdd58d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aef942e7f9e5c2e58c58644ab1bdd58d1">log</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aef942e7f9e5c2e58c58644ab1bdd58d1" id="r_aef942e7f9e5c2e58c58644ab1bdd58d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aef942e7f9e5c2e58c58644ab1bdd58d1">log</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aef942e7f9e5c2e58c58644ab1bdd58d1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0d1150cf2deee5100a7ea2988b3bb39e" id="r_a0d1150cf2deee5100a7ea2988b3bb39e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0d1150cf2deee5100a7ea2988b3bb39e">log10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a0d1150cf2deee5100a7ea2988b3bb39e" id="r_a0d1150cf2deee5100a7ea2988b3bb39e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0d1150cf2deee5100a7ea2988b3bb39e">log10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a0d1150cf2deee5100a7ea2988b3bb39e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a986ef245dd433ae62af864f5cbb07118" id="r_a986ef245dd433ae62af864f5cbb07118"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a986ef245dd433ae62af864f5cbb07118">log2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a986ef245dd433ae62af864f5cbb07118" id="r_a986ef245dd433ae62af864f5cbb07118"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a986ef245dd433ae62af864f5cbb07118">log2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a986ef245dd433ae62af864f5cbb07118"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a747e2e58092a27fb8b4dd3d16934fb52" id="r_a747e2e58092a27fb8b4dd3d16934fb52"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a747e2e58092a27fb8b4dd3d16934fb52">max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a747e2e58092a27fb8b4dd3d16934fb52" id="r_a747e2e58092a27fb8b4dd3d16934fb52"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a747e2e58092a27fb8b4dd3d16934fb52">max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a747e2e58092a27fb8b4dd3d16934fb52"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6fc2cf18ffa8149561864c86dba0f803" id="r_a6fc2cf18ffa8149561864c86dba0f803"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6fc2cf18ffa8149561864c86dba0f803">max3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a6fc2cf18ffa8149561864c86dba0f803" id="r_a6fc2cf18ffa8149561864c86dba0f803"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6fc2cf18ffa8149561864c86dba0f803">max3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a6fc2cf18ffa8149561864c86dba0f803"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a742b55f1e4369921ee7f60d70185bfbc" id="r_a742b55f1e4369921ee7f60d70185bfbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a742b55f1e4369921ee7f60d70185bfbc">median3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a742b55f1e4369921ee7f60d70185bfbc" id="r_a742b55f1e4369921ee7f60d70185bfbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a742b55f1e4369921ee7f60d70185bfbc">median3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a742b55f1e4369921ee7f60d70185bfbc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3e958e56a4712687c381a0b64d123e61" id="r_a3e958e56a4712687c381a0b64d123e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3e958e56a4712687c381a0b64d123e61">min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a3e958e56a4712687c381a0b64d123e61" id="r_a3e958e56a4712687c381a0b64d123e61"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3e958e56a4712687c381a0b64d123e61">min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a3e958e56a4712687c381a0b64d123e61"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a606a4c1b34ce05ea89ca5af81724036f" id="r_a606a4c1b34ce05ea89ca5af81724036f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a606a4c1b34ce05ea89ca5af81724036f">min3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a606a4c1b34ce05ea89ca5af81724036f" id="r_a606a4c1b34ce05ea89ca5af81724036f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a606a4c1b34ce05ea89ca5af81724036f">min3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a606a4c1b34ce05ea89ca5af81724036f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4583e8be04fc0bd475b97b0934604f23" id="r_a4583e8be04fc0bd475b97b0934604f23"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4583e8be04fc0bd475b97b0934604f23">nextafter</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a4583e8be04fc0bd475b97b0934604f23" id="r_a4583e8be04fc0bd475b97b0934604f23"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4583e8be04fc0bd475b97b0934604f23">nextafter</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a4583e8be04fc0bd475b97b0934604f23"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ade2367eaec894bd2e14a1351c363e003" id="r_ade2367eaec894bd2e14a1351c363e003"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ade2367eaec894bd2e14a1351c363e003">pow</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ade2367eaec894bd2e14a1351c363e003" id="r_ade2367eaec894bd2e14a1351c363e003"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ade2367eaec894bd2e14a1351c363e003">pow</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ade2367eaec894bd2e14a1351c363e003"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4293cbc94175b4dcc724fe4747eb5d5a" id="r_a4293cbc94175b4dcc724fe4747eb5d5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4293cbc94175b4dcc724fe4747eb5d5a">powr</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a4293cbc94175b4dcc724fe4747eb5d5a" id="r_a4293cbc94175b4dcc724fe4747eb5d5a"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4293cbc94175b4dcc724fe4747eb5d5a">powr</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a4293cbc94175b4dcc724fe4747eb5d5a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa613bc252f8d8069e175ec9e9d05a7ec" id="r_aa613bc252f8d8069e175ec9e9d05a7ec"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa613bc252f8d8069e175ec9e9d05a7ec">rint</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa613bc252f8d8069e175ec9e9d05a7ec" id="r_aa613bc252f8d8069e175ec9e9d05a7ec"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa613bc252f8d8069e175ec9e9d05a7ec">rint</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa613bc252f8d8069e175ec9e9d05a7ec"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4cb687257a004726d49e496417eaa40f" id="r_a4cb687257a004726d49e496417eaa40f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4cb687257a004726d49e496417eaa40f">round</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a4cb687257a004726d49e496417eaa40f" id="r_a4cb687257a004726d49e496417eaa40f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4cb687257a004726d49e496417eaa40f">round</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a4cb687257a004726d49e496417eaa40f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa62097c750f1e4b69d09277f19976ab1" id="r_aa62097c750f1e4b69d09277f19976ab1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa62097c750f1e4b69d09277f19976ab1">rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa62097c750f1e4b69d09277f19976ab1" id="r_aa62097c750f1e4b69d09277f19976ab1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa62097c750f1e4b69d09277f19976ab1">rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa62097c750f1e4b69d09277f19976ab1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3af771cfe7a135104f9d063147dba270" id="r_a3af771cfe7a135104f9d063147dba270"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3af771cfe7a135104f9d063147dba270">sin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a3af771cfe7a135104f9d063147dba270" id="r_a3af771cfe7a135104f9d063147dba270"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3af771cfe7a135104f9d063147dba270">sin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a3af771cfe7a135104f9d063147dba270"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a990d90b3440e38d1fb4ff5065c6c189b" id="r_a990d90b3440e38d1fb4ff5065c6c189b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a990d90b3440e38d1fb4ff5065c6c189b">sinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a990d90b3440e38d1fb4ff5065c6c189b" id="r_a990d90b3440e38d1fb4ff5065c6c189b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a990d90b3440e38d1fb4ff5065c6c189b">sinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a990d90b3440e38d1fb4ff5065c6c189b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab07a32fe544aa304577d29e0251e87b2" id="r_ab07a32fe544aa304577d29e0251e87b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab07a32fe544aa304577d29e0251e87b2">sinpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ab07a32fe544aa304577d29e0251e87b2" id="r_ab07a32fe544aa304577d29e0251e87b2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab07a32fe544aa304577d29e0251e87b2">sinpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ab07a32fe544aa304577d29e0251e87b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4218a85c7d8a74cb8055b4755205627e" id="r_a4218a85c7d8a74cb8055b4755205627e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4218a85c7d8a74cb8055b4755205627e">sqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a4218a85c7d8a74cb8055b4755205627e" id="r_a4218a85c7d8a74cb8055b4755205627e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4218a85c7d8a74cb8055b4755205627e">sqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a4218a85c7d8a74cb8055b4755205627e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae34754afa152a6170ac2ae3294174506" id="r_ae34754afa152a6170ac2ae3294174506"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae34754afa152a6170ac2ae3294174506">tan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ae34754afa152a6170ac2ae3294174506" id="r_ae34754afa152a6170ac2ae3294174506"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae34754afa152a6170ac2ae3294174506">tan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ae34754afa152a6170ac2ae3294174506"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a13e6e6ae087b7c558e9a94ddbc864d43" id="r_a13e6e6ae087b7c558e9a94ddbc864d43"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a13e6e6ae087b7c558e9a94ddbc864d43">tanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a13e6e6ae087b7c558e9a94ddbc864d43" id="r_a13e6e6ae087b7c558e9a94ddbc864d43"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a13e6e6ae087b7c558e9a94ddbc864d43">tanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a13e6e6ae087b7c558e9a94ddbc864d43"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a39b2952d4adf1400016c63243798aaf8" id="r_a39b2952d4adf1400016c63243798aaf8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a39b2952d4adf1400016c63243798aaf8">tanpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a39b2952d4adf1400016c63243798aaf8" id="r_a39b2952d4adf1400016c63243798aaf8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a39b2952d4adf1400016c63243798aaf8">tanpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a39b2952d4adf1400016c63243798aaf8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa62e1075e86c626d97038f16e9433415" id="r_aa62e1075e86c626d97038f16e9433415"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa62e1075e86c626d97038f16e9433415">trunc</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aa62e1075e86c626d97038f16e9433415" id="r_aa62e1075e86c626d97038f16e9433415"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa62e1075e86c626d97038f16e9433415">trunc</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aa62e1075e86c626d97038f16e9433415"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
@@ -207,9 +207,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::abs </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::abs </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -224,9 +224,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::acos </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::acos </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -241,9 +241,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::acosh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::acosh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -258,9 +258,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::asin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::asin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -275,9 +275,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::asinh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::asinh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -292,9 +292,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::atan </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::atan </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y_over_x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y_over_x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -309,14 +309,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::atan2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::atan2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -330,9 +330,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::atanh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::atanh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -347,9 +347,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::ceil </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::ceil </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -364,9 +364,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::cos </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::cos </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -381,9 +381,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::cosh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::cosh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -398,9 +398,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::cospi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::cospi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -415,14 +415,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::divide </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::divide </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -436,9 +436,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::exp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::exp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -453,9 +453,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::exp10 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::exp10 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -470,9 +470,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::exp2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::exp2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -487,9 +487,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fabs </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fabs </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -504,14 +504,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fdim </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fdim </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -525,9 +525,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::floor </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::floor </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -542,19 +542,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fma </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fma </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -568,14 +568,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmax </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmax </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -589,19 +589,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmax3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmax3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -615,19 +615,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmedian3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmedian3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -641,14 +641,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -662,19 +662,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmin3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmin3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -688,14 +688,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmod </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fmod </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -709,9 +709,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fract </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::fract </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -726,9 +726,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::frexp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::frexp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -747,9 +747,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::ldexp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::ldexp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -768,9 +768,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::log </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::log </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -785,9 +785,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::log10 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::log10 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -802,9 +802,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::log2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::log2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -819,14 +819,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::max </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::max </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -840,19 +840,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::max3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::max3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -866,19 +866,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::median3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::median3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -892,14 +892,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::min </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::min </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -913,19 +913,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::min3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::min3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -939,14 +939,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::nextafter </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::nextafter </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -960,14 +960,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::pow </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::pow </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -981,14 +981,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::powr </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::powr </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1002,9 +1002,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::rint </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::rint </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1019,9 +1019,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::round </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::round </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1036,9 +1036,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::rsqrt </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::rsqrt </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1053,9 +1053,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1070,9 +1070,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sinh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sinh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1087,9 +1087,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sinpi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sinpi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1104,9 +1104,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sqrt </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::sqrt </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1121,9 +1121,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::tan </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::tan </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1138,9 +1138,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::tanh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::tanh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1155,9 +1155,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::tanpi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::tanpi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1172,9 +1172,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::trunc </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::fast::trunc </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
diff --git a/docs/build/html/namespacemetal_1_1precise.html b/docs/build/html/namespacemetal_1_1precise.html
index 286df3bce..939962812 100644
--- a/docs/build/html/namespacemetal_1_1precise.html
+++ b/docs/build/html/namespacemetal_1_1precise.html
@@ -96,107 +96,107 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a99f2b2746e813b9ca7b4249afbaf2a14" id="r_a99f2b2746e813b9ca7b4249afbaf2a14"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a99f2b2746e813b9ca7b4249afbaf2a14">abs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a99f2b2746e813b9ca7b4249afbaf2a14" id="r_a99f2b2746e813b9ca7b4249afbaf2a14"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a99f2b2746e813b9ca7b4249afbaf2a14">abs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a99f2b2746e813b9ca7b4249afbaf2a14"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8a2bcc89fc0b7e74f0453f82f89a8604" id="r_a8a2bcc89fc0b7e74f0453f82f89a8604"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8a2bcc89fc0b7e74f0453f82f89a8604">acos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8a2bcc89fc0b7e74f0453f82f89a8604" id="r_a8a2bcc89fc0b7e74f0453f82f89a8604"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8a2bcc89fc0b7e74f0453f82f89a8604">acos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8a2bcc89fc0b7e74f0453f82f89a8604"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1f489fabffab969b8677b56bb1136067" id="r_a1f489fabffab969b8677b56bb1136067"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1f489fabffab969b8677b56bb1136067">acosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a1f489fabffab969b8677b56bb1136067" id="r_a1f489fabffab969b8677b56bb1136067"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1f489fabffab969b8677b56bb1136067">acosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a1f489fabffab969b8677b56bb1136067"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adc7b8b6e12e320cb32030f728dcbf438" id="r_adc7b8b6e12e320cb32030f728dcbf438"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adc7b8b6e12e320cb32030f728dcbf438">asin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:adc7b8b6e12e320cb32030f728dcbf438" id="r_adc7b8b6e12e320cb32030f728dcbf438"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adc7b8b6e12e320cb32030f728dcbf438">asin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:adc7b8b6e12e320cb32030f728dcbf438"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aaad1cdde6687c8011fbc5fda1bb13424" id="r_aaad1cdde6687c8011fbc5fda1bb13424"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaad1cdde6687c8011fbc5fda1bb13424">asinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:aaad1cdde6687c8011fbc5fda1bb13424" id="r_aaad1cdde6687c8011fbc5fda1bb13424"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaad1cdde6687c8011fbc5fda1bb13424">asinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:aaad1cdde6687c8011fbc5fda1bb13424"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aaaf4b5f4786a912089bbf0ae7619a6be" id="r_aaaf4b5f4786a912089bbf0ae7619a6be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaaf4b5f4786a912089bbf0ae7619a6be">atan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
+<tr class="memitem:aaaf4b5f4786a912089bbf0ae7619a6be" id="r_aaaf4b5f4786a912089bbf0ae7619a6be"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaaf4b5f4786a912089bbf0ae7619a6be">atan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y_over_x)</td></tr>
 <tr class="separator:aaaf4b5f4786a912089bbf0ae7619a6be"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6f161b049cc6884f87b09b33c2d1cd7f" id="r_a6f161b049cc6884f87b09b33c2d1cd7f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6f161b049cc6884f87b09b33c2d1cd7f">atan2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a6f161b049cc6884f87b09b33c2d1cd7f" id="r_a6f161b049cc6884f87b09b33c2d1cd7f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6f161b049cc6884f87b09b33c2d1cd7f">atan2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a6f161b049cc6884f87b09b33c2d1cd7f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a902994837653b90c47f4285673e712c4" id="r_a902994837653b90c47f4285673e712c4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a902994837653b90c47f4285673e712c4">atanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a902994837653b90c47f4285673e712c4" id="r_a902994837653b90c47f4285673e712c4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a902994837653b90c47f4285673e712c4">atanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a902994837653b90c47f4285673e712c4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8577549a1afeea206dd9a2004af2868d" id="r_a8577549a1afeea206dd9a2004af2868d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8577549a1afeea206dd9a2004af2868d">ceil</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8577549a1afeea206dd9a2004af2868d" id="r_a8577549a1afeea206dd9a2004af2868d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8577549a1afeea206dd9a2004af2868d">ceil</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8577549a1afeea206dd9a2004af2868d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac4941f62e7d8ab9d7cabbd967aa9f220" id="r_ac4941f62e7d8ab9d7cabbd967aa9f220"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac4941f62e7d8ab9d7cabbd967aa9f220">cos</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ac4941f62e7d8ab9d7cabbd967aa9f220" id="r_ac4941f62e7d8ab9d7cabbd967aa9f220"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac4941f62e7d8ab9d7cabbd967aa9f220">cos</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ac4941f62e7d8ab9d7cabbd967aa9f220"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a72d86d508300a9b58f4ccbbe70da4fbc" id="r_a72d86d508300a9b58f4ccbbe70da4fbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a72d86d508300a9b58f4ccbbe70da4fbc">cosh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a72d86d508300a9b58f4ccbbe70da4fbc" id="r_a72d86d508300a9b58f4ccbbe70da4fbc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a72d86d508300a9b58f4ccbbe70da4fbc">cosh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a72d86d508300a9b58f4ccbbe70da4fbc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2392b78bd196efdbbac65901c4ab20e7" id="r_a2392b78bd196efdbbac65901c4ab20e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2392b78bd196efdbbac65901c4ab20e7">cospi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a2392b78bd196efdbbac65901c4ab20e7" id="r_a2392b78bd196efdbbac65901c4ab20e7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2392b78bd196efdbbac65901c4ab20e7">cospi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a2392b78bd196efdbbac65901c4ab20e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aec0982cdb96a08b61f51129150d82e9d" id="r_aec0982cdb96a08b61f51129150d82e9d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aec0982cdb96a08b61f51129150d82e9d">divide</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:aec0982cdb96a08b61f51129150d82e9d" id="r_aec0982cdb96a08b61f51129150d82e9d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aec0982cdb96a08b61f51129150d82e9d">divide</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:aec0982cdb96a08b61f51129150d82e9d"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8d8d2d5700ce432b33cf47cf22528e8f" id="r_a8d8d2d5700ce432b33cf47cf22528e8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8d8d2d5700ce432b33cf47cf22528e8f">exp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8d8d2d5700ce432b33cf47cf22528e8f" id="r_a8d8d2d5700ce432b33cf47cf22528e8f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8d8d2d5700ce432b33cf47cf22528e8f">exp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8d8d2d5700ce432b33cf47cf22528e8f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af9addb343c967da3a83e9e123a8521fd" id="r_af9addb343c967da3a83e9e123a8521fd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af9addb343c967da3a83e9e123a8521fd">exp10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:af9addb343c967da3a83e9e123a8521fd" id="r_af9addb343c967da3a83e9e123a8521fd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af9addb343c967da3a83e9e123a8521fd">exp10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:af9addb343c967da3a83e9e123a8521fd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a92a880bd2197efc0da0f8f0f7ec1e4c9" id="r_a92a880bd2197efc0da0f8f0f7ec1e4c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a92a880bd2197efc0da0f8f0f7ec1e4c9">exp2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a92a880bd2197efc0da0f8f0f7ec1e4c9" id="r_a92a880bd2197efc0da0f8f0f7ec1e4c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a92a880bd2197efc0da0f8f0f7ec1e4c9">exp2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a92a880bd2197efc0da0f8f0f7ec1e4c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae4c71d8bc8ef291036a7aaa05f8be3d1" id="r_ae4c71d8bc8ef291036a7aaa05f8be3d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae4c71d8bc8ef291036a7aaa05f8be3d1">fabs</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ae4c71d8bc8ef291036a7aaa05f8be3d1" id="r_ae4c71d8bc8ef291036a7aaa05f8be3d1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae4c71d8bc8ef291036a7aaa05f8be3d1">fabs</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ae4c71d8bc8ef291036a7aaa05f8be3d1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af693e7c93de446e80dd1377f5e9e7260" id="r_af693e7c93de446e80dd1377f5e9e7260"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af693e7c93de446e80dd1377f5e9e7260">fdim</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:af693e7c93de446e80dd1377f5e9e7260" id="r_af693e7c93de446e80dd1377f5e9e7260"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af693e7c93de446e80dd1377f5e9e7260">fdim</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:af693e7c93de446e80dd1377f5e9e7260"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a66e02b028e3cecfe7c80773460dc7925" id="r_a66e02b028e3cecfe7c80773460dc7925"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a66e02b028e3cecfe7c80773460dc7925">floor</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a66e02b028e3cecfe7c80773460dc7925" id="r_a66e02b028e3cecfe7c80773460dc7925"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a66e02b028e3cecfe7c80773460dc7925">floor</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a66e02b028e3cecfe7c80773460dc7925"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a49391a64d6b66fe3a212516b316a2144" id="r_a49391a64d6b66fe3a212516b316a2144"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a49391a64d6b66fe3a212516b316a2144">fma</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a49391a64d6b66fe3a212516b316a2144" id="r_a49391a64d6b66fe3a212516b316a2144"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a49391a64d6b66fe3a212516b316a2144">fma</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a49391a64d6b66fe3a212516b316a2144"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac7d49f921c2883caf9eec66efc4de1cd" id="r_ac7d49f921c2883caf9eec66efc4de1cd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac7d49f921c2883caf9eec66efc4de1cd">fmax</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ac7d49f921c2883caf9eec66efc4de1cd" id="r_ac7d49f921c2883caf9eec66efc4de1cd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac7d49f921c2883caf9eec66efc4de1cd">fmax</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ac7d49f921c2883caf9eec66efc4de1cd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adf750e51bd83d569994d0967029e3bdc" id="r_adf750e51bd83d569994d0967029e3bdc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adf750e51bd83d569994d0967029e3bdc">fmax3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:adf750e51bd83d569994d0967029e3bdc" id="r_adf750e51bd83d569994d0967029e3bdc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adf750e51bd83d569994d0967029e3bdc">fmax3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:adf750e51bd83d569994d0967029e3bdc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a48d1d0be889de4043b775bb6b030a989" id="r_a48d1d0be889de4043b775bb6b030a989"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a48d1d0be889de4043b775bb6b030a989">fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a48d1d0be889de4043b775bb6b030a989" id="r_a48d1d0be889de4043b775bb6b030a989"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a48d1d0be889de4043b775bb6b030a989">fmedian3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a48d1d0be889de4043b775bb6b030a989"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a18df8eb481dfa56c92ad31b5bab8e069" id="r_a18df8eb481dfa56c92ad31b5bab8e069"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a18df8eb481dfa56c92ad31b5bab8e069">fmin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a18df8eb481dfa56c92ad31b5bab8e069" id="r_a18df8eb481dfa56c92ad31b5bab8e069"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a18df8eb481dfa56c92ad31b5bab8e069">fmin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a18df8eb481dfa56c92ad31b5bab8e069"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5bb710e6742996d32225a8f54a0f116c" id="r_a5bb710e6742996d32225a8f54a0f116c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5bb710e6742996d32225a8f54a0f116c">fmin3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a5bb710e6742996d32225a8f54a0f116c" id="r_a5bb710e6742996d32225a8f54a0f116c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5bb710e6742996d32225a8f54a0f116c">fmin3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a5bb710e6742996d32225a8f54a0f116c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa99937178a1fc8158054e328eeeae648" id="r_aa99937178a1fc8158054e328eeeae648"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa99937178a1fc8158054e328eeeae648">fmod</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:aa99937178a1fc8158054e328eeeae648" id="r_aa99937178a1fc8158054e328eeeae648"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa99937178a1fc8158054e328eeeae648">fmod</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:aa99937178a1fc8158054e328eeeae648"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0f21c19332a90df1a8ff507a813b5757" id="r_a0f21c19332a90df1a8ff507a813b5757"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0f21c19332a90df1a8ff507a813b5757">fract</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a0f21c19332a90df1a8ff507a813b5757" id="r_a0f21c19332a90df1a8ff507a813b5757"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0f21c19332a90df1a8ff507a813b5757">fract</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a0f21c19332a90df1a8ff507a813b5757"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0fbb1624c308b97380f894f92fd858b4" id="r_a0fbb1624c308b97380f894f92fd858b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0fbb1624c308b97380f894f92fd858b4">frexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="#a8d8d2d5700ce432b33cf47cf22528e8f">exp</a>)</td></tr>
+<tr class="memitem:a0fbb1624c308b97380f894f92fd858b4" id="r_a0fbb1624c308b97380f894f92fd858b4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0fbb1624c308b97380f894f92fd858b4">frexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, thread int &amp;<a class="el" href="#a8d8d2d5700ce432b33cf47cf22528e8f">exp</a>)</td></tr>
 <tr class="separator:a0fbb1624c308b97380f894f92fd858b4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa0462827a08a9f475fdaeb104c98b6ab" id="r_aa0462827a08a9f475fdaeb104c98b6ab"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa0462827a08a9f475fdaeb104c98b6ab">ldexp</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
+<tr class="memitem:aa0462827a08a9f475fdaeb104c98b6ab" id="r_aa0462827a08a9f475fdaeb104c98b6ab"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa0462827a08a9f475fdaeb104c98b6ab">ldexp</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, int k)</td></tr>
 <tr class="separator:aa0462827a08a9f475fdaeb104c98b6ab"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a341c2b8c27d1bed860f85f8b355023d4" id="r_a341c2b8c27d1bed860f85f8b355023d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a341c2b8c27d1bed860f85f8b355023d4">log</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a341c2b8c27d1bed860f85f8b355023d4" id="r_a341c2b8c27d1bed860f85f8b355023d4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a341c2b8c27d1bed860f85f8b355023d4">log</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a341c2b8c27d1bed860f85f8b355023d4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a44239067e8e9248b1574353f98e94d72" id="r_a44239067e8e9248b1574353f98e94d72"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a44239067e8e9248b1574353f98e94d72">log10</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a44239067e8e9248b1574353f98e94d72" id="r_a44239067e8e9248b1574353f98e94d72"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a44239067e8e9248b1574353f98e94d72">log10</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a44239067e8e9248b1574353f98e94d72"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a632dbbdcc1a465cf4739a14306147573" id="r_a632dbbdcc1a465cf4739a14306147573"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a632dbbdcc1a465cf4739a14306147573">log2</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a632dbbdcc1a465cf4739a14306147573" id="r_a632dbbdcc1a465cf4739a14306147573"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a632dbbdcc1a465cf4739a14306147573">log2</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a632dbbdcc1a465cf4739a14306147573"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6a954a4e4e3753303d1dc734855a185f" id="r_a6a954a4e4e3753303d1dc734855a185f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6a954a4e4e3753303d1dc734855a185f">max</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a6a954a4e4e3753303d1dc734855a185f" id="r_a6a954a4e4e3753303d1dc734855a185f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6a954a4e4e3753303d1dc734855a185f">max</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a6a954a4e4e3753303d1dc734855a185f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac490e8614ebd2c9343af1ae6c0d4e82c" id="r_ac490e8614ebd2c9343af1ae6c0d4e82c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac490e8614ebd2c9343af1ae6c0d4e82c">max3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:ac490e8614ebd2c9343af1ae6c0d4e82c" id="r_ac490e8614ebd2c9343af1ae6c0d4e82c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac490e8614ebd2c9343af1ae6c0d4e82c">max3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:ac490e8614ebd2c9343af1ae6c0d4e82c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a14555ff99c4388493fec48e070144ae2" id="r_a14555ff99c4388493fec48e070144ae2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a14555ff99c4388493fec48e070144ae2">median3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a14555ff99c4388493fec48e070144ae2" id="r_a14555ff99c4388493fec48e070144ae2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a14555ff99c4388493fec48e070144ae2">median3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a14555ff99c4388493fec48e070144ae2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afed0da2f7df3505b5dffa2389c3cb36e" id="r_afed0da2f7df3505b5dffa2389c3cb36e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afed0da2f7df3505b5dffa2389c3cb36e">min</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:afed0da2f7df3505b5dffa2389c3cb36e" id="r_afed0da2f7df3505b5dffa2389c3cb36e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afed0da2f7df3505b5dffa2389c3cb36e">min</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:afed0da2f7df3505b5dffa2389c3cb36e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4d37ce31c3549ca4772a4ee29798e231" id="r_a4d37ce31c3549ca4772a4ee29798e231"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4d37ce31c3549ca4772a4ee29798e231">min3</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
+<tr class="memitem:a4d37ce31c3549ca4772a4ee29798e231" id="r_a4d37ce31c3549ca4772a4ee29798e231"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4d37ce31c3549ca4772a4ee29798e231">min3</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> z)</td></tr>
 <tr class="separator:a4d37ce31c3549ca4772a4ee29798e231"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad012ceeb55b77f1533749b351331e026" id="r_ad012ceeb55b77f1533749b351331e026"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad012ceeb55b77f1533749b351331e026">nextafter</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ad012ceeb55b77f1533749b351331e026" id="r_ad012ceeb55b77f1533749b351331e026"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad012ceeb55b77f1533749b351331e026">nextafter</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ad012ceeb55b77f1533749b351331e026"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a4cce64f1f20c1c6dfd29115bdb7c8d42" id="r_a4cce64f1f20c1c6dfd29115bdb7c8d42"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4cce64f1f20c1c6dfd29115bdb7c8d42">pow</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a4cce64f1f20c1c6dfd29115bdb7c8d42" id="r_a4cce64f1f20c1c6dfd29115bdb7c8d42"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4cce64f1f20c1c6dfd29115bdb7c8d42">pow</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a4cce64f1f20c1c6dfd29115bdb7c8d42"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac9dbab0bd99b2b94e364aba5353bdcd7" id="r_ac9dbab0bd99b2b94e364aba5353bdcd7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac9dbab0bd99b2b94e364aba5353bdcd7">powr</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:ac9dbab0bd99b2b94e364aba5353bdcd7" id="r_ac9dbab0bd99b2b94e364aba5353bdcd7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac9dbab0bd99b2b94e364aba5353bdcd7">powr</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:ac9dbab0bd99b2b94e364aba5353bdcd7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab17bd408098270ad92f37bcd1039c254" id="r_ab17bd408098270ad92f37bcd1039c254"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab17bd408098270ad92f37bcd1039c254">rint</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:ab17bd408098270ad92f37bcd1039c254" id="r_ab17bd408098270ad92f37bcd1039c254"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab17bd408098270ad92f37bcd1039c254">rint</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:ab17bd408098270ad92f37bcd1039c254"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5295ab08055d12534cc3775da855ac12" id="r_a5295ab08055d12534cc3775da855ac12"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5295ab08055d12534cc3775da855ac12">round</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a5295ab08055d12534cc3775da855ac12" id="r_a5295ab08055d12534cc3775da855ac12"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5295ab08055d12534cc3775da855ac12">round</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a5295ab08055d12534cc3775da855ac12"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afb397b477745f12a44423934fa2b05ac" id="r_afb397b477745f12a44423934fa2b05ac"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afb397b477745f12a44423934fa2b05ac">rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:afb397b477745f12a44423934fa2b05ac" id="r_afb397b477745f12a44423934fa2b05ac"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afb397b477745f12a44423934fa2b05ac">rsqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:afb397b477745f12a44423934fa2b05ac"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a71acf77ffd29c56f56afae0195c98a1c" id="r_a71acf77ffd29c56f56afae0195c98a1c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a71acf77ffd29c56f56afae0195c98a1c">sin</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a71acf77ffd29c56f56afae0195c98a1c" id="r_a71acf77ffd29c56f56afae0195c98a1c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a71acf77ffd29c56f56afae0195c98a1c">sin</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a71acf77ffd29c56f56afae0195c98a1c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abc8f4f59dd6e7204ab5d84f0af96331c" id="r_abc8f4f59dd6e7204ab5d84f0af96331c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abc8f4f59dd6e7204ab5d84f0af96331c">sinh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:abc8f4f59dd6e7204ab5d84f0af96331c" id="r_abc8f4f59dd6e7204ab5d84f0af96331c"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abc8f4f59dd6e7204ab5d84f0af96331c">sinh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:abc8f4f59dd6e7204ab5d84f0af96331c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a78b17dab93519d9c82c2575dafec49c9" id="r_a78b17dab93519d9c82c2575dafec49c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a78b17dab93519d9c82c2575dafec49c9">sinpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a78b17dab93519d9c82c2575dafec49c9" id="r_a78b17dab93519d9c82c2575dafec49c9"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a78b17dab93519d9c82c2575dafec49c9">sinpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a78b17dab93519d9c82c2575dafec49c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:acb213467361cd2cab93a8d5ea1aa5bfd" id="r_acb213467361cd2cab93a8d5ea1aa5bfd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acb213467361cd2cab93a8d5ea1aa5bfd">sqrt</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:acb213467361cd2cab93a8d5ea1aa5bfd" id="r_acb213467361cd2cab93a8d5ea1aa5bfd"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acb213467361cd2cab93a8d5ea1aa5bfd">sqrt</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:acb213467361cd2cab93a8d5ea1aa5bfd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8261ed22f03122ef15b89512358acb1f" id="r_a8261ed22f03122ef15b89512358acb1f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8261ed22f03122ef15b89512358acb1f">tan</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8261ed22f03122ef15b89512358acb1f" id="r_a8261ed22f03122ef15b89512358acb1f"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8261ed22f03122ef15b89512358acb1f">tan</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8261ed22f03122ef15b89512358acb1f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a741c27a10cc968dd1e63473d9fcd8f99" id="r_a741c27a10cc968dd1e63473d9fcd8f99"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a741c27a10cc968dd1e63473d9fcd8f99">tanh</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a741c27a10cc968dd1e63473d9fcd8f99" id="r_a741c27a10cc968dd1e63473d9fcd8f99"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a741c27a10cc968dd1e63473d9fcd8f99">tanh</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a741c27a10cc968dd1e63473d9fcd8f99"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8fae8c20deff43a8e855bba6f3ba20a5" id="r_a8fae8c20deff43a8e855bba6f3ba20a5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8fae8c20deff43a8e855bba6f3ba20a5">tanpi</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a8fae8c20deff43a8e855bba6f3ba20a5" id="r_a8fae8c20deff43a8e855bba6f3ba20a5"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8fae8c20deff43a8e855bba6f3ba20a5">tanpi</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a8fae8c20deff43a8e855bba6f3ba20a5"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a334183e7a2dd49b983d072d1e8ee2b27" id="r_a334183e7a2dd49b983d072d1e8ee2b27"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a334183e7a2dd49b983d072d1e8ee2b27">trunc</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
+<tr class="memitem:a334183e7a2dd49b983d072d1e8ee2b27" id="r_a334183e7a2dd49b983d072d1e8ee2b27"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a334183e7a2dd49b983d072d1e8ee2b27">trunc</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x)</td></tr>
 <tr class="separator:a334183e7a2dd49b983d072d1e8ee2b27"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
@@ -207,9 +207,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::abs </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::abs </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -224,9 +224,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::acos </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::acos </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -241,9 +241,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::acosh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::acosh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -258,9 +258,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::asin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::asin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -275,9 +275,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::asinh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::asinh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -292,9 +292,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::atan </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::atan </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y_over_x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y_over_x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -309,14 +309,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::atan2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::atan2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -330,9 +330,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::atanh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::atanh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -347,9 +347,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::ceil </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::ceil </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -364,9 +364,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::cos </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::cos </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -381,9 +381,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::cosh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::cosh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -398,9 +398,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::cospi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::cospi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -415,14 +415,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::divide </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::divide </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -436,9 +436,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::exp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::exp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -453,9 +453,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::exp10 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::exp10 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -470,9 +470,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::exp2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::exp2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -487,9 +487,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fabs </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fabs </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -504,14 +504,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fdim </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fdim </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -525,9 +525,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::floor </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::floor </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -542,19 +542,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fma </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fma </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -568,14 +568,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmax </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmax </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -589,19 +589,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmax3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmax3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -615,19 +615,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmedian3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmedian3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -641,14 +641,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -662,19 +662,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmin3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmin3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -688,14 +688,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmod </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fmod </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -709,9 +709,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fract </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::fract </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -726,9 +726,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::frexp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::frexp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -747,9 +747,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::ldexp </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::ldexp </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -768,9 +768,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::log </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::log </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -785,9 +785,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::log10 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::log10 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -802,9 +802,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::log2 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::log2 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -819,14 +819,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::max </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::max </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -840,19 +840,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::max3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::max3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -866,19 +866,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::median3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::median3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -892,14 +892,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::min </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::min </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -913,19 +913,19 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::min3 </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::min3 </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>z</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -939,14 +939,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::nextafter </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::nextafter </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -960,14 +960,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::pow </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::pow </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -981,14 +981,14 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::powr </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::powr </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -1002,9 +1002,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::rint </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::rint </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1019,9 +1019,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::round </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::round </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1036,9 +1036,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::rsqrt </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::rsqrt </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1053,9 +1053,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sin </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sin </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1070,9 +1070,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sinh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sinh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1087,9 +1087,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sinpi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sinpi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1104,9 +1104,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sqrt </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::sqrt </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1121,9 +1121,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::tan </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::tan </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1138,9 +1138,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::tanh </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::tanh </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1155,9 +1155,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::tanpi </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::tanpi </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
@@ -1172,9 +1172,9 @@ Functions</h2></td></tr>
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::trunc </td>
+          <td class="memname">METAL_FUNC <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::precise::trunc </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
           <td></td>
         </tr>
       </table>
diff --git a/docs/build/html/namespacemlx_1_1core.html b/docs/build/html/namespacemlx_1_1core.html
index 09ca616f9..b8ef276c1 100644
--- a/docs/build/html/namespacemlx_1_1core.html
+++ b/docs/build/html/namespacemlx_1_1core.html
@@ -107,6 +107,8 @@ Namespaces</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1distributed.html">distributed</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1env.html">env</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1fast.html">fast</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1fft.html">fft</a></td></tr>
@@ -182,6 +184,8 @@ Classes</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_conjugate.html">Conjugate</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">Contiguous</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1core_1_1_contiguous_iterator.html">ContiguousIterator</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_convolution.html">Convolution</a></td></tr>
@@ -434,7 +438,7 @@ Functions</h2></td></tr>
 <tr class="separator:a985c60929757190e0b4ec51f57c767d0"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a3b900ab319948c5a01a3ecd30a709027" id="r_a3b900ab319948c5a01a3ecd30a709027"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3b900ab319948c5a01a3ecd30a709027">compiled_check_contiguity</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, const std::vector&lt; int &gt; &amp;shape)</td></tr>
 <tr class="separator:a3b900ab319948c5a01a3ecd30a709027"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab8c3c4fc05745f586de922c8266f4fce" id="r_ab8c3c4fc05745f586de922c8266f4fce"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab8c3c4fc05745f586de922c8266f4fce">compiled_allocate_outputs</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs, const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs_, const std::unordered_set&lt; uintptr_t &gt; &amp;constant_ids_, bool contiguous, bool move_buffers=false)</td></tr>
+<tr class="memitem:ab8c3c4fc05745f586de922c8266f4fce" id="r_ab8c3c4fc05745f586de922c8266f4fce"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab8c3c4fc05745f586de922c8266f4fce">compiled_allocate_outputs</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs, const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs_, const std::unordered_set&lt; uintptr_t &gt; &amp;constant_ids_, bool <a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a>, bool move_buffers=false)</td></tr>
 <tr class="separator:ab8c3c4fc05745f586de922c8266f4fce"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a479648542a2bea151b947b18f0e79dd2" id="r_a479648542a2bea151b947b18f0e79dd2"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a479648542a2bea151b947b18f0e79dd2">copy</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;src, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;dst, <a class="el" href="#abd84ff6c5245e4e170b2ef5247594337">CopyType</a> ctype)</td></tr>
 <tr class="separator:a479648542a2bea151b947b18f0e79dd2"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -493,6 +497,10 @@ Functions</h2></td></tr>
 <tr class="separator:a3ba20a804c306067b7023259429e0e48"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:af650e831ce21759da1ac103037d08d84" id="r_af650e831ce21759da1ac103037d08d84"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af650e831ce21759da1ac103037d08d84">is_donatable</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out)</td></tr>
 <tr class="separator:af650e831ce21759da1ac103037d08d84"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a830a47d8a317dffb0c88e5a7afe6aee2" id="r_a830a47d8a317dffb0c88e5a7afe6aee2"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a830a47d8a317dffb0c88e5a7afe6aee2">move_or_copy</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out)</td></tr>
+<tr class="separator:a830a47d8a317dffb0c88e5a7afe6aee2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aae1e770954edf1f9a35d19e0de4d857a" id="r_aae1e770954edf1f9a35d19e0de4d857a"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aae1e770954edf1f9a35d19e0de4d857a">move_or_copy</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::vector&lt; size_t &gt; &amp;strides, <a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html">array::Flags</a> flags, size_t data_size, size_t offset=0)</td></tr>
+<tr class="separator:aae1e770954edf1f9a35d19e0de4d857a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ad884f4a36308b5b4f8a5d990d2e086df" id="r_ad884f4a36308b5b4f8a5d990d2e086df"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad884f4a36308b5b4f8a5d990d2e086df">binary_op_gpu</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;outputs, const std::string &amp;<a class="el" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>, const <a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> &amp;s)</td></tr>
 <tr class="separator:ad884f4a36308b5b4f8a5d990d2e086df"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a094876ea5a2a2445ab64efc8222da202" id="r_a094876ea5a2a2445ab64efc8222da202"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a094876ea5a2a2445ab64efc8222da202">binary_op_gpu</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::string &amp;<a class="el" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>, const <a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> &amp;s)</td></tr>
@@ -534,10 +542,10 @@ Functions</h2></td></tr>
 <tr class="separator:a84ebe6275218070f0ea320f126f64e22"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:afb57825bb763050cc9a9d194aa41ac36" id="r_afb57825bb763050cc9a9d194aa41ac36"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afb57825bb763050cc9a9d194aa41ac36">get_mb_sort_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;idx, int bn, int tn)</td></tr>
 <tr class="separator:afb57825bb763050cc9a9d194aa41ac36"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3bd386cb6db09f636963ce66ceaf8647" id="r_a3bd386cb6db09f636963ce66ceaf8647"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3bd386cb6db09f636963ce66ceaf8647">get_reduce_init_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out)</td></tr>
-<tr class="separator:a3bd386cb6db09f636963ce66ceaf8647"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7aa91fcfe8b9caa42d60a957f11bfe6b" id="r_a7aa91fcfe8b9caa42d60a957f11bfe6b"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7aa91fcfe8b9caa42d60a957f11bfe6b">get_reduce_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, int ndim=-1, int bm=-1, int bn=-1)</td></tr>
-<tr class="separator:a7aa91fcfe8b9caa42d60a957f11bfe6b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae0470605dc819efeb6510183619f0299" id="r_ae0470605dc819efeb6510183619f0299"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae0470605dc819efeb6510183619f0299">get_reduce_init_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;out_type)</td></tr>
+<tr class="separator:ae0470605dc819efeb6510183619f0299"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1be32ba7d67137dde7ac191dfe83ff49" id="r_a1be32ba7d67137dde7ac191dfe83ff49"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1be32ba7d67137dde7ac191dfe83ff49">get_reduce_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;func_name, const std::string &amp;op_name, const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;in_type, const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;out_type, const std::string &amp;idx_t, int ndim=-1, int bm=-1, int bn=-1)</td></tr>
+<tr class="separator:a1be32ba7d67137dde7ac191dfe83ff49"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a84fa8e0aee321a9d614433a0b933103b" id="r_a84fa8e0aee321a9d614433a0b933103b"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a84fa8e0aee321a9d614433a0b933103b">get_steel_gemm_fused_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const std::string &amp;hash_name, const <a class="el" href="namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54">metal::MTLFCList</a> &amp;func_consts, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn)</td></tr>
 <tr class="separator:a84fa8e0aee321a9d614433a0b933103b"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:af48c6f2f72b61dbd6766e4f5fea85df5" id="r_af48c6f2f72b61dbd6766e4f5fea85df5"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af48c6f2f72b61dbd6766e4f5fea85df5">get_steel_gemm_splitk_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;in, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn, bool mn_aligned, bool k_aligned)</td></tr>
@@ -548,7 +556,7 @@ Functions</h2></td></tr>
 <tr class="separator:ab5f60614e965144b451930fdf935e08d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:adce79d220672f5f3c65cc31d145ca9c4" id="r_adce79d220672f5f3c65cc31d145ca9c4"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adce79d220672f5f3c65cc31d145ca9c4">get_steel_conv_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, int bm, int bn, int bk, int wm, int wn, int n_channel_specialization, bool small_filter)</td></tr>
 <tr class="separator:adce79d220672f5f3c65cc31d145ca9c4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a90c24e0d0b99b68fad9deefcf4d3e818" id="r_a90c24e0d0b99b68fad9deefcf4d3e818"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a90c24e0d0b99b68fad9deefcf4d3e818">get_gemv_masked_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_op, bool transpose_mat, int bm, int bn, int sm, int sn, int tm, int tn, bool contiguous)</td></tr>
+<tr class="memitem:a90c24e0d0b99b68fad9deefcf4d3e818" id="r_a90c24e0d0b99b68fad9deefcf4d3e818"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a90c24e0d0b99b68fad9deefcf4d3e818">get_gemv_masked_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_out, const std::optional&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;mask_op, bool transpose_mat, int bm, int bn, int sm, int sn, int tm, int tn, bool <a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a>)</td></tr>
 <tr class="separator:a90c24e0d0b99b68fad9deefcf4d3e818"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:abce2b67044ee06a7bbe7a91ec7c8c48d" id="r_abce2b67044ee06a7bbe7a91ec7c8c48d"><td class="memItemLeft" align="right" valign="top">MTL::ComputePipelineState *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abce2b67044ee06a7bbe7a91ec7c8c48d">get_steel_conv_general_kernel</a> (<a class="el" href="classmlx_1_1core_1_1metal_1_1_device.html">metal::Device</a> &amp;d, const std::string &amp;kernel_name, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, int bm, int bn, int bk, int wm, int wn)</td></tr>
 <tr class="separator:abce2b67044ee06a7bbe7a91ec7c8c48d"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -583,12 +591,8 @@ Functions</h2></td></tr>
 <tr class="separator:aba2b4accc059f30d4dca88db9f7a6e13"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a668fde2bd280a88f63a68b68a343d375" id="r_a668fde2bd280a88f63a68b68a343d375"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a668fde2bd280a88f63a68b68a343d375">unary_op_gpu_inplace</a> (const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;inputs, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;out, const std::string <a class="el" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>, const <a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a> &amp;s)</td></tr>
 <tr class="separator:a668fde2bd280a88f63a68b68a343d375"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a62340bbaa8b216539688a60adcb568bf" id="r_a62340bbaa8b216539688a60adcb568bf"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
-<tr class="memitem:a62340bbaa8b216539688a60adcb568bf"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a62340bbaa8b216539688a60adcb568bf">set_vector_bytes</a> (<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;enc, const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)</td></tr>
-<tr class="separator:a62340bbaa8b216539688a60adcb568bf"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae309cb543dfb0239cfccc53a8ad0408e" id="r_ae309cb543dfb0239cfccc53a8ad0408e"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
-<tr class="memitem:ae309cb543dfb0239cfccc53a8ad0408e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ae309cb543dfb0239cfccc53a8ad0408e">set_vector_bytes</a> (<a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;enc, const std::vector&lt; T &gt; &amp;vec, int idx)</td></tr>
-<tr class="separator:ae309cb543dfb0239cfccc53a8ad0408e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aef60e3a8d9c987c9c338b193673d2164" id="r_aef60e3a8d9c987c9c338b193673d2164"><td class="memItemLeft" align="right" valign="top">std::string&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aef60e3a8d9c987c9c338b193673d2164">type_to_name</a> (const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;t)</td></tr>
+<tr class="separator:aef60e3a8d9c987c9c338b193673d2164"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:af1fdfdaa5644394362e6baba30701bae" id="r_af1fdfdaa5644394362e6baba30701bae"><td class="memItemLeft" align="right" valign="top">std::string&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af1fdfdaa5644394362e6baba30701bae">type_to_name</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a)</td></tr>
 <tr class="separator:af1fdfdaa5644394362e6baba30701bae"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a0f0f59d3ffe2d16a684e5fc093302e15" id="r_a0f0f59d3ffe2d16a684e5fc093302e15"><td class="memItemLeft" align="right" valign="top">MTL::Size&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0f0f59d3ffe2d16a684e5fc093302e15">get_block_dims</a> (int dim0, int dim1, int dim2, int pow2=10)</td></tr>
@@ -605,6 +609,12 @@ Functions</h2></td></tr>
 <tr class="separator:a489e45b3a5cd8b46e8ea56b9132eb230"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ad4be35b310a252edd80d9cf04f094a60" id="r_ad4be35b310a252edd80d9cf04f094a60"><td class="memItemLeft" align="right" valign="top">std::string&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad4be35b310a252edd80d9cf04f094a60">get_primitive_string</a> (<a class="el" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> *primitive)</td></tr>
 <tr class="separator:ad4be35b310a252edd80d9cf04f094a60"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a76a2e310857f60f5ea6f1388d45b964d" id="r_a76a2e310857f60f5ea6f1388d45b964d"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a76a2e310857f60f5ea6f1388d45b964d"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a76a2e310857f60f5ea6f1388d45b964d">concatenate</a> (std::string &amp;acc, T first)</td></tr>
+<tr class="separator:a76a2e310857f60f5ea6f1388d45b964d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aaf51544472fa87fa974686eacdd2a4a6" id="r_aaf51544472fa87fa974686eacdd2a4a6"><td class="memTemplParams" colspan="2">template&lt;typename T , typename... Args&gt; </td></tr>
+<tr class="memitem:aaf51544472fa87fa974686eacdd2a4a6"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aaf51544472fa87fa974686eacdd2a4a6">concatenate</a> (std::string &amp;acc, T first, Args... args)</td></tr>
+<tr class="separator:aaf51544472fa87fa974686eacdd2a4a6"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a3ac798e65e59fe10b7fb5c522efce782" id="r_a3ac798e65e59fe10b7fb5c522efce782"><td class="memItemLeft" align="right" valign="top">std::function&lt; std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;(const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;)&gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3ac798e65e59fe10b7fb5c522efce782">compile</a> (const std::function&lt; std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;(const std::vector&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt; &amp;)&gt; &amp;fun, bool shapeless=false)</td></tr>
 <tr class="memdesc:a3ac798e65e59fe10b7fb5c522efce782"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compile takes a function and returns a compiled function.  <br /></td></tr>
 <tr class="separator:a3ac798e65e59fe10b7fb5c522efce782"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -1510,6 +1520,8 @@ Functions</h2></td></tr>
 <tr class="separator:gaf8913cabeb9fb193ba687aaeb2087764"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga7ff592a64d528f0cf4f3d098465da029" id="r_ga7ff592a64d528f0cf4f3d098465da029"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__ops.html#ga7ff592a64d528f0cf4f3d098465da029">imag</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, <a class="el" href="#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
 <tr class="separator:ga7ff592a64d528f0cf4f3d098465da029"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga8ab10aa6c41416d739791164a52b25d5" id="r_ga8ab10aa6c41416d739791164a52b25d5"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, bool allow_col_major=false, <a class="el" href="#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
+<tr class="separator:ga8ab10aa6c41416d739791164a52b25d5"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ac198b7e282957c724c84a435e8f1215e" id="r_ac198b7e282957c724c84a435e8f1215e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac198b7e282957c724c84a435e8f1215e">default_stream</a> (<a class="el" href="structmlx_1_1core_1_1_device.html">Device</a> d)</td></tr>
 <tr class="memdesc:ac198b7e282957c724c84a435e8f1215e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Get the default stream for the given device.  <br /></td></tr>
 <tr class="separator:ac198b7e282957c724c84a435e8f1215e"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -1952,9 +1964,9 @@ Functions</h2></td></tr>
 <tr class="separator:ad38b38a3faf050735d45eed4438ee27a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a358e66ff205bda3e8542427b6d2edadc" id="r_a358e66ff205bda3e8542427b6d2edadc"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a358e66ff205bda3e8542427b6d2edadc">operator+</a> (const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;x, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> y)</td></tr>
 <tr class="separator:a358e66ff205bda3e8542427b6d2edadc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af56d4b85e329e39a825c01a50e3a2522" id="r_af56d4b85e329e39a825c01a50e3a2522"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af56d4b85e329e39a825c01a50e3a2522">operator+</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;y)</td></tr>
+<tr class="memitem:af56d4b85e329e39a825c01a50e3a2522" id="r_af56d4b85e329e39a825c01a50e3a2522"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af56d4b85e329e39a825c01a50e3a2522">operator+</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;y)</td></tr>
 <tr class="separator:af56d4b85e329e39a825c01a50e3a2522"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a806a495a129ebaab69cc57ca7db831d6" id="r_a806a495a129ebaab69cc57ca7db831d6"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a806a495a129ebaab69cc57ca7db831d6">operator+</a> (const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a806a495a129ebaab69cc57ca7db831d6" id="r_a806a495a129ebaab69cc57ca7db831d6"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a806a495a129ebaab69cc57ca7db831d6">operator+</a> (const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a806a495a129ebaab69cc57ca7db831d6"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a09fc6ebda917969383783a112a8547e7" id="r_a09fc6ebda917969383783a112a8547e7"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a09fc6ebda917969383783a112a8547e7">operator+</a> (float x, const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;y)</td></tr>
 <tr class="separator:a09fc6ebda917969383783a112a8547e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -2284,21 +2296,21 @@ Functions</h2></td></tr>
 <tr class="separator:ae78083d766b9cf6f87cded341bbcd63e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:acf36c10779fbf1efbe1e6a7fd41176cd" id="r_acf36c10779fbf1efbe1e6a7fd41176cd"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1___m_l_x___float16.html">_MLX_Float16</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acf36c10779fbf1efbe1e6a7fd41176cd">operator^=</a> (<a class="el" href="structmlx_1_1core_1_1___m_l_x___float16.html">_MLX_Float16</a> &amp;lhs, uint16_t rhs)</td></tr>
 <tr class="separator:acf36c10779fbf1efbe1e6a7fd41176cd"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a95fd207028f125eefbafe9e0522407fe" id="r_a95fd207028f125eefbafe9e0522407fe"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a95fd207028f125eefbafe9e0522407fe">operator+</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:a95fd207028f125eefbafe9e0522407fe" id="r_a95fd207028f125eefbafe9e0522407fe"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a95fd207028f125eefbafe9e0522407fe">operator+</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:a95fd207028f125eefbafe9e0522407fe"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:abc6425a3fbb386f5ea5964b42507e989" id="r_abc6425a3fbb386f5ea5964b42507e989"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abc6425a3fbb386f5ea5964b42507e989">operator+</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:abc6425a3fbb386f5ea5964b42507e989" id="r_abc6425a3fbb386f5ea5964b42507e989"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abc6425a3fbb386f5ea5964b42507e989">operator+</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:abc6425a3fbb386f5ea5964b42507e989"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2631e78c6f0a602f6754ac577ec75f83" id="r_a2631e78c6f0a602f6754ac577ec75f83"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2631e78c6f0a602f6754ac577ec75f83">operator-</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:a2631e78c6f0a602f6754ac577ec75f83" id="r_a2631e78c6f0a602f6754ac577ec75f83"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2631e78c6f0a602f6754ac577ec75f83">operator-</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:a2631e78c6f0a602f6754ac577ec75f83"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a73d79cbd75d543d0837b8a51bf103f9e" id="r_a73d79cbd75d543d0837b8a51bf103f9e"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a73d79cbd75d543d0837b8a51bf103f9e">operator-</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:a73d79cbd75d543d0837b8a51bf103f9e" id="r_a73d79cbd75d543d0837b8a51bf103f9e"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a73d79cbd75d543d0837b8a51bf103f9e">operator-</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:a73d79cbd75d543d0837b8a51bf103f9e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:acaaa86b59c7ceb2e092ac07f2a75225c" id="r_acaaa86b59c7ceb2e092ac07f2a75225c"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acaaa86b59c7ceb2e092ac07f2a75225c">operator*</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:acaaa86b59c7ceb2e092ac07f2a75225c" id="r_acaaa86b59c7ceb2e092ac07f2a75225c"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acaaa86b59c7ceb2e092ac07f2a75225c">operator*</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:acaaa86b59c7ceb2e092ac07f2a75225c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a067d47823a322b88043cce7ce4a3ec78" id="r_a067d47823a322b88043cce7ce4a3ec78"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a067d47823a322b88043cce7ce4a3ec78">operator*</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:a067d47823a322b88043cce7ce4a3ec78" id="r_a067d47823a322b88043cce7ce4a3ec78"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a067d47823a322b88043cce7ce4a3ec78">operator*</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:a067d47823a322b88043cce7ce4a3ec78"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a97efcd96d6be666e5608034ae77289ef" id="r_a97efcd96d6be666e5608034ae77289ef"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a97efcd96d6be666e5608034ae77289ef">operator/</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
+<tr class="memitem:a97efcd96d6be666e5608034ae77289ef" id="r_a97efcd96d6be666e5608034ae77289ef"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a97efcd96d6be666e5608034ae77289ef">operator/</a> (<a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> lhs, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> rhs)</td></tr>
 <tr class="separator:a97efcd96d6be666e5608034ae77289ef"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a899851f85dbddd96f9d36319b82542a0" id="r_a899851f85dbddd96f9d36319b82542a0"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a899851f85dbddd96f9d36319b82542a0">operator/</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
+<tr class="memitem:a899851f85dbddd96f9d36319b82542a0" id="r_a899851f85dbddd96f9d36319b82542a0"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a899851f85dbddd96f9d36319b82542a0">operator/</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> lhs, <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> rhs)</td></tr>
 <tr class="separator:a899851f85dbddd96f9d36319b82542a0"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a4734a596e57434492ddfe79f2cb9dbf9" id="r_a4734a596e57434492ddfe79f2cb9dbf9"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1_stream.html">Stream</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4734a596e57434492ddfe79f2cb9dbf9">to_stream</a> (<a class="el" href="#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s)</td></tr>
 <tr class="separator:a4734a596e57434492ddfe79f2cb9dbf9"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -2340,7 +2352,7 @@ Functions</h2></td></tr>
 <tr class="separator:a42a19c8442b173606e714364227e7d45"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a57eb97a5eba99a846ac429795e407574" id="r_a57eb97a5eba99a846ac429795e407574"><td class="memItemLeft" align="right" valign="top">std::ostream &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a57eb97a5eba99a846ac429795e407574">operator&lt;&lt;</a> (std::ostream &amp;os, const <a class="el" href="#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> &amp;v)</td></tr>
 <tr class="separator:a57eb97a5eba99a846ac429795e407574"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7db909d54cf07375e89424c32c07a29c" id="r_a7db909d54cf07375e89424c32c07a29c"><td class="memItemLeft" align="right" valign="top">std::ostream &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7db909d54cf07375e89424c32c07a29c">operator&lt;&lt;</a> (std::ostream &amp;os, const <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &amp;v)</td></tr>
+<tr class="memitem:a7db909d54cf07375e89424c32c07a29c" id="r_a7db909d54cf07375e89424c32c07a29c"><td class="memItemLeft" align="right" valign="top">std::ostream &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7db909d54cf07375e89424c32c07a29c">operator&lt;&lt;</a> (std::ostream &amp;os, const <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &amp;v)</td></tr>
 <tr class="separator:a7db909d54cf07375e89424c32c07a29c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:adacbc4526e8964b267a8ec3eb1bc1a32" id="r_adacbc4526e8964b267a8ec3eb1bc1a32"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adacbc4526e8964b267a8ec3eb1bc1a32">is_power_of_2</a> (int n)</td></tr>
 <tr class="separator:adacbc4526e8964b267a8ec3eb1bc1a32"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -3235,6 +3247,57 @@ template&lt;typename... Arrays, typename  = enable_for_arrays_t&lt;Arrays...&gt;
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a76a2e310857f60f5ea6f1388d45b964d" name="a76a2e310857f60f5ea6f1388d45b964d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a76a2e310857f60f5ea6f1388d45b964d">&#9670;&#160;</a></span>concatenate() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::concatenate </td>
+          <td>(</td>
+          <td class="paramtype">std::string &amp;</td>          <td class="paramname"><span class="paramname"><em>acc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>first</em></span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aaf51544472fa87fa974686eacdd2a4a6" name="aaf51544472fa87fa974686eacdd2a4a6"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aaf51544472fa87fa974686eacdd2a4a6">&#9670;&#160;</a></span>concatenate() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename... Args&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::concatenate </td>
+          <td>(</td>
+          <td class="paramtype">std::string &amp;</td>          <td class="paramname"><span class="paramname"><em>acc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>first</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">Args...</td>          <td class="paramname"><span class="paramname"><em>args</em></span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="a050299d0d366ca5c9d09d1004dcc3e7d" name="a050299d0d366ca5c9d09d1004dcc3e7d"></a>
@@ -4413,8 +4476,8 @@ template&lt;typename... Arrays, typename  = enable_for_arrays_t&lt;Arrays...&gt;
 
 </div>
 </div>
-<a id="a3bd386cb6db09f636963ce66ceaf8647" name="a3bd386cb6db09f636963ce66ceaf8647"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a3bd386cb6db09f636963ce66ceaf8647">&#9670;&#160;</a></span>get_reduce_init_kernel()</h2>
+<a id="ae0470605dc819efeb6510183619f0299" name="ae0470605dc819efeb6510183619f0299"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae0470605dc819efeb6510183619f0299">&#9670;&#160;</a></span>get_reduce_init_kernel()</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -4442,15 +4505,15 @@ template&lt;typename... Arrays, typename  = enable_for_arrays_t&lt;Arrays...&gt;
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out</em></span>&#160;)</td>
+          <td class="paramtype">const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out_type</em></span>&#160;)</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a7aa91fcfe8b9caa42d60a957f11bfe6b" name="a7aa91fcfe8b9caa42d60a957f11bfe6b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7aa91fcfe8b9caa42d60a957f11bfe6b">&#9670;&#160;</a></span>get_reduce_kernel()</h2>
+<a id="a1be32ba7d67137dde7ac191dfe83ff49" name="a1be32ba7d67137dde7ac191dfe83ff49"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1be32ba7d67137dde7ac191dfe83ff49">&#9670;&#160;</a></span>get_reduce_kernel()</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -4478,12 +4541,17 @@ template&lt;typename... Arrays, typename  = enable_for_arrays_t&lt;Arrays...&gt;
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
+          <td class="paramtype">const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>in_type</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
+          <td class="paramtype">const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out_type</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::string &amp;</td>          <td class="paramname"><span class="paramname"><em>idx_t</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -5751,6 +5819,68 @@ template&lt;typename StrideT &gt; </div>
 </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a830a47d8a317dffb0c88e5a7afe6aee2" name="a830a47d8a317dffb0c88e5a7afe6aee2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a830a47d8a317dffb0c88e5a7afe6aee2">&#9670;&#160;</a></span>move_or_copy() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::move_or_copy </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out</em></span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aae1e770954edf1f9a35d19e0de4d857a" name="aae1e770954edf1f9a35d19e0de4d857a"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aae1e770954edf1f9a35d19e0de4d857a">&#9670;&#160;</a></span>move_or_copy() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::move_or_copy </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="structmlx_1_1core_1_1array_1_1_flags.html">array::Flags</a></td>          <td class="paramname"><span class="paramname"><em>flags</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>data_size</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="a9a9254ce9975ec247a2718bc02d6f201" name="a9a9254ce9975ec247a2718bc02d6f201"></a>
@@ -7446,7 +7576,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="memname">float mlx::core::operator* </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -7654,7 +7784,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -8490,7 +8620,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="memname">float mlx::core::operator+ </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -8519,7 +8649,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="memname"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> mlx::core::operator+ </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -8640,7 +8770,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -9104,7 +9234,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -10135,7 +10265,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="memname">float mlx::core::operator- </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -10368,7 +10498,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -11204,7 +11334,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="memname">float mlx::core::operator/ </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>lhs</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -11412,7 +11542,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>rhs</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -12593,7 +12723,7 @@ template&lt;typename StrideT &gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">const <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>v</em></span>&#160;)</td>
+          <td class="paramtype">const <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>v</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -17161,83 +17291,6 @@ template&lt;typename T , typename U , typename OpS , typename OpC , typename Op
 
 <p>Make the stream the default for its device. </p>
 
-</div>
-</div>
-<a id="ae309cb543dfb0239cfccc53a8ad0408e" name="ae309cb543dfb0239cfccc53a8ad0408e"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ae309cb543dfb0239cfccc53a8ad0408e">&#9670;&#160;</a></span>set_vector_bytes() <span class="overload">[1/2]</span></h2>
-
-<div class="memitem">
-<div class="memproto">
-<div class="memtemplate">
-template&lt;typename T &gt; </div>
-<table class="mlabels">
-  <tr>
-  <td class="mlabels-left">
-      <table class="memname">
-        <tr>
-          <td class="memname">void mlx::core::set_vector_bytes </td>
-          <td>(</td>
-          <td class="paramtype"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>enc</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const std::vector&lt; T &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>vec</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>idx</em></span>&#160;)</td>
-        </tr>
-      </table>
-  </td>
-  <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
-  </tr>
-</table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a62340bbaa8b216539688a60adcb568bf" name="a62340bbaa8b216539688a60adcb568bf"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a62340bbaa8b216539688a60adcb568bf">&#9670;&#160;</a></span>set_vector_bytes() <span class="overload">[2/2]</span></h2>
-
-<div class="memitem">
-<div class="memproto">
-<div class="memtemplate">
-template&lt;typename T &gt; </div>
-<table class="mlabels">
-  <tr>
-  <td class="mlabels-left">
-      <table class="memname">
-        <tr>
-          <td class="memname">void mlx::core::set_vector_bytes </td>
-          <td>(</td>
-          <td class="paramtype"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>enc</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const std::vector&lt; T &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>vec</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>nelems</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>idx</em></span>&#160;)</td>
-        </tr>
-      </table>
-  </td>
-  <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
-  </tr>
-</table>
-</div><div class="memdoc">
-
 </div>
 </div>
 <a id="a44c3ea6db6553c3f6552b9ba64a69494" name="a44c3ea6db6553c3f6552b9ba64a69494"></a>
@@ -17747,7 +17800,7 @@ template&lt;typename T &gt; </div>
 </div>
 </div>
 <a id="af1fdfdaa5644394362e6baba30701bae" name="af1fdfdaa5644394362e6baba30701bae"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af1fdfdaa5644394362e6baba30701bae">&#9670;&#160;</a></span>type_to_name()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#af1fdfdaa5644394362e6baba30701bae">&#9670;&#160;</a></span>type_to_name() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -17761,6 +17814,23 @@ template&lt;typename T &gt; </div>
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="aef60e3a8d9c987c9c338b193673d2164" name="aef60e3a8d9c987c9c338b193673d2164"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aef60e3a8d9c987c9c338b193673d2164">&#9670;&#160;</a></span>type_to_name() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">std::string mlx::core::type_to_name </td>
+          <td>(</td>
+          <td class="paramtype">const <a class="el" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>t</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="aba2b4accc059f30d4dca88db9f7a6e13" name="aba2b4accc059f30d4dca88db9f7a6e13"></a>
diff --git a/docs/build/html/namespacemlx_1_1core_1_1env.html b/docs/build/html/namespacemlx_1_1core_1_1env.html
new file mode 100644
index 000000000..69709f5cd
--- /dev/null
+++ b/docs/build/html/namespacemlx_1_1core_1_1env.html
@@ -0,0 +1,185 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx::core::env Namespace Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1core.html">core</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1core_1_1env.html">env</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#func-members">Functions</a>  </div>
+  <div class="headertitle"><div class="title">mlx::core::env Namespace Reference</div></div>
+</div><!--header-->
+<div class="contents">
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
+Functions</h2></td></tr>
+<tr class="memitem:a0efecbf9efe695adafad12b5a4945df3" id="r_a0efecbf9efe695adafad12b5a4945df3"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0efecbf9efe695adafad12b5a4945df3">get_var</a> (const char *name, int default_value)</td></tr>
+<tr class="separator:a0efecbf9efe695adafad12b5a4945df3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac3266e1259a64c8b56bdc6c7029179f2" id="r_ac3266e1259a64c8b56bdc6c7029179f2"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac3266e1259a64c8b56bdc6c7029179f2">bfs_max_width</a> ()</td></tr>
+<tr class="separator:ac3266e1259a64c8b56bdc6c7029179f2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aedbf4e739553024c33dd0094dd9107aa" id="r_aedbf4e739553024c33dd0094dd9107aa"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aedbf4e739553024c33dd0094dd9107aa">max_ops_per_buffer</a> ()</td></tr>
+<tr class="separator:aedbf4e739553024c33dd0094dd9107aa"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Function Documentation</h2>
+<a id="ac3266e1259a64c8b56bdc6c7029179f2" name="ac3266e1259a64c8b56bdc6c7029179f2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ac3266e1259a64c8b56bdc6c7029179f2">&#9670;&#160;</a></span>bfs_max_width()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::core::env::bfs_max_width </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a0efecbf9efe695adafad12b5a4945df3" name="a0efecbf9efe695adafad12b5a4945df3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a0efecbf9efe695adafad12b5a4945df3">&#9670;&#160;</a></span>get_var()</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::core::env::get_var </td>
+          <td>(</td>
+          <td class="paramtype">const char *</td>          <td class="paramname"><span class="paramname"><em>name</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>default_value</em></span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aedbf4e739553024c33dd0094dd9107aa" name="aedbf4e739553024c33dd0094dd9107aa"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aedbf4e739553024c33dd0094dd9107aa">&#9670;&#160;</a></span>max_ops_per_buffer()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::core::env::max_ops_per_buffer </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/namespacemlx_1_1core_1_1fast.html b/docs/build/html/namespacemlx_1_1core_1_1fast.html
index 35fc317ea..bc6c047fa 100644
--- a/docs/build/html/namespacemlx_1_1core_1_1fast.html
+++ b/docs/build/html/namespacemlx_1_1core_1_1fast.html
@@ -139,8 +139,6 @@ Functions</h2></td></tr>
 <tr class="separator:a3663b50265b0a9c0cca2b5376852e059"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aa4b5f6886b2288cb6dfdd8598579f080" id="r_aa4b5f6886b2288cb6dfdd8598579f080"><td class="memItemLeft" align="right" valign="top">std::tuple&lt; <a class="el" href="classmlx_1_1core_1_1array.html">array</a>, <a class="el" href="classmlx_1_1core_1_1array.html">array</a>, <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa4b5f6886b2288cb6dfdd8598579f080">affine_quantize</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;w, int group_size=64, int bits=4, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
 <tr class="separator:aa4b5f6886b2288cb6dfdd8598579f080"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a638c7e9b9ea8677f01786d8f9738baf8" id="r_a638c7e9b9ea8677f01786d8f9738baf8"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a638c7e9b9ea8677f01786d8f9738baf8">affine_quantize</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;w, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;scales, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;biases, int group_size=64, int bits=4, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
-<tr class="separator:a638c7e9b9ea8677f01786d8f9738baf8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a12c7ef41409d6fb378008e67b6fab328" id="r_a12c7ef41409d6fb378008e67b6fab328"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a12c7ef41409d6fb378008e67b6fab328">affine_dequantize</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;w, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;scales, const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;biases, int group_size=64, int bits=4, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
 <tr class="separator:a12c7ef41409d6fb378008e67b6fab328"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ab16436b465dc10ce472193d541d8426e" id="r_ab16436b465dc10ce472193d541d8426e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="#a0e8c2c4ea7a946568c8fe5b4810417e0">MetalKernelFunction</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab16436b465dc10ce472193d541d8426e">metal_kernel</a> (const std::string &amp;name, const std::vector&lt; std::string &gt; &amp;input_names, const std::vector&lt; std::string &gt; &amp;output_names, const std::string &amp;source, const std::string &amp;header=&quot;&quot;, bool ensure_row_contiguous=true, bool atomic_outputs=false)</td></tr>
@@ -215,51 +213,10 @@ Functions</h2></td></tr>
       </table>
 </div><div class="memdoc">
 
-</div>
-</div>
-<a id="a638c7e9b9ea8677f01786d8f9738baf8" name="a638c7e9b9ea8677f01786d8f9738baf8"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a638c7e9b9ea8677f01786d8f9738baf8">&#9670;&#160;</a></span>affine_quantize() <span class="overload">[1/2]</span></h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname"><a class="el" href="classmlx_1_1core_1_1array.html">array</a> mlx::core::fast::affine_quantize </td>
-          <td>(</td>
-          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>w</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>scales</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>biases</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>group_size</em></span><span class="paramdefsep"> = </span><span class="paramdefval">64</span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>bits</em></span><span class="paramdefsep"> = </span><span class="paramdefval">4</span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype"><a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a></td>          <td class="paramname"><span class="paramname"><em>s</em></span><span class="paramdefsep"> = </span><span class="paramdefval">{}</span>&#160;)</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
 </div>
 </div>
 <a id="aa4b5f6886b2288cb6dfdd8598579f080" name="aa4b5f6886b2288cb6dfdd8598579f080"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa4b5f6886b2288cb6dfdd8598579f080">&#9670;&#160;</a></span>affine_quantize() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa4b5f6886b2288cb6dfdd8598579f080">&#9670;&#160;</a></span>affine_quantize()</h2>
 
 <div class="memitem">
 <div class="memproto">
diff --git a/docs/build/html/namespacemlx_1_1steel.html b/docs/build/html/namespacemlx_1_1steel.html
index 9d7a7e158..277961582 100644
--- a/docs/build/html/namespacemlx_1_1steel.html
+++ b/docs/build/html/namespacemlx_1_1steel.html
@@ -101,12 +101,16 @@ $(function(){ initResizable(false); });
 Classes</h2></td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">AccumHelper</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">BaseMMAFrag&lt; T, 8, 8 &gt;</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html">BlockSwizzle</a></td></tr>
@@ -139,6 +143,8 @@ Classes</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html">Conv2DWeightBlockLoaderSmallChannels</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html">CShape</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html">GEMMAddMMParams</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">GEMMKernel</a></td></tr>
@@ -155,10 +161,14 @@ Classes</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html">is_integral&lt; integral_constant&lt; T, v &gt; &gt;</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">Layout2D</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">Shape2D</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">TransformAdd</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">TransformAxpby</a></td></tr>
diff --git a/docs/build/html/namespaces.html b/docs/build/html/namespaces.html
index 58de4a85c..3ac6b1cfa 100644
--- a/docs/build/html/namespaces.html
+++ b/docs/build/html/namespaces.html
@@ -98,17 +98,18 @@ $(function(){ initResizable(false); });
 <tr id="row_1_0_1_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1detail.html" target="_self">detail</a></td><td class="desc"></td></tr>
 <tr id="row_1_0_2_" class="odd"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span id="arr_1_0_2_" class="arrow" onclick="dynsection.toggleFolder('1_0_2_')">&#9660;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1distributed.html" target="_self">distributed</a></td><td class="desc"></td></tr>
 <tr id="row_1_0_2_0_" class="even"><td class="entry"><span style="width:64px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1distributed_1_1detail.html" target="_self">detail</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_3_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1fast.html" target="_self">fast</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_4_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1fft.html" target="_self">fft</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_5_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1io.html" target="_self">io</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_6_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1linalg.html" target="_self">linalg</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_7_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1metal.html" target="_self">metal</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_8_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1random.html" target="_self">random</a></td><td class="desc"></td></tr>
-<tr id="row_1_0_9_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1scheduler.html" target="_self">scheduler</a></td><td class="desc"></td></tr>
-<tr id="row_1_1_" class="even"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1steel.html" target="_self">steel</a></td><td class="desc"></td></tr>
-<tr id="row_2_" class="odd"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_2_" class="arrow" onclick="dynsection.toggleFolder('2_')">&#9660;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft.html" target="_self">pocketfft</a></td><td class="desc"></td></tr>
-<tr id="row_2_0_" class="even"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_2_0_" class="arrow" onclick="dynsection.toggleFolder('2_0_')">&#9660;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft_1_1detail.html" target="_self">detail</a></td><td class="desc"></td></tr>
-<tr id="row_2_0_0_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft_1_1detail_1_1threading.html" target="_self">threading</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_3_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1env.html" target="_self">env</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_4_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1fast.html" target="_self">fast</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_5_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1fft.html" target="_self">fft</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_6_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1io.html" target="_self">io</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_7_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1linalg.html" target="_self">linalg</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_8_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1metal.html" target="_self">metal</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_9_" class="odd"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1random.html" target="_self">random</a></td><td class="desc"></td></tr>
+<tr id="row_1_0_10_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1core_1_1scheduler.html" target="_self">scheduler</a></td><td class="desc"></td></tr>
+<tr id="row_1_1_" class="odd"><td class="entry"><span style="width:32px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacemlx_1_1steel.html" target="_self">steel</a></td><td class="desc"></td></tr>
+<tr id="row_2_" class="even"><td class="entry"><span style="width:0px;display:inline-block;">&#160;</span><span id="arr_2_" class="arrow" onclick="dynsection.toggleFolder('2_')">&#9660;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft.html" target="_self">pocketfft</a></td><td class="desc"></td></tr>
+<tr id="row_2_0_" class="odd"><td class="entry"><span style="width:16px;display:inline-block;">&#160;</span><span id="arr_2_0_" class="arrow" onclick="dynsection.toggleFolder('2_0_')">&#9660;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft_1_1detail.html" target="_self">detail</a></td><td class="desc"></td></tr>
+<tr id="row_2_0_0_" class="even"><td class="entry"><span style="width:48px;display:inline-block;">&#160;</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacepocketfft_1_1detail_1_1threading.html" target="_self">threading</a></td><td class="desc"></td></tr>
 </table>
 </div><!-- directory -->
 </div><!-- contents -->
diff --git a/docs/build/html/objects.inv b/docs/build/html/objects.inv
index cdee61ccc..1b32b1bd2 100644
Binary files a/docs/build/html/objects.inv and b/docs/build/html/objects.inv differ
diff --git a/docs/build/html/ops_8h.html b/docs/build/html/ops_8h.html
index e0bf03b11..86d44a0d1 100644
--- a/docs/build/html/ops_8h.html
+++ b/docs/build/html/ops_8h.html
@@ -938,6 +938,8 @@ Functions</h2></td></tr>
 <tr class="separator:gaf8913cabeb9fb193ba687aaeb2087764"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ga7ff592a64d528f0cf4f3d098465da029" id="r_ga7ff592a64d528f0cf4f3d098465da029"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__ops.html#ga7ff592a64d528f0cf4f3d098465da029">mlx::core::imag</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
 <tr class="separator:ga7ff592a64d528f0cf4f3d098465da029"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga8ab10aa6c41416d739791164a52b25d5" id="r_ga8ab10aa6c41416d739791164a52b25d5"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classmlx_1_1core_1_1array.html">array</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">mlx::core::contiguous</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, bool allow_col_major=false, <a class="el" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s={})</td></tr>
+<tr class="separator:ga8ab10aa6c41416d739791164a52b25d5"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/ops_8h_source.html b/docs/build/html/ops_8h_source.html
index 9f21be084..31355be0c 100644
--- a/docs/build/html/ops_8h_source.html
+++ b/docs/build/html/ops_8h_source.html
@@ -294,11 +294,11 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>    <span class="keyword">const</span> std::optional&lt;array&gt;&amp; a_max = std::nullopt,</div>
 <div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
 <div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span> </div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno"><a class="line" href="group__ops.html#gabdc36fa65697d0361c8d67495de77129">  222</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="group__ops.html#gabdc36fa65697d0361c8d67495de77129">concatenate</a>(</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno"><a class="line" href="group__ops.html#gabdc36fa65697d0361c8d67495de77129">  222</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">concatenate</a>(</div>
 <div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>    <span class="keyword">const</span> std::vector&lt;array&gt;&amp; arrays,</div>
 <div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>    <span class="keywordtype">int</span> axis,</div>
 <div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno"><a class="line" href="group__ops.html#gaa95c34ca3a8877f2c50cb60e7fa312b8">  226</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="group__ops.html#gabdc36fa65697d0361c8d67495de77129">concatenate</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; arrays, <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno"><a class="line" href="group__ops.html#gaa95c34ca3a8877f2c50cb60e7fa312b8">  226</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">concatenate</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; arrays, <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
 <div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span> </div>
 <div class="line"><a id="l00229" name="l00229"></a><span class="lineno"><a class="line" href="group__ops.html#gaf8f2ec2b98a4b59eca73d7471df6e032">  229</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="group__ops.html#gaf8f2ec2b98a4b59eca73d7471df6e032">stack</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; arrays, <span class="keywordtype">int</span> axis, <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
 <div class="line"><a id="l00230" name="l00230"></a><span class="lineno"><a class="line" href="group__ops.html#ga82216209dce901296fc737fe8efa5c94">  230</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="group__ops.html#gaf8f2ec2b98a4b59eca73d7471df6e032">stack</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; arrays, <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
@@ -1364,7 +1364,13 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l01486" name="l01486"></a><span class="lineno"> 1486</span><span class="comment">/* The imaginary part of a complex array. */</span></div>
 <div class="line"><a id="l01487" name="l01487"></a><span class="lineno"><a class="line" href="group__ops.html#ga7ff592a64d528f0cf4f3d098465da029"> 1487</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="group__ops.html#ga7ff592a64d528f0cf4f3d098465da029">imag</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a, <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
 <div class="line"><a id="l01488" name="l01488"></a><span class="lineno"> 1488</span> </div>
-<div class="line"><a id="l01491" name="l01491"></a><span class="lineno"> 1491</span>} <span class="comment">// namespace mlx::core</span></div>
+<div class="line"><a id="l01489" name="l01489"></a><span class="lineno"> 1489</span><span class="comment">/* Ensure the array&#39;s underlying memory is contiguous. */</span></div>
+<div class="line"><a id="l01490" name="l01490"></a><span class="lineno"><a class="line" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5"> 1490</a></span><a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a> <a class="code hl_function" href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">contiguous</a>(</div>
+<div class="line"><a id="l01491" name="l01491"></a><span class="lineno"> 1491</span>    <span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; a,</div>
+<div class="line"><a id="l01492" name="l01492"></a><span class="lineno"> 1492</span>    <span class="keywordtype">bool</span> allow_col_major = <span class="keyword">false</span>,</div>
+<div class="line"><a id="l01493" name="l01493"></a><span class="lineno"> 1493</span>    <a class="code hl_typedef" href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">StreamOrDevice</a> s = {});</div>
+<div class="line"><a id="l01494" name="l01494"></a><span class="lineno"> 1494</span> </div>
+<div class="line"><a id="l01497" name="l01497"></a><span class="lineno"> 1497</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="adevice_8h_html"><div class="ttname"><a href="device_8h.html">device.h</a></div></div>
@@ -1474,6 +1480,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="agroup__ops_html_ga89682bf78491761e062d4ee7bef0c829"><div class="ttname"><a href="group__ops.html#ga89682bf78491761e062d4ee7bef0c829">mlx::core::left_shift</a></div><div class="ttdeci">array left_shift(const array &amp;a, const array &amp;b, StreamOrDevice s={})</div><div class="ttdoc">Shift bits to the left.</div></div>
 <div class="ttc" id="agroup__ops_html_ga8a2056f8c9bb30914c40bcf509386491"><div class="ttname"><a href="group__ops.html#ga8a2056f8c9bb30914c40bcf509386491">mlx::core::where</a></div><div class="ttdeci">array where(const array &amp;condition, const array &amp;x, const array &amp;y, StreamOrDevice s={})</div><div class="ttdoc">Select from x or y depending on condition.</div></div>
 <div class="ttc" id="agroup__ops_html_ga8a3b04e23e347d99ecf411fd6f4e5125"><div class="ttname"><a href="group__ops.html#ga8a3b04e23e347d99ecf411fd6f4e5125">mlx::core::exp</a></div><div class="ttdeci">array exp(const array &amp;a, StreamOrDevice s={})</div><div class="ttdoc">Exponential of the elements of an array.</div></div>
+<div class="ttc" id="agroup__ops_html_ga8ab10aa6c41416d739791164a52b25d5"><div class="ttname"><a href="group__ops.html#ga8ab10aa6c41416d739791164a52b25d5">mlx::core::contiguous</a></div><div class="ttdeci">array contiguous(const array &amp;a, bool allow_col_major=false, StreamOrDevice s={})</div></div>
 <div class="ttc" id="agroup__ops_html_ga8af4f22c08c11c4ffab7e3d45e0f3cd6"><div class="ttname"><a href="group__ops.html#ga8af4f22c08c11c4ffab7e3d45e0f3cd6">mlx::core::bitwise_or</a></div><div class="ttdeci">array bitwise_or(const array &amp;a, const array &amp;b, StreamOrDevice s={})</div><div class="ttdoc">Bitwise inclusive or.</div></div>
 <div class="ttc" id="agroup__ops_html_ga8d50480266d258cac40ff51bcb0fc6a7"><div class="ttname"><a href="group__ops.html#ga8d50480266d258cac40ff51bcb0fc6a7">mlx::core::gather_mm</a></div><div class="ttdeci">array gather_mm(array a, array b, std::optional&lt; array &gt; lhs_indices=std::nullopt, std::optional&lt; array &gt; rhs_indices=std::nullopt, StreamOrDevice s={})</div><div class="ttdoc">Compute matrix product with matrix-level gather.</div></div>
 <div class="ttc" id="agroup__ops_html_ga8d656904aa2690b60955ae745aecfc30"><div class="ttname"><a href="group__ops.html#ga8d656904aa2690b60955ae745aecfc30">mlx::core::floor</a></div><div class="ttdeci">array floor(const array &amp;a, StreamOrDevice s={})</div><div class="ttdoc">Floor the element of an array.</div></div>
@@ -1506,7 +1513,6 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="agroup__ops_html_gaba4d25e7a2bf87ba4feb7837ec7fa396"><div class="ttname"><a href="group__ops.html#gaba4d25e7a2bf87ba4feb7837ec7fa396">mlx::core::atleast_1d</a></div><div class="ttdeci">array atleast_1d(const array &amp;a, StreamOrDevice s={})</div><div class="ttdoc">convert an array to an atleast ndim array</div></div>
 <div class="ttc" id="agroup__ops_html_gabc46eed81ab6c6247903e4ec0c4ec1fb"><div class="ttname"><a href="group__ops.html#gabc46eed81ab6c6247903e4ec0c4ec1fb">mlx::core::swapaxes</a></div><div class="ttdeci">array swapaxes(const array &amp;a, int axis1, int axis2, StreamOrDevice s={})</div><div class="ttdoc">Swap two axes of an array.</div></div>
 <div class="ttc" id="agroup__ops_html_gabca78d34ce93f0de2814e62225bb2a53"><div class="ttname"><a href="group__ops.html#gabca78d34ce93f0de2814e62225bb2a53">mlx::core::logical_not</a></div><div class="ttdeci">array logical_not(const array &amp;a, StreamOrDevice s={})</div><div class="ttdoc">Logical not of an array.</div></div>
-<div class="ttc" id="agroup__ops_html_gabdc36fa65697d0361c8d67495de77129"><div class="ttname"><a href="group__ops.html#gabdc36fa65697d0361c8d67495de77129">mlx::core::concatenate</a></div><div class="ttdeci">array concatenate(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})</div><div class="ttdoc">Concatenate arrays along a given axis.</div></div>
 <div class="ttc" id="agroup__ops_html_gabf786129c7660ed8d5acb5499bc6fefd"><div class="ttname"><a href="group__ops.html#gabf786129c7660ed8d5acb5499bc6fefd">mlx::core::trace</a></div><div class="ttdeci">array trace(const array &amp;a, int offset, int axis1, int axis2, Dtype dtype, StreamOrDevice s={})</div><div class="ttdoc">Return the sum along a specified diagonal in the given array.</div></div>
 <div class="ttc" id="agroup__ops_html_gabfa4208fb1f9b1cdd0abc563b19175af"><div class="ttname"><a href="group__ops.html#gabfa4208fb1f9b1cdd0abc563b19175af">mlx::core::quantized_matmul</a></div><div class="ttdeci">array quantized_matmul(array x, array w, array scales, array biases, bool transpose=true, int group_size=64, int bits=4, StreamOrDevice s={})</div><div class="ttdoc">Quantized matmul multiplies x with a quantized matrix w.</div></div>
 <div class="ttc" id="agroup__ops_html_gabff758a5c1ce32ad7e8b78aba0164077"><div class="ttname"><a href="group__ops.html#gabff758a5c1ce32ad7e8b78aba0164077">mlx::core::dequantize</a></div><div class="ttdeci">array dequantize(const array &amp;w, const array &amp;scales, const array &amp;biases, int group_size=64, int bits=4, StreamOrDevice s={})</div><div class="ttdoc">Dequantize a matrix produced by quantize()</div></div>
@@ -1555,6 +1561,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="anamespacemlx_1_1core_html_a479648542a2bea151b947b18f0e79dd2"><div class="ttname"><a href="namespacemlx_1_1core.html#a479648542a2bea151b947b18f0e79dd2">mlx::core::copy</a></div><div class="ttdeci">void copy(const array &amp;src, array &amp;dst, CopyType ctype)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a5d6373aad1444edc9de1eb07bfe5cad3"><div class="ttname"><a href="namespacemlx_1_1core.html#a5d6373aad1444edc9de1eb07bfe5cad3">mlx::core::int32</a></div><div class="ttdeci">constexpr Dtype int32</div><div class="ttdef"><b>Definition</b> dtype.h:76</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a6894543b340321193dfb8052c438a319"><div class="ttname"><a href="namespacemlx_1_1core.html#a6894543b340321193dfb8052c438a319">mlx::core::float32</a></div><div class="ttdeci">constexpr Dtype float32</div><div class="ttdef"><b>Definition</b> dtype.h:80</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_html_a76a2e310857f60f5ea6f1388d45b964d"><div class="ttname"><a href="namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d">mlx::core::concatenate</a></div><div class="ttdeci">void concatenate(std::string &amp;acc, T first)</div><div class="ttdef"><b>Definition</b> utils.h:66</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a937503d72b66c661bf3f5fdcd98ef97c"><div class="ttname"><a href="namespacemlx_1_1core.html#a937503d72b66c661bf3f5fdcd98ef97c">mlx::core::operator==</a></div><div class="ttdeci">bool operator==(const Device &amp;lhs, const Device &amp;rhs)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a94d00a1b7f8a4717ab3f26f45e4da655"><div class="ttname"><a href="namespacemlx_1_1core.html#a94d00a1b7f8a4717ab3f26f45e4da655">mlx::core::operator!=</a></div><div class="ttdeci">bool operator!=(const Device &amp;lhs, const Device &amp;rhs)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a95fc1013cc48fbfee0c54310711a5e58"><div class="ttname"><a href="namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58">mlx::core::StreamOrDevice</a></div><div class="ttdeci">std::variant&lt; std::monostate, Stream, Device &gt; StreamOrDevice</div><div class="ttdef"><b>Definition</b> utils.h:14</div></div>
diff --git a/docs/build/html/primitives_8h.html b/docs/build/html/primitives_8h.html
index 9172341a3..f226fe2d4 100644
--- a/docs/build/html/primitives_8h.html
+++ b/docs/build/html/primitives_8h.html
@@ -157,6 +157,8 @@ Classes</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_conjugate.html">mlx::core::Conjugate</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_convolution.html">mlx::core::Convolution</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">class &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="classmlx_1_1core_1_1_copy.html">mlx::core::Copy</a></td></tr>
diff --git a/docs/build/html/primitives_8h_source.html b/docs/build/html/primitives_8h_source.html
index f096756f8..1bc493fdb 100644
--- a/docs/build/html/primitives_8h_source.html
+++ b/docs/build/html/primitives_8h_source.html
@@ -803,1829 +803,1852 @@ $(function(){ initResizable(false); });
 </div>
 <div class="line"><a id="l00641" name="l00641"></a><span class="lineno">  641</span> </div>
 <div class="foldopen" id="foldopen00642" data-start="{" data-end="};">
-<div class="line"><a id="l00642" name="l00642"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html">  642</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_convolution.html">Convolution</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00642" name="l00642"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html">  642</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_contiguous.html">Contiguous</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
 <div class="line"><a id="l00643" name="l00643"></a><span class="lineno">  643</span> <span class="keyword">public</span>:</div>
 <div class="foldopen" id="foldopen00644" data-start="{" data-end="}">
-<div class="line"><a id="l00644" name="l00644"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef">  644</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef">Convolution</a>(</div>
-<div class="line"><a id="l00645" name="l00645"></a><span class="lineno">  645</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l00646" name="l00646"></a><span class="lineno">  646</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; kernel_strides,</div>
-<div class="line"><a id="l00647" name="l00647"></a><span class="lineno">  647</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; padding,</div>
-<div class="line"><a id="l00648" name="l00648"></a><span class="lineno">  648</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; kernel_dilation,</div>
-<div class="line"><a id="l00649" name="l00649"></a><span class="lineno">  649</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; input_dilation,</div>
-<div class="line"><a id="l00650" name="l00650"></a><span class="lineno">  650</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> groups = 1,</div>
-<div class="line"><a id="l00651" name="l00651"></a><span class="lineno">  651</span>      <span class="keyword">const</span> <span class="keywordtype">bool</span> flip = <span class="keyword">false</span>)</div>
-<div class="line"><a id="l00652" name="l00652"></a><span class="lineno">  652</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l00653" name="l00653"></a><span class="lineno">  653</span>        padding_(padding),</div>
-<div class="line"><a id="l00654" name="l00654"></a><span class="lineno">  654</span>        kernel_strides_(kernel_strides),</div>
-<div class="line"><a id="l00655" name="l00655"></a><span class="lineno">  655</span>        kernel_dilation_(kernel_dilation),</div>
-<div class="line"><a id="l00656" name="l00656"></a><span class="lineno">  656</span>        input_dilation_(input_dilation),</div>
-<div class="line"><a id="l00657" name="l00657"></a><span class="lineno">  657</span>        groups_(groups),</div>
-<div class="line"><a id="l00658" name="l00658"></a><span class="lineno">  658</span>        flip_(flip) {}</div>
+<div class="line"><a id="l00644" name="l00644"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0">  644</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0">Contiguous</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> allow_col_major)</div>
+<div class="line"><a id="l00645" name="l00645"></a><span class="lineno">  645</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), allow_col_major_(allow_col_major) {}</div>
 </div>
-<div class="line"><a id="l00659" name="l00659"></a><span class="lineno">  659</span> </div>
-<div class="line"><a id="l00660" name="l00660"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">  660</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00661" name="l00661"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">  661</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00662" name="l00662"></a><span class="lineno">  662</span> </div>
-<div class="line"><a id="l00663" name="l00663"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">  663</a></span>  std::vector&lt;array&gt; <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">vjp</a>(</div>
-<div class="line"><a id="l00664" name="l00664"></a><span class="lineno">  664</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; primals,</div>
-<div class="line"><a id="l00665" name="l00665"></a><span class="lineno">  665</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; cotangents,</div>
-<div class="line"><a id="l00666" name="l00666"></a><span class="lineno">  666</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; argnums,</div>
-<div class="line"><a id="l00667" name="l00667"></a><span class="lineno">  667</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; outputs) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00668" name="l00668"></a><span class="lineno">  668</span> </div>
-<div class="line"><a id="l00669" name="l00669"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">  669</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_convolution.html">Convolution</a>)</div>
-<div class="line"><a id="l00670" name="l00670"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">  670</a></span>  bool is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l00671" name="l00671"></a><span class="lineno">  671</span> </div>
-<div class="line"><a id="l00672" name="l00672"></a><span class="lineno">  672</span> private:</div>
-<div class="line"><a id="l00673" name="l00673"></a><span class="lineno">  673</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; padding_;</div>
-<div class="line"><a id="l00674" name="l00674"></a><span class="lineno">  674</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; kernel_strides_;</div>
-<div class="line"><a id="l00675" name="l00675"></a><span class="lineno">  675</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; kernel_dilation_;</div>
-<div class="line"><a id="l00676" name="l00676"></a><span class="lineno">  676</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; input_dilation_;</div>
-<div class="line"><a id="l00677" name="l00677"></a><span class="lineno">  677</span>  <span class="keywordtype">int</span> groups_;</div>
-<div class="line"><a id="l00678" name="l00678"></a><span class="lineno">  678</span>  <span class="keywordtype">bool</span> flip_;</div>
-<div class="line"><a id="l00679" name="l00679"></a><span class="lineno">  679</span> </div>
-<div class="line"><a id="l00680" name="l00680"></a><span class="lineno">  680</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00681" name="l00681"></a><span class="lineno">  681</span>};</div>
+<div class="line"><a id="l00646" name="l00646"></a><span class="lineno">  646</span> </div>
+<div class="line"><a id="l00647" name="l00647"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336">  647</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00648" name="l00648"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f">  648</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00649" name="l00649"></a><span class="lineno">  649</span> </div>
+<div class="line"><a id="l00650" name="l00650"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec">  650</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00651" name="l00651"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991">  651</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00652" name="l00652"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23">  652</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_contiguous.html">Contiguous</a>)</div>
+<div class="line"><a id="l00653" name="l00653"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c">  653</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00654" name="l00654"></a><span class="lineno">  654</span> </div>
+<div class="line"><a id="l00655" name="l00655"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372">  655</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l00656" name="l00656"></a><span class="lineno">  656</span> </div>
+<div class="line"><a id="l00657" name="l00657"></a><span class="lineno">  657</span> private:</div>
+<div class="line"><a id="l00658" name="l00658"></a><span class="lineno">  658</span>  <span class="keywordtype">bool</span> allow_col_major_;</div>
+<div class="line"><a id="l00659" name="l00659"></a><span class="lineno">  659</span>};</div>
 </div>
-<div class="line"><a id="l00682" name="l00682"></a><span class="lineno">  682</span> </div>
-<div class="foldopen" id="foldopen00683" data-start="{" data-end="};">
-<div class="line"><a id="l00683" name="l00683"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html">  683</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_copy.html">Copy</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00684" name="l00684"></a><span class="lineno">  684</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00685" name="l00685"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584">  685</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584">Copy</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00686" name="l00686"></a><span class="lineno">  686</span> </div>
-<div class="line"><a id="l00687" name="l00687"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">  687</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00688" name="l00688"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">  688</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00689" name="l00689"></a><span class="lineno">  689</span> </div>
-<div class="line"><a id="l00690" name="l00690"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">  690</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00691" name="l00691"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">  691</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00692" name="l00692"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">  692</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_copy.html">Copy</a>)</div>
-<div class="line"><a id="l00693" name="l00693"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">  693</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00694" name="l00694"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">  694</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00695" name="l00695"></a><span class="lineno">  695</span> </div>
-<div class="line"><a id="l00696" name="l00696"></a><span class="lineno">  696</span> private:</div>
-<div class="line"><a id="l00697" name="l00697"></a><span class="lineno">  697</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00698" name="l00698"></a><span class="lineno">  698</span>};</div>
+<div class="line"><a id="l00660" name="l00660"></a><span class="lineno">  660</span> </div>
+<div class="foldopen" id="foldopen00661" data-start="{" data-end="};">
+<div class="line"><a id="l00661" name="l00661"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html">  661</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_convolution.html">Convolution</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00662" name="l00662"></a><span class="lineno">  662</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen00663" data-start="{" data-end="}">
+<div class="line"><a id="l00663" name="l00663"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef">  663</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef">Convolution</a>(</div>
+<div class="line"><a id="l00664" name="l00664"></a><span class="lineno">  664</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l00665" name="l00665"></a><span class="lineno">  665</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; kernel_strides,</div>
+<div class="line"><a id="l00666" name="l00666"></a><span class="lineno">  666</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; padding,</div>
+<div class="line"><a id="l00667" name="l00667"></a><span class="lineno">  667</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; kernel_dilation,</div>
+<div class="line"><a id="l00668" name="l00668"></a><span class="lineno">  668</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; input_dilation,</div>
+<div class="line"><a id="l00669" name="l00669"></a><span class="lineno">  669</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> groups = 1,</div>
+<div class="line"><a id="l00670" name="l00670"></a><span class="lineno">  670</span>      <span class="keyword">const</span> <span class="keywordtype">bool</span> flip = <span class="keyword">false</span>)</div>
+<div class="line"><a id="l00671" name="l00671"></a><span class="lineno">  671</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l00672" name="l00672"></a><span class="lineno">  672</span>        padding_(padding),</div>
+<div class="line"><a id="l00673" name="l00673"></a><span class="lineno">  673</span>        kernel_strides_(kernel_strides),</div>
+<div class="line"><a id="l00674" name="l00674"></a><span class="lineno">  674</span>        kernel_dilation_(kernel_dilation),</div>
+<div class="line"><a id="l00675" name="l00675"></a><span class="lineno">  675</span>        input_dilation_(input_dilation),</div>
+<div class="line"><a id="l00676" name="l00676"></a><span class="lineno">  676</span>        groups_(groups),</div>
+<div class="line"><a id="l00677" name="l00677"></a><span class="lineno">  677</span>        flip_(flip) {}</div>
 </div>
-<div class="line"><a id="l00699" name="l00699"></a><span class="lineno">  699</span> </div>
-<div class="foldopen" id="foldopen00700" data-start="{" data-end="};">
-<div class="line"><a id="l00700" name="l00700"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html">  700</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_cos.html">Cos</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00701" name="l00701"></a><span class="lineno">  701</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00702" name="l00702"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995">  702</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995">Cos</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00703" name="l00703"></a><span class="lineno">  703</span> </div>
-<div class="line"><a id="l00704" name="l00704"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">  704</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00705" name="l00705"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">  705</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00706" name="l00706"></a><span class="lineno">  706</span> </div>
-<div class="line"><a id="l00707" name="l00707"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">  707</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00708" name="l00708"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">  708</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00709" name="l00709"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">  709</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_cos.html">Cos</a>)</div>
-<div class="line"><a id="l00710" name="l00710"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">  710</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00711" name="l00711"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">  711</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00712" name="l00712"></a><span class="lineno">  712</span> </div>
-<div class="line"><a id="l00713" name="l00713"></a><span class="lineno">  713</span> private:</div>
-<div class="line"><a id="l00714" name="l00714"></a><span class="lineno">  714</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00715" name="l00715"></a><span class="lineno">  715</span>};</div>
+<div class="line"><a id="l00678" name="l00678"></a><span class="lineno">  678</span> </div>
+<div class="line"><a id="l00679" name="l00679"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">  679</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00680" name="l00680"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">  680</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00681" name="l00681"></a><span class="lineno">  681</span> </div>
+<div class="line"><a id="l00682" name="l00682"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">  682</a></span>  std::vector&lt;array&gt; <a class="code hl_function" href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">vjp</a>(</div>
+<div class="line"><a id="l00683" name="l00683"></a><span class="lineno">  683</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; primals,</div>
+<div class="line"><a id="l00684" name="l00684"></a><span class="lineno">  684</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; cotangents,</div>
+<div class="line"><a id="l00685" name="l00685"></a><span class="lineno">  685</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; argnums,</div>
+<div class="line"><a id="l00686" name="l00686"></a><span class="lineno">  686</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; outputs) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00687" name="l00687"></a><span class="lineno">  687</span> </div>
+<div class="line"><a id="l00688" name="l00688"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd">  688</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_convolution.html">Convolution</a>)</div>
+<div class="line"><a id="l00689" name="l00689"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de">  689</a></span>  bool is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l00690" name="l00690"></a><span class="lineno">  690</span> </div>
+<div class="line"><a id="l00691" name="l00691"></a><span class="lineno">  691</span> private:</div>
+<div class="line"><a id="l00692" name="l00692"></a><span class="lineno">  692</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; padding_;</div>
+<div class="line"><a id="l00693" name="l00693"></a><span class="lineno">  693</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; kernel_strides_;</div>
+<div class="line"><a id="l00694" name="l00694"></a><span class="lineno">  694</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; kernel_dilation_;</div>
+<div class="line"><a id="l00695" name="l00695"></a><span class="lineno">  695</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; input_dilation_;</div>
+<div class="line"><a id="l00696" name="l00696"></a><span class="lineno">  696</span>  <span class="keywordtype">int</span> groups_;</div>
+<div class="line"><a id="l00697" name="l00697"></a><span class="lineno">  697</span>  <span class="keywordtype">bool</span> flip_;</div>
+<div class="line"><a id="l00698" name="l00698"></a><span class="lineno">  698</span> </div>
+<div class="line"><a id="l00699" name="l00699"></a><span class="lineno">  699</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00700" name="l00700"></a><span class="lineno">  700</span>};</div>
 </div>
-<div class="line"><a id="l00716" name="l00716"></a><span class="lineno">  716</span> </div>
-<div class="foldopen" id="foldopen00717" data-start="{" data-end="};">
-<div class="line"><a id="l00717" name="l00717"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html">  717</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_cosh.html">Cosh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00718" name="l00718"></a><span class="lineno">  718</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00719" name="l00719"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1">  719</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1">Cosh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00720" name="l00720"></a><span class="lineno">  720</span> </div>
-<div class="line"><a id="l00721" name="l00721"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">  721</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00722" name="l00722"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">  722</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00723" name="l00723"></a><span class="lineno">  723</span> </div>
-<div class="line"><a id="l00724" name="l00724"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">  724</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00725" name="l00725"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">  725</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00726" name="l00726"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">  726</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_cosh.html">Cosh</a>)</div>
-<div class="line"><a id="l00727" name="l00727"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">  727</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00728" name="l00728"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">  728</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00729" name="l00729"></a><span class="lineno">  729</span> </div>
-<div class="line"><a id="l00730" name="l00730"></a><span class="lineno">  730</span> private:</div>
-<div class="line"><a id="l00731" name="l00731"></a><span class="lineno">  731</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00732" name="l00732"></a><span class="lineno">  732</span>};</div>
+<div class="line"><a id="l00701" name="l00701"></a><span class="lineno">  701</span> </div>
+<div class="foldopen" id="foldopen00702" data-start="{" data-end="};">
+<div class="line"><a id="l00702" name="l00702"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html">  702</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_copy.html">Copy</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00703" name="l00703"></a><span class="lineno">  703</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00704" name="l00704"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584">  704</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584">Copy</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00705" name="l00705"></a><span class="lineno">  705</span> </div>
+<div class="line"><a id="l00706" name="l00706"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">  706</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00707" name="l00707"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">  707</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00708" name="l00708"></a><span class="lineno">  708</span> </div>
+<div class="line"><a id="l00709" name="l00709"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61">  709</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00710" name="l00710"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc">  710</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00711" name="l00711"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008">  711</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_copy.html">Copy</a>)</div>
+<div class="line"><a id="l00712" name="l00712"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da">  712</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00713" name="l00713"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3">  713</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00714" name="l00714"></a><span class="lineno">  714</span> </div>
+<div class="line"><a id="l00715" name="l00715"></a><span class="lineno">  715</span> private:</div>
+<div class="line"><a id="l00716" name="l00716"></a><span class="lineno">  716</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00717" name="l00717"></a><span class="lineno">  717</span>};</div>
 </div>
-<div class="line"><a id="l00733" name="l00733"></a><span class="lineno">  733</span> </div>
-<div class="foldopen" id="foldopen00734" data-start="{" data-end="};">
-<div class="line"><a id="l00734" name="l00734"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html">  734</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_custom_transforms.html">CustomTransforms</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l00735" name="l00735"></a><span class="lineno">  735</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen00736" data-start="{" data-end="}">
-<div class="line"><a id="l00736" name="l00736"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488">  736</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488">CustomTransforms</a>(</div>
-<div class="line"><a id="l00737" name="l00737"></a><span class="lineno">  737</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l00738" name="l00738"></a><span class="lineno">  738</span>      <span class="keywordtype">int</span> num_outputs,</div>
-<div class="line"><a id="l00739" name="l00739"></a><span class="lineno">  739</span>      std::function&lt;std::vector&lt;array&gt;(</div>
-<div class="line"><a id="l00740" name="l00740"></a><span class="lineno">  740</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00741" name="l00741"></a><span class="lineno">  741</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00742" name="l00742"></a><span class="lineno">  742</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;)&gt; vjp,</div>
-<div class="line"><a id="l00743" name="l00743"></a><span class="lineno">  743</span>      std::function&lt;std::vector&lt;array&gt;(</div>
-<div class="line"><a id="l00744" name="l00744"></a><span class="lineno">  744</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00745" name="l00745"></a><span class="lineno">  745</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00746" name="l00746"></a><span class="lineno">  746</span>          <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt; jvp,</div>
-<div class="line"><a id="l00747" name="l00747"></a><span class="lineno">  747</span>      std::function&lt;std::pair&lt;std::vector&lt;array&gt;, std::vector&lt;int&gt;&gt;(</div>
-<div class="line"><a id="l00748" name="l00748"></a><span class="lineno">  748</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00749" name="l00749"></a><span class="lineno">  749</span>          <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt; vmap)</div>
-<div class="line"><a id="l00750" name="l00750"></a><span class="lineno">  750</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream),</div>
-<div class="line"><a id="l00751" name="l00751"></a><span class="lineno">  751</span>        num_outputs_(num_outputs),</div>
-<div class="line"><a id="l00752" name="l00752"></a><span class="lineno">  752</span>        vjp_fun_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(<a class="code hl_function" href="namespacemlx_1_1core.html#a1b33e2c2e3471420490cf0be2de6de18">vjp</a>)),</div>
-<div class="line"><a id="l00753" name="l00753"></a><span class="lineno">  753</span>        jvp_fun_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(<a class="code hl_function" href="namespacemlx_1_1core.html#a179a632200366c223d6ab56d3e032592">jvp</a>)),</div>
-<div class="line"><a id="l00754" name="l00754"></a><span class="lineno">  754</span>        vmap_fun_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(<a class="code hl_function" href="namespacemlx_1_1core.html#ac3caec2fa65375ed4c3bf1206177b84c">vmap</a>)) {}</div>
+<div class="line"><a id="l00718" name="l00718"></a><span class="lineno">  718</span> </div>
+<div class="foldopen" id="foldopen00719" data-start="{" data-end="};">
+<div class="line"><a id="l00719" name="l00719"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html">  719</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_cos.html">Cos</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00720" name="l00720"></a><span class="lineno">  720</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00721" name="l00721"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995">  721</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995">Cos</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00722" name="l00722"></a><span class="lineno">  722</span> </div>
+<div class="line"><a id="l00723" name="l00723"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">  723</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00724" name="l00724"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">  724</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00725" name="l00725"></a><span class="lineno">  725</span> </div>
+<div class="line"><a id="l00726" name="l00726"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6">  726</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00727" name="l00727"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1">  727</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00728" name="l00728"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696">  728</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_cos.html">Cos</a>)</div>
+<div class="line"><a id="l00729" name="l00729"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417">  729</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00730" name="l00730"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b">  730</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00731" name="l00731"></a><span class="lineno">  731</span> </div>
+<div class="line"><a id="l00732" name="l00732"></a><span class="lineno">  732</span> private:</div>
+<div class="line"><a id="l00733" name="l00733"></a><span class="lineno">  733</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00734" name="l00734"></a><span class="lineno">  734</span>};</div>
 </div>
-<div class="line"><a id="l00755" name="l00755"></a><span class="lineno">  755</span> </div>
-<div class="line"><a id="l00756" name="l00756"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">  756</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00757" name="l00757"></a><span class="lineno">  757</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00758" name="l00758"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">  758</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00759" name="l00759"></a><span class="lineno">  759</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00760" name="l00760"></a><span class="lineno">  760</span> </div>
-<div class="line"><a id="l00761" name="l00761"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">  761</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>();</div>
-<div class="line"><a id="l00762" name="l00762"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">  762</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>();</div>
-<div class="line"><a id="l00763" name="l00763"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">  763</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_custom_transforms.html">CustomTransforms</a>);</div>
-<div class="line"><a id="l00764" name="l00764"></a><span class="lineno">  764</span> </div>
-<div class="line"><a id="l00765" name="l00765"></a><span class="lineno">  765</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00766" name="l00766"></a><span class="lineno">  766</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
-<div class="line"><a id="l00767" name="l00767"></a><span class="lineno">  767</span> </div>
-<div class="line"><a id="l00768" name="l00768"></a><span class="lineno">  768</span>  <span class="keywordtype">int</span> num_outputs_;</div>
-<div class="line"><a id="l00769" name="l00769"></a><span class="lineno">  769</span> </div>
-<div class="line"><a id="l00770" name="l00770"></a><span class="lineno">  770</span>  std::function&lt;std::vector&lt;array&gt;(</div>
-<div class="line"><a id="l00771" name="l00771"></a><span class="lineno">  771</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00772" name="l00772"></a><span class="lineno">  772</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00773" name="l00773"></a><span class="lineno">  773</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;)&gt;</div>
-<div class="line"><a id="l00774" name="l00774"></a><span class="lineno">  774</span>      vjp_fun_;</div>
-<div class="line"><a id="l00775" name="l00775"></a><span class="lineno">  775</span>  std::function&lt;std::vector&lt;array&gt;(</div>
-<div class="line"><a id="l00776" name="l00776"></a><span class="lineno">  776</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00777" name="l00777"></a><span class="lineno">  777</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00778" name="l00778"></a><span class="lineno">  778</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt;</div>
-<div class="line"><a id="l00779" name="l00779"></a><span class="lineno">  779</span>      jvp_fun_;</div>
-<div class="line"><a id="l00780" name="l00780"></a><span class="lineno">  780</span>  std::function&lt;std::pair&lt;std::vector&lt;array&gt;, std::vector&lt;int&gt;&gt;(</div>
-<div class="line"><a id="l00781" name="l00781"></a><span class="lineno">  781</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
-<div class="line"><a id="l00782" name="l00782"></a><span class="lineno">  782</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt;</div>
-<div class="line"><a id="l00783" name="l00783"></a><span class="lineno">  783</span>      vmap_fun_;</div>
-<div class="line"><a id="l00784" name="l00784"></a><span class="lineno">  784</span>};</div>
+<div class="line"><a id="l00735" name="l00735"></a><span class="lineno">  735</span> </div>
+<div class="foldopen" id="foldopen00736" data-start="{" data-end="};">
+<div class="line"><a id="l00736" name="l00736"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html">  736</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_cosh.html">Cosh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00737" name="l00737"></a><span class="lineno">  737</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00738" name="l00738"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1">  738</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1">Cosh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00739" name="l00739"></a><span class="lineno">  739</span> </div>
+<div class="line"><a id="l00740" name="l00740"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">  740</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00741" name="l00741"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">  741</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00742" name="l00742"></a><span class="lineno">  742</span> </div>
+<div class="line"><a id="l00743" name="l00743"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406">  743</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00744" name="l00744"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863">  744</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00745" name="l00745"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2">  745</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_cosh.html">Cosh</a>)</div>
+<div class="line"><a id="l00746" name="l00746"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9">  746</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00747" name="l00747"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962">  747</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00748" name="l00748"></a><span class="lineno">  748</span> </div>
+<div class="line"><a id="l00749" name="l00749"></a><span class="lineno">  749</span> private:</div>
+<div class="line"><a id="l00750" name="l00750"></a><span class="lineno">  750</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00751" name="l00751"></a><span class="lineno">  751</span>};</div>
 </div>
-<div class="line"><a id="l00785" name="l00785"></a><span class="lineno">  785</span> </div>
-<div class="foldopen" id="foldopen00786" data-start="{" data-end="};">
-<div class="line"><a id="l00786" name="l00786"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html">  786</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_depends.html">Depends</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l00787" name="l00787"></a><span class="lineno">  787</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00788" name="l00788"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62">  788</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62">Depends</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
-<div class="line"><a id="l00789" name="l00789"></a><span class="lineno">  789</span> </div>
-<div class="line"><a id="l00790" name="l00790"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">  790</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00791" name="l00791"></a><span class="lineno">  791</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00792" name="l00792"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">  792</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00793" name="l00793"></a><span class="lineno">  793</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00794" name="l00794"></a><span class="lineno">  794</span> </div>
-<div class="line"><a id="l00795" name="l00795"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">  795</a></span>  std::vector&lt;array&gt; <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">vjp</a>(</div>
-<div class="line"><a id="l00796" name="l00796"></a><span class="lineno">  796</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; primals,</div>
-<div class="line"><a id="l00797" name="l00797"></a><span class="lineno">  797</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; cotan,</div>
-<div class="line"><a id="l00798" name="l00798"></a><span class="lineno">  798</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; argnums,</div>
-<div class="line"><a id="l00799" name="l00799"></a><span class="lineno">  799</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; outputs) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00800" name="l00800"></a><span class="lineno">  800</span> </div>
-<div class="line"><a id="l00801" name="l00801"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">  801</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_depends.html">Depends</a>);</div>
-<div class="line"><a id="l00802" name="l00802"></a><span class="lineno">  802</span> </div>
-<div class="line"><a id="l00803" name="l00803"></a><span class="lineno">  803</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00804" name="l00804"></a><span class="lineno">  804</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
-<div class="line"><a id="l00805" name="l00805"></a><span class="lineno">  805</span>};</div>
+<div class="line"><a id="l00752" name="l00752"></a><span class="lineno">  752</span> </div>
+<div class="foldopen" id="foldopen00753" data-start="{" data-end="};">
+<div class="line"><a id="l00753" name="l00753"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html">  753</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_custom_transforms.html">CustomTransforms</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l00754" name="l00754"></a><span class="lineno">  754</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen00755" data-start="{" data-end="}">
+<div class="line"><a id="l00755" name="l00755"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488">  755</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488">CustomTransforms</a>(</div>
+<div class="line"><a id="l00756" name="l00756"></a><span class="lineno">  756</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l00757" name="l00757"></a><span class="lineno">  757</span>      <span class="keywordtype">int</span> num_outputs,</div>
+<div class="line"><a id="l00758" name="l00758"></a><span class="lineno">  758</span>      std::function&lt;std::vector&lt;array&gt;(</div>
+<div class="line"><a id="l00759" name="l00759"></a><span class="lineno">  759</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00760" name="l00760"></a><span class="lineno">  760</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00761" name="l00761"></a><span class="lineno">  761</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;)&gt; vjp,</div>
+<div class="line"><a id="l00762" name="l00762"></a><span class="lineno">  762</span>      std::function&lt;std::vector&lt;array&gt;(</div>
+<div class="line"><a id="l00763" name="l00763"></a><span class="lineno">  763</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00764" name="l00764"></a><span class="lineno">  764</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00765" name="l00765"></a><span class="lineno">  765</span>          <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt; jvp,</div>
+<div class="line"><a id="l00766" name="l00766"></a><span class="lineno">  766</span>      std::function&lt;std::pair&lt;std::vector&lt;array&gt;, std::vector&lt;int&gt;&gt;(</div>
+<div class="line"><a id="l00767" name="l00767"></a><span class="lineno">  767</span>          <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00768" name="l00768"></a><span class="lineno">  768</span>          <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt; vmap)</div>
+<div class="line"><a id="l00769" name="l00769"></a><span class="lineno">  769</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream),</div>
+<div class="line"><a id="l00770" name="l00770"></a><span class="lineno">  770</span>        num_outputs_(num_outputs),</div>
+<div class="line"><a id="l00771" name="l00771"></a><span class="lineno">  771</span>        vjp_fun_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(<a class="code hl_function" href="namespacemlx_1_1core.html#a1b33e2c2e3471420490cf0be2de6de18">vjp</a>)),</div>
+<div class="line"><a id="l00772" name="l00772"></a><span class="lineno">  772</span>        jvp_fun_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(<a class="code hl_function" href="namespacemlx_1_1core.html#a179a632200366c223d6ab56d3e032592">jvp</a>)),</div>
+<div class="line"><a id="l00773" name="l00773"></a><span class="lineno">  773</span>        vmap_fun_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(<a class="code hl_function" href="namespacemlx_1_1core.html#ac3caec2fa65375ed4c3bf1206177b84c">vmap</a>)) {}</div>
 </div>
-<div class="line"><a id="l00806" name="l00806"></a><span class="lineno">  806</span> </div>
-<div class="foldopen" id="foldopen00807" data-start="{" data-end="};">
-<div class="line"><a id="l00807" name="l00807"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html">  807</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_divide.html">Divide</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00808" name="l00808"></a><span class="lineno">  808</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00809" name="l00809"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">  809</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">Divide</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00810" name="l00810"></a><span class="lineno">  810</span> </div>
-<div class="line"><a id="l00811" name="l00811"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">  811</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00812" name="l00812"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">  812</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00774" name="l00774"></a><span class="lineno">  774</span> </div>
+<div class="line"><a id="l00775" name="l00775"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">  775</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00776" name="l00776"></a><span class="lineno">  776</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00777" name="l00777"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">  777</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00778" name="l00778"></a><span class="lineno">  778</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00779" name="l00779"></a><span class="lineno">  779</span> </div>
+<div class="line"><a id="l00780" name="l00780"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720">  780</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>();</div>
+<div class="line"><a id="l00781" name="l00781"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b">  781</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>();</div>
+<div class="line"><a id="l00782" name="l00782"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298">  782</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_custom_transforms.html">CustomTransforms</a>);</div>
+<div class="line"><a id="l00783" name="l00783"></a><span class="lineno">  783</span> </div>
+<div class="line"><a id="l00784" name="l00784"></a><span class="lineno">  784</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00785" name="l00785"></a><span class="lineno">  785</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
+<div class="line"><a id="l00786" name="l00786"></a><span class="lineno">  786</span> </div>
+<div class="line"><a id="l00787" name="l00787"></a><span class="lineno">  787</span>  <span class="keywordtype">int</span> num_outputs_;</div>
+<div class="line"><a id="l00788" name="l00788"></a><span class="lineno">  788</span> </div>
+<div class="line"><a id="l00789" name="l00789"></a><span class="lineno">  789</span>  std::function&lt;std::vector&lt;array&gt;(</div>
+<div class="line"><a id="l00790" name="l00790"></a><span class="lineno">  790</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00791" name="l00791"></a><span class="lineno">  791</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00792" name="l00792"></a><span class="lineno">  792</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;)&gt;</div>
+<div class="line"><a id="l00793" name="l00793"></a><span class="lineno">  793</span>      vjp_fun_;</div>
+<div class="line"><a id="l00794" name="l00794"></a><span class="lineno">  794</span>  std::function&lt;std::vector&lt;array&gt;(</div>
+<div class="line"><a id="l00795" name="l00795"></a><span class="lineno">  795</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00796" name="l00796"></a><span class="lineno">  796</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00797" name="l00797"></a><span class="lineno">  797</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt;</div>
+<div class="line"><a id="l00798" name="l00798"></a><span class="lineno">  798</span>      jvp_fun_;</div>
+<div class="line"><a id="l00799" name="l00799"></a><span class="lineno">  799</span>  std::function&lt;std::pair&lt;std::vector&lt;array&gt;, std::vector&lt;int&gt;&gt;(</div>
+<div class="line"><a id="l00800" name="l00800"></a><span class="lineno">  800</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp;,</div>
+<div class="line"><a id="l00801" name="l00801"></a><span class="lineno">  801</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp;)&gt;</div>
+<div class="line"><a id="l00802" name="l00802"></a><span class="lineno">  802</span>      vmap_fun_;</div>
+<div class="line"><a id="l00803" name="l00803"></a><span class="lineno">  803</span>};</div>
+</div>
+<div class="line"><a id="l00804" name="l00804"></a><span class="lineno">  804</span> </div>
+<div class="foldopen" id="foldopen00805" data-start="{" data-end="};">
+<div class="line"><a id="l00805" name="l00805"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html">  805</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_depends.html">Depends</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l00806" name="l00806"></a><span class="lineno">  806</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00807" name="l00807"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62">  807</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62">Depends</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
+<div class="line"><a id="l00808" name="l00808"></a><span class="lineno">  808</span> </div>
+<div class="line"><a id="l00809" name="l00809"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">  809</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00810" name="l00810"></a><span class="lineno">  810</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00811" name="l00811"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">  811</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00812" name="l00812"></a><span class="lineno">  812</span>      <span class="keyword">override</span>;</div>
 <div class="line"><a id="l00813" name="l00813"></a><span class="lineno">  813</span> </div>
-<div class="line"><a id="l00814" name="l00814"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">  814</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00815" name="l00815"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">  815</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00816" name="l00816"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">  816</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_divide.html">Divide</a>)</div>
-<div class="line"><a id="l00817" name="l00817"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">  817</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00818" name="l00818"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">  818</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00814" name="l00814"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">  814</a></span>  std::vector&lt;array&gt; <a class="code hl_function" href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">vjp</a>(</div>
+<div class="line"><a id="l00815" name="l00815"></a><span class="lineno">  815</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; primals,</div>
+<div class="line"><a id="l00816" name="l00816"></a><span class="lineno">  816</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; cotan,</div>
+<div class="line"><a id="l00817" name="l00817"></a><span class="lineno">  817</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; argnums,</div>
+<div class="line"><a id="l00818" name="l00818"></a><span class="lineno">  818</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; outputs) <span class="keyword">override</span>;</div>
 <div class="line"><a id="l00819" name="l00819"></a><span class="lineno">  819</span> </div>
-<div class="line"><a id="l00820" name="l00820"></a><span class="lineno">  820</span> private:</div>
-<div class="line"><a id="l00821" name="l00821"></a><span class="lineno">  821</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00822" name="l00822"></a><span class="lineno">  822</span>};</div>
+<div class="line"><a id="l00820" name="l00820"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82">  820</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_depends.html">Depends</a>);</div>
+<div class="line"><a id="l00821" name="l00821"></a><span class="lineno">  821</span> </div>
+<div class="line"><a id="l00822" name="l00822"></a><span class="lineno">  822</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00823" name="l00823"></a><span class="lineno">  823</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
+<div class="line"><a id="l00824" name="l00824"></a><span class="lineno">  824</span>};</div>
 </div>
-<div class="line"><a id="l00823" name="l00823"></a><span class="lineno">  823</span> </div>
-<div class="foldopen" id="foldopen00824" data-start="{" data-end="};">
-<div class="line"><a id="l00824" name="l00824"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html">  824</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_div_mod.html">DivMod</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l00825" name="l00825"></a><span class="lineno">  825</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00826" name="l00826"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">  826</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">DivMod</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
-<div class="line"><a id="l00827" name="l00827"></a><span class="lineno">  827</span> </div>
-<div class="line"><a id="l00828" name="l00828"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">  828</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00829" name="l00829"></a><span class="lineno">  829</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00830" name="l00830"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">  830</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l00831" name="l00831"></a><span class="lineno">  831</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00825" name="l00825"></a><span class="lineno">  825</span> </div>
+<div class="foldopen" id="foldopen00826" data-start="{" data-end="};">
+<div class="line"><a id="l00826" name="l00826"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html">  826</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_divide.html">Divide</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00827" name="l00827"></a><span class="lineno">  827</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00828" name="l00828"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">  828</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">Divide</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00829" name="l00829"></a><span class="lineno">  829</span> </div>
+<div class="line"><a id="l00830" name="l00830"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">  830</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00831" name="l00831"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">  831</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
 <div class="line"><a id="l00832" name="l00832"></a><span class="lineno">  832</span> </div>
-<div class="line"><a id="l00833" name="l00833"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">  833</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00834" name="l00834"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">  834</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00835" name="l00835"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">  835</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_div_mod.html">DivMod</a>)</div>
-<div class="line"><a id="l00836" name="l00836"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">  836</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="foldopen" id="foldopen00837" data-start="{" data-end="}">
-<div class="line"><a id="l00837" name="l00837"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">  837</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
-<div class="line"><a id="l00838" name="l00838"></a><span class="lineno">  838</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l00839" name="l00839"></a><span class="lineno">  839</span>    <span class="keywordflow">return</span> std::vector{inputs[0].shape(), inputs[0].shape()};</div>
-<div class="line"><a id="l00840" name="l00840"></a><span class="lineno">  840</span>  }</div>
+<div class="line"><a id="l00833" name="l00833"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242">  833</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00834" name="l00834"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c">  834</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00835" name="l00835"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6">  835</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_divide.html">Divide</a>)</div>
+<div class="line"><a id="l00836" name="l00836"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650">  836</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00837" name="l00837"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994">  837</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00838" name="l00838"></a><span class="lineno">  838</span> </div>
+<div class="line"><a id="l00839" name="l00839"></a><span class="lineno">  839</span> private:</div>
+<div class="line"><a id="l00840" name="l00840"></a><span class="lineno">  840</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00841" name="l00841"></a><span class="lineno">  841</span>};</div>
 </div>
-<div class="line"><a id="l00841" name="l00841"></a><span class="lineno">  841</span> </div>
-<div class="line"><a id="l00842" name="l00842"></a><span class="lineno">  842</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00843" name="l00843"></a><span class="lineno">  843</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
-<div class="line"><a id="l00844" name="l00844"></a><span class="lineno">  844</span>};</div>
+<div class="line"><a id="l00842" name="l00842"></a><span class="lineno">  842</span> </div>
+<div class="foldopen" id="foldopen00843" data-start="{" data-end="};">
+<div class="line"><a id="l00843" name="l00843"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html">  843</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_div_mod.html">DivMod</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l00844" name="l00844"></a><span class="lineno">  844</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00845" name="l00845"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">  845</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">DivMod</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
+<div class="line"><a id="l00846" name="l00846"></a><span class="lineno">  846</span> </div>
+<div class="line"><a id="l00847" name="l00847"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">  847</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00848" name="l00848"></a><span class="lineno">  848</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00849" name="l00849"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">  849</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l00850" name="l00850"></a><span class="lineno">  850</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00851" name="l00851"></a><span class="lineno">  851</span> </div>
+<div class="line"><a id="l00852" name="l00852"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942">  852</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00853" name="l00853"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9">  853</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00854" name="l00854"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1">  854</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_div_mod.html">DivMod</a>)</div>
+<div class="line"><a id="l00855" name="l00855"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a">  855</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="foldopen" id="foldopen00856" data-start="{" data-end="}">
+<div class="line"><a id="l00856" name="l00856"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b">  856</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
+<div class="line"><a id="l00857" name="l00857"></a><span class="lineno">  857</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l00858" name="l00858"></a><span class="lineno">  858</span>    <span class="keywordflow">return</span> std::vector{inputs[0].shape(), inputs[0].shape()};</div>
+<div class="line"><a id="l00859" name="l00859"></a><span class="lineno">  859</span>  }</div>
 </div>
-<div class="line"><a id="l00845" name="l00845"></a><span class="lineno">  845</span> </div>
-<div class="foldopen" id="foldopen00846" data-start="{" data-end="};">
-<div class="line"><a id="l00846" name="l00846"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html">  846</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_select.html">Select</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00847" name="l00847"></a><span class="lineno">  847</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00848" name="l00848"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">  848</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">Select</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00849" name="l00849"></a><span class="lineno">  849</span> </div>
-<div class="line"><a id="l00850" name="l00850"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">  850</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00851" name="l00851"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">  851</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00852" name="l00852"></a><span class="lineno">  852</span> </div>
-<div class="line"><a id="l00853" name="l00853"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">  853</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00854" name="l00854"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">  854</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00855" name="l00855"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">  855</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_select.html">Select</a>)</div>
-<div class="line"><a id="l00856" name="l00856"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">  856</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00857" name="l00857"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">  857</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00858" name="l00858"></a><span class="lineno">  858</span> </div>
-<div class="line"><a id="l00859" name="l00859"></a><span class="lineno">  859</span> private:</div>
-<div class="line"><a id="l00860" name="l00860"></a><span class="lineno">  860</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00861" name="l00861"></a><span class="lineno">  861</span>};</div>
+<div class="line"><a id="l00860" name="l00860"></a><span class="lineno">  860</span> </div>
+<div class="line"><a id="l00861" name="l00861"></a><span class="lineno">  861</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00862" name="l00862"></a><span class="lineno">  862</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
+<div class="line"><a id="l00863" name="l00863"></a><span class="lineno">  863</span>};</div>
 </div>
-<div class="line"><a id="l00862" name="l00862"></a><span class="lineno">  862</span> </div>
-<div class="foldopen" id="foldopen00863" data-start="{" data-end="};">
-<div class="line"><a id="l00863" name="l00863"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html">  863</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_remainder.html">Remainder</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00864" name="l00864"></a><span class="lineno">  864</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00865" name="l00865"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a4f3eada4a21898af4a77d1d27ce14641">  865</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_remainder.html#a4f3eada4a21898af4a77d1d27ce14641">Remainder</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00866" name="l00866"></a><span class="lineno">  866</span> </div>
-<div class="line"><a id="l00867" name="l00867"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">  867</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00868" name="l00868"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">  868</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00869" name="l00869"></a><span class="lineno">  869</span> </div>
-<div class="line"><a id="l00870" name="l00870"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">  870</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00871" name="l00871"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">  871</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00872" name="l00872"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">  872</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_remainder.html">Remainder</a>)</div>
-<div class="line"><a id="l00873" name="l00873"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">  873</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00874" name="l00874"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">  874</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00875" name="l00875"></a><span class="lineno">  875</span> </div>
-<div class="line"><a id="l00876" name="l00876"></a><span class="lineno">  876</span> private:</div>
-<div class="line"><a id="l00877" name="l00877"></a><span class="lineno">  877</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00878" name="l00878"></a><span class="lineno">  878</span>};</div>
+<div class="line"><a id="l00864" name="l00864"></a><span class="lineno">  864</span> </div>
+<div class="foldopen" id="foldopen00865" data-start="{" data-end="};">
+<div class="line"><a id="l00865" name="l00865"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html">  865</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_select.html">Select</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00866" name="l00866"></a><span class="lineno">  866</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00867" name="l00867"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">  867</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">Select</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00868" name="l00868"></a><span class="lineno">  868</span> </div>
+<div class="line"><a id="l00869" name="l00869"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">  869</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00870" name="l00870"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">  870</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00871" name="l00871"></a><span class="lineno">  871</span> </div>
+<div class="line"><a id="l00872" name="l00872"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f">  872</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00873" name="l00873"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6">  873</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00874" name="l00874"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7">  874</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_select.html">Select</a>)</div>
+<div class="line"><a id="l00875" name="l00875"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8">  875</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00876" name="l00876"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867">  876</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00877" name="l00877"></a><span class="lineno">  877</span> </div>
+<div class="line"><a id="l00878" name="l00878"></a><span class="lineno">  878</span> private:</div>
+<div class="line"><a id="l00879" name="l00879"></a><span class="lineno">  879</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00880" name="l00880"></a><span class="lineno">  880</span>};</div>
 </div>
-<div class="line"><a id="l00879" name="l00879"></a><span class="lineno">  879</span> </div>
-<div class="foldopen" id="foldopen00880" data-start="{" data-end="};">
-<div class="line"><a id="l00880" name="l00880"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html">  880</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_equal.html">Equal</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00881" name="l00881"></a><span class="lineno">  881</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen00882" data-start="{" data-end="}">
-<div class="line"><a id="l00882" name="l00882"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a4af81cf2dd071db5bbf8ce1df95fdf36">  882</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_equal.html#a4af81cf2dd071db5bbf8ce1df95fdf36">Equal</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> equal_nan = <span class="keyword">false</span>)</div>
-<div class="line"><a id="l00883" name="l00883"></a><span class="lineno">  883</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), equal_nan_(equal_nan) {}</div>
+<div class="line"><a id="l00881" name="l00881"></a><span class="lineno">  881</span> </div>
+<div class="foldopen" id="foldopen00882" data-start="{" data-end="};">
+<div class="line"><a id="l00882" name="l00882"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html">  882</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_remainder.html">Remainder</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00883" name="l00883"></a><span class="lineno">  883</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00884" name="l00884"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a4f3eada4a21898af4a77d1d27ce14641">  884</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_remainder.html#a4f3eada4a21898af4a77d1d27ce14641">Remainder</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00885" name="l00885"></a><span class="lineno">  885</span> </div>
+<div class="line"><a id="l00886" name="l00886"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">  886</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00887" name="l00887"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">  887</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00888" name="l00888"></a><span class="lineno">  888</span> </div>
+<div class="line"><a id="l00889" name="l00889"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d">  889</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00890" name="l00890"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79">  890</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00891" name="l00891"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4">  891</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_remainder.html">Remainder</a>)</div>
+<div class="line"><a id="l00892" name="l00892"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814">  892</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00893" name="l00893"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666">  893</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00894" name="l00894"></a><span class="lineno">  894</span> </div>
+<div class="line"><a id="l00895" name="l00895"></a><span class="lineno">  895</span> private:</div>
+<div class="line"><a id="l00896" name="l00896"></a><span class="lineno">  896</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00897" name="l00897"></a><span class="lineno">  897</span>};</div>
 </div>
-<div class="line"><a id="l00884" name="l00884"></a><span class="lineno">  884</span> </div>
-<div class="line"><a id="l00885" name="l00885"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">  885</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00886" name="l00886"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">  886</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00887" name="l00887"></a><span class="lineno">  887</span> </div>
-<div class="line"><a id="l00888" name="l00888"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">  888</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00889" name="l00889"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">  889</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00890" name="l00890"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">  890</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00891" name="l00891"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">  891</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00892" name="l00892"></a><span class="lineno">  892</span> </div>
-<div class="foldopen" id="foldopen00893" data-start="{" data-end="}">
-<div class="line"><a id="l00893" name="l00893"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">  893</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l00894" name="l00894"></a><span class="lineno">  894</span>    <span class="keywordflow">if</span> (equal_nan_) {</div>
-<div class="line"><a id="l00895" name="l00895"></a><span class="lineno">  895</span>      os &lt;&lt; <span class="stringliteral">&quot;NaNEqual&quot;</span>;</div>
-<div class="line"><a id="l00896" name="l00896"></a><span class="lineno">  896</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00897" name="l00897"></a><span class="lineno">  897</span>      os &lt;&lt; <span class="stringliteral">&quot;Equal&quot;</span>;</div>
-<div class="line"><a id="l00898" name="l00898"></a><span class="lineno">  898</span>    }</div>
-<div class="line"><a id="l00899" name="l00899"></a><span class="lineno">  899</span>  }</div>
+<div class="line"><a id="l00898" name="l00898"></a><span class="lineno">  898</span> </div>
+<div class="foldopen" id="foldopen00899" data-start="{" data-end="};">
+<div class="line"><a id="l00899" name="l00899"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html">  899</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_equal.html">Equal</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00900" name="l00900"></a><span class="lineno">  900</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen00901" data-start="{" data-end="}">
+<div class="line"><a id="l00901" name="l00901"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a4af81cf2dd071db5bbf8ce1df95fdf36">  901</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_equal.html#a4af81cf2dd071db5bbf8ce1df95fdf36">Equal</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> equal_nan = <span class="keyword">false</span>)</div>
+<div class="line"><a id="l00902" name="l00902"></a><span class="lineno">  902</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), equal_nan_(equal_nan) {}</div>
 </div>
-<div class="line"><a id="l00900" name="l00900"></a><span class="lineno">  900</span> </div>
-<div class="line"><a id="l00901" name="l00901"></a><span class="lineno">  901</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l00902" name="l00902"></a><span class="lineno">  902</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00903" name="l00903"></a><span class="lineno">  903</span>  <span class="keywordtype">bool</span> equal_nan_;</div>
-<div class="line"><a id="l00904" name="l00904"></a><span class="lineno">  904</span>};</div>
+<div class="line"><a id="l00903" name="l00903"></a><span class="lineno">  903</span> </div>
+<div class="line"><a id="l00904" name="l00904"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">  904</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00905" name="l00905"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">  905</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00906" name="l00906"></a><span class="lineno">  906</span> </div>
+<div class="line"><a id="l00907" name="l00907"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca">  907</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00908" name="l00908"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f">  908</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00909" name="l00909"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02">  909</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00910" name="l00910"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9">  910</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00911" name="l00911"></a><span class="lineno">  911</span> </div>
+<div class="foldopen" id="foldopen00912" data-start="{" data-end="}">
+<div class="line"><a id="l00912" name="l00912"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774">  912</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l00913" name="l00913"></a><span class="lineno">  913</span>    <span class="keywordflow">if</span> (equal_nan_) {</div>
+<div class="line"><a id="l00914" name="l00914"></a><span class="lineno">  914</span>      os &lt;&lt; <span class="stringliteral">&quot;NaNEqual&quot;</span>;</div>
+<div class="line"><a id="l00915" name="l00915"></a><span class="lineno">  915</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00916" name="l00916"></a><span class="lineno">  916</span>      os &lt;&lt; <span class="stringliteral">&quot;Equal&quot;</span>;</div>
+<div class="line"><a id="l00917" name="l00917"></a><span class="lineno">  917</span>    }</div>
+<div class="line"><a id="l00918" name="l00918"></a><span class="lineno">  918</span>  }</div>
 </div>
-<div class="line"><a id="l00905" name="l00905"></a><span class="lineno">  905</span> </div>
-<div class="foldopen" id="foldopen00906" data-start="{" data-end="};">
-<div class="line"><a id="l00906" name="l00906"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html">  906</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_erf.html">Erf</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00907" name="l00907"></a><span class="lineno">  907</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00908" name="l00908"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">  908</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">Erf</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00909" name="l00909"></a><span class="lineno">  909</span> </div>
-<div class="line"><a id="l00910" name="l00910"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">  910</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00911" name="l00911"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">  911</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00912" name="l00912"></a><span class="lineno">  912</span> </div>
-<div class="line"><a id="l00913" name="l00913"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">  913</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00914" name="l00914"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">  914</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00915" name="l00915"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">  915</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_erf.html">Erf</a>)</div>
-<div class="line"><a id="l00916" name="l00916"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">  916</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00917" name="l00917"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">  917</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00918" name="l00918"></a><span class="lineno">  918</span> </div>
-<div class="line"><a id="l00919" name="l00919"></a><span class="lineno">  919</span> private:</div>
-<div class="line"><a id="l00920" name="l00920"></a><span class="lineno">  920</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00921" name="l00921"></a><span class="lineno">  921</span>};</div>
+<div class="line"><a id="l00919" name="l00919"></a><span class="lineno">  919</span> </div>
+<div class="line"><a id="l00920" name="l00920"></a><span class="lineno">  920</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l00921" name="l00921"></a><span class="lineno">  921</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00922" name="l00922"></a><span class="lineno">  922</span>  <span class="keywordtype">bool</span> equal_nan_;</div>
+<div class="line"><a id="l00923" name="l00923"></a><span class="lineno">  923</span>};</div>
 </div>
-<div class="line"><a id="l00922" name="l00922"></a><span class="lineno">  922</span> </div>
-<div class="foldopen" id="foldopen00923" data-start="{" data-end="};">
-<div class="line"><a id="l00923" name="l00923"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html">  923</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_erf_inv.html">ErfInv</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00924" name="l00924"></a><span class="lineno">  924</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00925" name="l00925"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">  925</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">ErfInv</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00926" name="l00926"></a><span class="lineno">  926</span> </div>
-<div class="line"><a id="l00927" name="l00927"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">  927</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00928" name="l00928"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">  928</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00929" name="l00929"></a><span class="lineno">  929</span> </div>
-<div class="line"><a id="l00930" name="l00930"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">  930</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00931" name="l00931"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">  931</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00932" name="l00932"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">  932</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_erf_inv.html">ErfInv</a>)</div>
-<div class="line"><a id="l00933" name="l00933"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">  933</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00934" name="l00934"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">  934</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00935" name="l00935"></a><span class="lineno">  935</span> </div>
-<div class="line"><a id="l00936" name="l00936"></a><span class="lineno">  936</span> private:</div>
-<div class="line"><a id="l00937" name="l00937"></a><span class="lineno">  937</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00938" name="l00938"></a><span class="lineno">  938</span>};</div>
+<div class="line"><a id="l00924" name="l00924"></a><span class="lineno">  924</span> </div>
+<div class="foldopen" id="foldopen00925" data-start="{" data-end="};">
+<div class="line"><a id="l00925" name="l00925"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html">  925</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_erf.html">Erf</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00926" name="l00926"></a><span class="lineno">  926</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00927" name="l00927"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">  927</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">Erf</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00928" name="l00928"></a><span class="lineno">  928</span> </div>
+<div class="line"><a id="l00929" name="l00929"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">  929</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00930" name="l00930"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">  930</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00931" name="l00931"></a><span class="lineno">  931</span> </div>
+<div class="line"><a id="l00932" name="l00932"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa">  932</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00933" name="l00933"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe">  933</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00934" name="l00934"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c">  934</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_erf.html">Erf</a>)</div>
+<div class="line"><a id="l00935" name="l00935"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82">  935</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00936" name="l00936"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187">  936</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00937" name="l00937"></a><span class="lineno">  937</span> </div>
+<div class="line"><a id="l00938" name="l00938"></a><span class="lineno">  938</span> private:</div>
+<div class="line"><a id="l00939" name="l00939"></a><span class="lineno">  939</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00940" name="l00940"></a><span class="lineno">  940</span>};</div>
 </div>
-<div class="line"><a id="l00939" name="l00939"></a><span class="lineno">  939</span> </div>
-<div class="foldopen" id="foldopen00940" data-start="{" data-end="};">
-<div class="line"><a id="l00940" name="l00940"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html">  940</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_exp.html">Exp</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00941" name="l00941"></a><span class="lineno">  941</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00942" name="l00942"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a1d0a618cbb91ab29ef53b57ff6ed6e06">  942</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_exp.html#a1d0a618cbb91ab29ef53b57ff6ed6e06">Exp</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00943" name="l00943"></a><span class="lineno">  943</span> </div>
-<div class="line"><a id="l00944" name="l00944"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">  944</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00945" name="l00945"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">  945</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00946" name="l00946"></a><span class="lineno">  946</span> </div>
-<div class="line"><a id="l00947" name="l00947"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">  947</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00948" name="l00948"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">  948</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00949" name="l00949"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">  949</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_exp.html">Exp</a>)</div>
-<div class="line"><a id="l00950" name="l00950"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">  950</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l00951" name="l00951"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">  951</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00952" name="l00952"></a><span class="lineno">  952</span> </div>
-<div class="line"><a id="l00953" name="l00953"></a><span class="lineno">  953</span> private:</div>
-<div class="line"><a id="l00954" name="l00954"></a><span class="lineno">  954</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00955" name="l00955"></a><span class="lineno">  955</span>};</div>
+<div class="line"><a id="l00941" name="l00941"></a><span class="lineno">  941</span> </div>
+<div class="foldopen" id="foldopen00942" data-start="{" data-end="};">
+<div class="line"><a id="l00942" name="l00942"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html">  942</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_erf_inv.html">ErfInv</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00943" name="l00943"></a><span class="lineno">  943</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00944" name="l00944"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">  944</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">ErfInv</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00945" name="l00945"></a><span class="lineno">  945</span> </div>
+<div class="line"><a id="l00946" name="l00946"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">  946</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00947" name="l00947"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">  947</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00948" name="l00948"></a><span class="lineno">  948</span> </div>
+<div class="line"><a id="l00949" name="l00949"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9">  949</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00950" name="l00950"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be">  950</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00951" name="l00951"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9">  951</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_erf_inv.html">ErfInv</a>)</div>
+<div class="line"><a id="l00952" name="l00952"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832">  952</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00953" name="l00953"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639">  953</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00954" name="l00954"></a><span class="lineno">  954</span> </div>
+<div class="line"><a id="l00955" name="l00955"></a><span class="lineno">  955</span> private:</div>
+<div class="line"><a id="l00956" name="l00956"></a><span class="lineno">  956</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00957" name="l00957"></a><span class="lineno">  957</span>};</div>
 </div>
-<div class="line"><a id="l00956" name="l00956"></a><span class="lineno">  956</span> </div>
-<div class="foldopen" id="foldopen00957" data-start="{" data-end="};">
-<div class="line"><a id="l00957" name="l00957"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html">  957</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_expm1.html">Expm1</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00958" name="l00958"></a><span class="lineno">  958</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l00959" name="l00959"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#a47c2a1b2a4ef6bb07ba77c55ddddaec2">  959</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_expm1.html#a47c2a1b2a4ef6bb07ba77c55ddddaec2">Expm1</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l00960" name="l00960"></a><span class="lineno">  960</span> </div>
-<div class="line"><a id="l00961" name="l00961"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">  961</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00962" name="l00962"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">  962</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00963" name="l00963"></a><span class="lineno">  963</span> </div>
-<div class="line"><a id="l00964" name="l00964"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">  964</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00965" name="l00965"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">  965</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00966" name="l00966"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">  966</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_expm1.html">Expm1</a>)</div>
-<div class="line"><a id="l00967" name="l00967"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">  967</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l00968" name="l00968"></a><span class="lineno">  968</span> </div>
-<div class="line"><a id="l00969" name="l00969"></a><span class="lineno">  969</span> private:</div>
-<div class="line"><a id="l00970" name="l00970"></a><span class="lineno">  970</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00971" name="l00971"></a><span class="lineno">  971</span>};</div>
+<div class="line"><a id="l00958" name="l00958"></a><span class="lineno">  958</span> </div>
+<div class="foldopen" id="foldopen00959" data-start="{" data-end="};">
+<div class="line"><a id="l00959" name="l00959"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html">  959</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_exp.html">Exp</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00960" name="l00960"></a><span class="lineno">  960</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00961" name="l00961"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a1d0a618cbb91ab29ef53b57ff6ed6e06">  961</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_exp.html#a1d0a618cbb91ab29ef53b57ff6ed6e06">Exp</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00962" name="l00962"></a><span class="lineno">  962</span> </div>
+<div class="line"><a id="l00963" name="l00963"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">  963</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00964" name="l00964"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">  964</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00965" name="l00965"></a><span class="lineno">  965</span> </div>
+<div class="line"><a id="l00966" name="l00966"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37">  966</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00967" name="l00967"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59">  967</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00968" name="l00968"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a">  968</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_exp.html">Exp</a>)</div>
+<div class="line"><a id="l00969" name="l00969"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357">  969</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l00970" name="l00970"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670">  970</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00971" name="l00971"></a><span class="lineno">  971</span> </div>
+<div class="line"><a id="l00972" name="l00972"></a><span class="lineno">  972</span> private:</div>
+<div class="line"><a id="l00973" name="l00973"></a><span class="lineno">  973</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00974" name="l00974"></a><span class="lineno">  974</span>};</div>
 </div>
-<div class="line"><a id="l00972" name="l00972"></a><span class="lineno">  972</span> </div>
-<div class="foldopen" id="foldopen00973" data-start="{" data-end="};">
-<div class="line"><a id="l00973" name="l00973"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html">  973</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_f_f_t.html">FFT</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l00974" name="l00974"></a><span class="lineno">  974</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen00975" data-start="{" data-end="}">
-<div class="line"><a id="l00975" name="l00975"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a0cdce626ed2c8eeeecc6949418437839">  975</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_f_f_t.html#a0cdce626ed2c8eeeecc6949418437839">FFT</a>(</div>
-<div class="line"><a id="l00976" name="l00976"></a><span class="lineno">  976</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l00977" name="l00977"></a><span class="lineno">  977</span>      <span class="keyword">const</span> std::vector&lt;size_t&gt;&amp; axes,</div>
-<div class="line"><a id="l00978" name="l00978"></a><span class="lineno">  978</span>      <span class="keywordtype">bool</span> inverse,</div>
-<div class="line"><a id="l00979" name="l00979"></a><span class="lineno">  979</span>      <span class="keywordtype">bool</span> real)</div>
-<div class="line"><a id="l00980" name="l00980"></a><span class="lineno">  980</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axes_(axes), inverse_(inverse), real_(<a class="code hl_function" href="group__ops.html#gaf8913cabeb9fb193ba687aaeb2087764">real</a>) {}</div>
+<div class="line"><a id="l00975" name="l00975"></a><span class="lineno">  975</span> </div>
+<div class="foldopen" id="foldopen00976" data-start="{" data-end="};">
+<div class="line"><a id="l00976" name="l00976"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html">  976</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_expm1.html">Expm1</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00977" name="l00977"></a><span class="lineno">  977</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l00978" name="l00978"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#a47c2a1b2a4ef6bb07ba77c55ddddaec2">  978</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_expm1.html#a47c2a1b2a4ef6bb07ba77c55ddddaec2">Expm1</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l00979" name="l00979"></a><span class="lineno">  979</span> </div>
+<div class="line"><a id="l00980" name="l00980"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">  980</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00981" name="l00981"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">  981</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l00982" name="l00982"></a><span class="lineno">  982</span> </div>
+<div class="line"><a id="l00983" name="l00983"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296">  983</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l00984" name="l00984"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1">  984</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l00985" name="l00985"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1">  985</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_expm1.html">Expm1</a>)</div>
+<div class="line"><a id="l00986" name="l00986"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08">  986</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l00987" name="l00987"></a><span class="lineno">  987</span> </div>
+<div class="line"><a id="l00988" name="l00988"></a><span class="lineno">  988</span> private:</div>
+<div class="line"><a id="l00989" name="l00989"></a><span class="lineno">  989</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l00990" name="l00990"></a><span class="lineno">  990</span>};</div>
 </div>
-<div class="line"><a id="l00981" name="l00981"></a><span class="lineno">  981</span> </div>
-<div class="line"><a id="l00982" name="l00982"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">  982</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00983" name="l00983"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">  983</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l00984" name="l00984"></a><span class="lineno">  984</span> </div>
-<div class="line"><a id="l00985" name="l00985"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1">  985</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l00986" name="l00986"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6">  986</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l00987" name="l00987"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf">  987</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_f_f_t.html">FFT</a>)</div>
-<div class="line"><a id="l00988" name="l00988"></a><span class="lineno">  988</span> </div>
-<div class="line"><a id="l00989" name="l00989"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06">  989</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l00990" name="l00990"></a><span class="lineno">  990</span> </div>
-<div class="line"><a id="l00991" name="l00991"></a><span class="lineno">  991</span> private:</div>
-<div class="line"><a id="l00992" name="l00992"></a><span class="lineno">  992</span>  std::vector&lt;<span class="keywordtype">size_t</span>&gt; axes_;</div>
-<div class="line"><a id="l00993" name="l00993"></a><span class="lineno">  993</span>  <span class="keywordtype">bool</span> inverse_;</div>
-<div class="line"><a id="l00994" name="l00994"></a><span class="lineno">  994</span>  <span class="keywordtype">bool</span> real_;</div>
-<div class="line"><a id="l00995" name="l00995"></a><span class="lineno">  995</span> </div>
-<div class="line"><a id="l00996" name="l00996"></a><span class="lineno">  996</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l00997" name="l00997"></a><span class="lineno">  997</span>};</div>
+<div class="line"><a id="l00991" name="l00991"></a><span class="lineno">  991</span> </div>
+<div class="foldopen" id="foldopen00992" data-start="{" data-end="};">
+<div class="line"><a id="l00992" name="l00992"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html">  992</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_f_f_t.html">FFT</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l00993" name="l00993"></a><span class="lineno">  993</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen00994" data-start="{" data-end="}">
+<div class="line"><a id="l00994" name="l00994"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a0cdce626ed2c8eeeecc6949418437839">  994</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_f_f_t.html#a0cdce626ed2c8eeeecc6949418437839">FFT</a>(</div>
+<div class="line"><a id="l00995" name="l00995"></a><span class="lineno">  995</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l00996" name="l00996"></a><span class="lineno">  996</span>      <span class="keyword">const</span> std::vector&lt;size_t&gt;&amp; axes,</div>
+<div class="line"><a id="l00997" name="l00997"></a><span class="lineno">  997</span>      <span class="keywordtype">bool</span> inverse,</div>
+<div class="line"><a id="l00998" name="l00998"></a><span class="lineno">  998</span>      <span class="keywordtype">bool</span> real)</div>
+<div class="line"><a id="l00999" name="l00999"></a><span class="lineno">  999</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axes_(axes), inverse_(inverse), real_(<a class="code hl_function" href="group__ops.html#gaf8913cabeb9fb193ba687aaeb2087764">real</a>) {}</div>
 </div>
-<div class="line"><a id="l00998" name="l00998"></a><span class="lineno">  998</span> </div>
-<div class="foldopen" id="foldopen00999" data-start="{" data-end="};">
-<div class="line"><a id="l00999" name="l00999"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html">  999</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_floor.html">Floor</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01000" name="l01000"></a><span class="lineno"> 1000</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01001" name="l01001"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340"> 1001</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340">Floor</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01002" name="l01002"></a><span class="lineno"> 1002</span> </div>
-<div class="line"><a id="l01003" name="l01003"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7"> 1003</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01004" name="l01004"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65"> 1004</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01005" name="l01005"></a><span class="lineno"> 1005</span> </div>
-<div class="line"><a id="l01006" name="l01006"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10"> 1006</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01007" name="l01007"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af"> 1007</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01008" name="l01008"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6"> 1008</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_floor.html">Floor</a>)</div>
-<div class="line"><a id="l01009" name="l01009"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94"> 1009</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01010" name="l01010"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015"> 1010</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01011" name="l01011"></a><span class="lineno"> 1011</span> </div>
-<div class="line"><a id="l01012" name="l01012"></a><span class="lineno"> 1012</span> private:</div>
-<div class="line"><a id="l01013" name="l01013"></a><span class="lineno"> 1013</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01014" name="l01014"></a><span class="lineno"> 1014</span>};</div>
+<div class="line"><a id="l01000" name="l01000"></a><span class="lineno"> 1000</span> </div>
+<div class="line"><a id="l01001" name="l01001"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635"> 1001</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01002" name="l01002"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd"> 1002</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01003" name="l01003"></a><span class="lineno"> 1003</span> </div>
+<div class="line"><a id="l01004" name="l01004"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1"> 1004</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01005" name="l01005"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6"> 1005</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01006" name="l01006"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf"> 1006</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_f_f_t.html">FFT</a>)</div>
+<div class="line"><a id="l01007" name="l01007"></a><span class="lineno"> 1007</span> </div>
+<div class="line"><a id="l01008" name="l01008"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06"> 1008</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01009" name="l01009"></a><span class="lineno"> 1009</span> </div>
+<div class="line"><a id="l01010" name="l01010"></a><span class="lineno"> 1010</span> private:</div>
+<div class="line"><a id="l01011" name="l01011"></a><span class="lineno"> 1011</span>  std::vector&lt;<span class="keywordtype">size_t</span>&gt; axes_;</div>
+<div class="line"><a id="l01012" name="l01012"></a><span class="lineno"> 1012</span>  <span class="keywordtype">bool</span> inverse_;</div>
+<div class="line"><a id="l01013" name="l01013"></a><span class="lineno"> 1013</span>  <span class="keywordtype">bool</span> real_;</div>
+<div class="line"><a id="l01014" name="l01014"></a><span class="lineno"> 1014</span> </div>
+<div class="line"><a id="l01015" name="l01015"></a><span class="lineno"> 1015</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01016" name="l01016"></a><span class="lineno"> 1016</span>};</div>
 </div>
-<div class="line"><a id="l01015" name="l01015"></a><span class="lineno"> 1015</span> </div>
-<div class="foldopen" id="foldopen01016" data-start="{" data-end="};">
-<div class="line"><a id="l01016" name="l01016"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html"> 1016</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_full.html">Full</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01017" name="l01017"></a><span class="lineno"> 1017</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01018" name="l01018"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6"> 1018</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6">Full</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01019" name="l01019"></a><span class="lineno"> 1019</span> </div>
-<div class="line"><a id="l01020" name="l01020"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c"> 1020</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01021" name="l01021"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872"> 1021</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01022" name="l01022"></a><span class="lineno"> 1022</span> </div>
-<div class="line"><a id="l01023" name="l01023"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95"> 1023</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01024" name="l01024"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407"> 1024</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01025" name="l01025"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013"> 1025</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_full.html">Full</a>)</div>
-<div class="line"><a id="l01026" name="l01026"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792"> 1026</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01027" name="l01027"></a><span class="lineno"> 1027</span> </div>
-<div class="line"><a id="l01028" name="l01028"></a><span class="lineno"> 1028</span> private:</div>
-<div class="line"><a id="l01029" name="l01029"></a><span class="lineno"> 1029</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01030" name="l01030"></a><span class="lineno"> 1030</span>};</div>
+<div class="line"><a id="l01017" name="l01017"></a><span class="lineno"> 1017</span> </div>
+<div class="foldopen" id="foldopen01018" data-start="{" data-end="};">
+<div class="line"><a id="l01018" name="l01018"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html"> 1018</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_floor.html">Floor</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01019" name="l01019"></a><span class="lineno"> 1019</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01020" name="l01020"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340"> 1020</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340">Floor</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01021" name="l01021"></a><span class="lineno"> 1021</span> </div>
+<div class="line"><a id="l01022" name="l01022"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7"> 1022</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01023" name="l01023"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65"> 1023</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01024" name="l01024"></a><span class="lineno"> 1024</span> </div>
+<div class="line"><a id="l01025" name="l01025"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10"> 1025</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01026" name="l01026"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af"> 1026</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01027" name="l01027"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6"> 1027</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_floor.html">Floor</a>)</div>
+<div class="line"><a id="l01028" name="l01028"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94"> 1028</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01029" name="l01029"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015"> 1029</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01030" name="l01030"></a><span class="lineno"> 1030</span> </div>
+<div class="line"><a id="l01031" name="l01031"></a><span class="lineno"> 1031</span> private:</div>
+<div class="line"><a id="l01032" name="l01032"></a><span class="lineno"> 1032</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01033" name="l01033"></a><span class="lineno"> 1033</span>};</div>
 </div>
-<div class="line"><a id="l01031" name="l01031"></a><span class="lineno"> 1031</span> </div>
-<div class="foldopen" id="foldopen01032" data-start="{" data-end="};">
-<div class="line"><a id="l01032" name="l01032"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html"> 1032</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_gather.html">Gather</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01033" name="l01033"></a><span class="lineno"> 1033</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01034" data-start="{" data-end="}">
-<div class="line"><a id="l01034" name="l01034"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a5b5f47ceff1d43477c87be5116f261d0"> 1034</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather.html#a5b5f47ceff1d43477c87be5116f261d0">Gather</a>(</div>
-<div class="line"><a id="l01035" name="l01035"></a><span class="lineno"> 1035</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01036" name="l01036"></a><span class="lineno"> 1036</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes,</div>
-<div class="line"><a id="l01037" name="l01037"></a><span class="lineno"> 1037</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; slice_sizes)</div>
-<div class="line"><a id="l01038" name="l01038"></a><span class="lineno"> 1038</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axes_(axes), slice_sizes_(slice_sizes) {}</div>
+<div class="line"><a id="l01034" name="l01034"></a><span class="lineno"> 1034</span> </div>
+<div class="foldopen" id="foldopen01035" data-start="{" data-end="};">
+<div class="line"><a id="l01035" name="l01035"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html"> 1035</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_full.html">Full</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01036" name="l01036"></a><span class="lineno"> 1036</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01037" name="l01037"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6"> 1037</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6">Full</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01038" name="l01038"></a><span class="lineno"> 1038</span> </div>
+<div class="line"><a id="l01039" name="l01039"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c"> 1039</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01040" name="l01040"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872"> 1040</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01041" name="l01041"></a><span class="lineno"> 1041</span> </div>
+<div class="line"><a id="l01042" name="l01042"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95"> 1042</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01043" name="l01043"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407"> 1043</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01044" name="l01044"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013"> 1044</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_full.html">Full</a>)</div>
+<div class="line"><a id="l01045" name="l01045"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792"> 1045</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01046" name="l01046"></a><span class="lineno"> 1046</span> </div>
+<div class="line"><a id="l01047" name="l01047"></a><span class="lineno"> 1047</span> private:</div>
+<div class="line"><a id="l01048" name="l01048"></a><span class="lineno"> 1048</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01049" name="l01049"></a><span class="lineno"> 1049</span>};</div>
 </div>
-<div class="line"><a id="l01039" name="l01039"></a><span class="lineno"> 1039</span> </div>
-<div class="line"><a id="l01040" name="l01040"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290"> 1040</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01041" name="l01041"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8"> 1041</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01042" name="l01042"></a><span class="lineno"> 1042</span> </div>
-<div class="line"><a id="l01043" name="l01043"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275"> 1043</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01044" name="l01044"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d"> 1044</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01045" name="l01045"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91"> 1045</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_gather.html">Gather</a>)</div>
-<div class="line"><a id="l01046" name="l01046"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa"> 1046</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01047" name="l01047"></a><span class="lineno"> 1047</span> </div>
-<div class="line"><a id="l01048" name="l01048"></a><span class="lineno"> 1048</span> private:</div>
-<div class="line"><a id="l01049" name="l01049"></a><span class="lineno"> 1049</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01050" name="l01050"></a><span class="lineno"> 1050</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; axes_;</div>
-<div class="line"><a id="l01051" name="l01051"></a><span class="lineno"> 1051</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; slice_sizes_;</div>
-<div class="line"><a id="l01052" name="l01052"></a><span class="lineno"> 1052</span>};</div>
+<div class="line"><a id="l01050" name="l01050"></a><span class="lineno"> 1050</span> </div>
+<div class="foldopen" id="foldopen01051" data-start="{" data-end="};">
+<div class="line"><a id="l01051" name="l01051"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html"> 1051</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_gather.html">Gather</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01052" name="l01052"></a><span class="lineno"> 1052</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01053" data-start="{" data-end="}">
+<div class="line"><a id="l01053" name="l01053"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a5b5f47ceff1d43477c87be5116f261d0"> 1053</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather.html#a5b5f47ceff1d43477c87be5116f261d0">Gather</a>(</div>
+<div class="line"><a id="l01054" name="l01054"></a><span class="lineno"> 1054</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01055" name="l01055"></a><span class="lineno"> 1055</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes,</div>
+<div class="line"><a id="l01056" name="l01056"></a><span class="lineno"> 1056</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; slice_sizes)</div>
+<div class="line"><a id="l01057" name="l01057"></a><span class="lineno"> 1057</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axes_(axes), slice_sizes_(slice_sizes) {}</div>
 </div>
-<div class="line"><a id="l01053" name="l01053"></a><span class="lineno"> 1053</span> </div>
-<div class="foldopen" id="foldopen01054" data-start="{" data-end="};">
-<div class="line"><a id="l01054" name="l01054"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html"> 1054</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_greater.html">Greater</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01055" name="l01055"></a><span class="lineno"> 1055</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01056" name="l01056"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b"> 1056</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b">Greater</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01057" name="l01057"></a><span class="lineno"> 1057</span> </div>
-<div class="line"><a id="l01058" name="l01058"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae"> 1058</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01059" name="l01059"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878"> 1059</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01060" name="l01060"></a><span class="lineno"> 1060</span> </div>
-<div class="line"><a id="l01061" name="l01061"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0"> 1061</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01062" name="l01062"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1"> 1062</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01063" name="l01063"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04"> 1063</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_greater.html">Greater</a>)</div>
-<div class="line"><a id="l01064" name="l01064"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1"> 1064</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01065" name="l01065"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46"> 1065</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01058" name="l01058"></a><span class="lineno"> 1058</span> </div>
+<div class="line"><a id="l01059" name="l01059"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290"> 1059</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01060" name="l01060"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8"> 1060</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01061" name="l01061"></a><span class="lineno"> 1061</span> </div>
+<div class="line"><a id="l01062" name="l01062"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275"> 1062</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01063" name="l01063"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d"> 1063</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01064" name="l01064"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91"> 1064</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_gather.html">Gather</a>)</div>
+<div class="line"><a id="l01065" name="l01065"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa"> 1065</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
 <div class="line"><a id="l01066" name="l01066"></a><span class="lineno"> 1066</span> </div>
 <div class="line"><a id="l01067" name="l01067"></a><span class="lineno"> 1067</span> private:</div>
 <div class="line"><a id="l01068" name="l01068"></a><span class="lineno"> 1068</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01069" name="l01069"></a><span class="lineno"> 1069</span>};</div>
+<div class="line"><a id="l01069" name="l01069"></a><span class="lineno"> 1069</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; axes_;</div>
+<div class="line"><a id="l01070" name="l01070"></a><span class="lineno"> 1070</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; slice_sizes_;</div>
+<div class="line"><a id="l01071" name="l01071"></a><span class="lineno"> 1071</span>};</div>
 </div>
-<div class="line"><a id="l01070" name="l01070"></a><span class="lineno"> 1070</span> </div>
-<div class="foldopen" id="foldopen01071" data-start="{" data-end="};">
-<div class="line"><a id="l01071" name="l01071"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html"> 1071</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_greater_equal.html">GreaterEqual</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01072" name="l01072"></a><span class="lineno"> 1072</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01073" name="l01073"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527"> 1073</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527">GreaterEqual</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01074" name="l01074"></a><span class="lineno"> 1074</span> </div>
-<div class="line"><a id="l01075" name="l01075"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075"> 1075</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01076" name="l01076"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24"> 1076</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01077" name="l01077"></a><span class="lineno"> 1077</span> </div>
-<div class="line"><a id="l01078" name="l01078"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d"> 1078</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01079" name="l01079"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20"> 1079</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01080" name="l01080"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef"> 1080</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_greater_equal.html">GreaterEqual</a>)</div>
-<div class="line"><a id="l01081" name="l01081"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc"> 1081</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01082" name="l01082"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f"> 1082</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01083" name="l01083"></a><span class="lineno"> 1083</span> </div>
-<div class="line"><a id="l01084" name="l01084"></a><span class="lineno"> 1084</span> private:</div>
-<div class="line"><a id="l01085" name="l01085"></a><span class="lineno"> 1085</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01086" name="l01086"></a><span class="lineno"> 1086</span>};</div>
+<div class="line"><a id="l01072" name="l01072"></a><span class="lineno"> 1072</span> </div>
+<div class="foldopen" id="foldopen01073" data-start="{" data-end="};">
+<div class="line"><a id="l01073" name="l01073"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html"> 1073</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_greater.html">Greater</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01074" name="l01074"></a><span class="lineno"> 1074</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01075" name="l01075"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b"> 1075</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b">Greater</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01076" name="l01076"></a><span class="lineno"> 1076</span> </div>
+<div class="line"><a id="l01077" name="l01077"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae"> 1077</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01078" name="l01078"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878"> 1078</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01079" name="l01079"></a><span class="lineno"> 1079</span> </div>
+<div class="line"><a id="l01080" name="l01080"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0"> 1080</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01081" name="l01081"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1"> 1081</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01082" name="l01082"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04"> 1082</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_greater.html">Greater</a>)</div>
+<div class="line"><a id="l01083" name="l01083"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1"> 1083</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01084" name="l01084"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46"> 1084</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01085" name="l01085"></a><span class="lineno"> 1085</span> </div>
+<div class="line"><a id="l01086" name="l01086"></a><span class="lineno"> 1086</span> private:</div>
+<div class="line"><a id="l01087" name="l01087"></a><span class="lineno"> 1087</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01088" name="l01088"></a><span class="lineno"> 1088</span>};</div>
 </div>
-<div class="line"><a id="l01087" name="l01087"></a><span class="lineno"> 1087</span> </div>
-<div class="foldopen" id="foldopen01088" data-start="{" data-end="};">
-<div class="line"><a id="l01088" name="l01088"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html"> 1088</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_hadamard.html">Hadamard</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01089" name="l01089"></a><span class="lineno"> 1089</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01090" data-start="{" data-end="}">
-<div class="line"><a id="l01090" name="l01090"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923"> 1090</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923">Hadamard</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">float</span> scale)</div>
-<div class="line"><a id="l01091" name="l01091"></a><span class="lineno"> 1091</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), scale_(scale) {}</div>
-</div>
-<div class="line"><a id="l01092" name="l01092"></a><span class="lineno"> 1092</span> </div>
-<div class="line"><a id="l01093" name="l01093"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d"> 1093</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01094" name="l01094"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733"> 1094</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01095" name="l01095"></a><span class="lineno"> 1095</span> </div>
-<div class="line"><a id="l01096" name="l01096"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c"> 1096</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01097" name="l01097"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a"> 1097</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01098" name="l01098"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6"> 1098</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_hadamard.html">Hadamard</a>)</div>
-<div class="line"><a id="l01099" name="l01099"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde"> 1099</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01100" name="l01100"></a><span class="lineno"> 1100</span> </div>
-<div class="line"><a id="l01101" name="l01101"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8"> 1101</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01089" name="l01089"></a><span class="lineno"> 1089</span> </div>
+<div class="foldopen" id="foldopen01090" data-start="{" data-end="};">
+<div class="line"><a id="l01090" name="l01090"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html"> 1090</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_greater_equal.html">GreaterEqual</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01091" name="l01091"></a><span class="lineno"> 1091</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01092" name="l01092"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527"> 1092</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527">GreaterEqual</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01093" name="l01093"></a><span class="lineno"> 1093</span> </div>
+<div class="line"><a id="l01094" name="l01094"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075"> 1094</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01095" name="l01095"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24"> 1095</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01096" name="l01096"></a><span class="lineno"> 1096</span> </div>
+<div class="line"><a id="l01097" name="l01097"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d"> 1097</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01098" name="l01098"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20"> 1098</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01099" name="l01099"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef"> 1099</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_greater_equal.html">GreaterEqual</a>)</div>
+<div class="line"><a id="l01100" name="l01100"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc"> 1100</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01101" name="l01101"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f"> 1101</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
 <div class="line"><a id="l01102" name="l01102"></a><span class="lineno"> 1102</span> </div>
 <div class="line"><a id="l01103" name="l01103"></a><span class="lineno"> 1103</span> private:</div>
-<div class="line"><a id="l01104" name="l01104"></a><span class="lineno"> 1104</span>  <span class="keywordtype">float</span> scale_;</div>
-<div class="line"><a id="l01105" name="l01105"></a><span class="lineno"> 1105</span> </div>
-<div class="line"><a id="l01106" name="l01106"></a><span class="lineno"> 1106</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01107" name="l01107"></a><span class="lineno"> 1107</span>};</div>
+<div class="line"><a id="l01104" name="l01104"></a><span class="lineno"> 1104</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01105" name="l01105"></a><span class="lineno"> 1105</span>};</div>
 </div>
-<div class="line"><a id="l01108" name="l01108"></a><span class="lineno"> 1108</span> </div>
-<div class="foldopen" id="foldopen01109" data-start="{" data-end="};">
-<div class="line"><a id="l01109" name="l01109"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html"> 1109</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_imag.html">Imag</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01110" name="l01110"></a><span class="lineno"> 1110</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01111" name="l01111"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a284b7de34a316110fdc98e7b753372b2"> 1111</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_imag.html#a284b7de34a316110fdc98e7b753372b2">Imag</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01112" name="l01112"></a><span class="lineno"> 1112</span> </div>
-<div class="line"><a id="l01113" name="l01113"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829"> 1113</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01114" name="l01114"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6"> 1114</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01115" name="l01115"></a><span class="lineno"> 1115</span> </div>
-<div class="line"><a id="l01116" name="l01116"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3"> 1116</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01117" name="l01117"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a"> 1117</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01118" name="l01118"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d"> 1118</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_imag.html">Imag</a>)</div>
-<div class="line"><a id="l01119" name="l01119"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5"> 1119</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01120" name="l01120"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48"> 1120</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01121" name="l01121"></a><span class="lineno"> 1121</span>};</div>
+<div class="line"><a id="l01106" name="l01106"></a><span class="lineno"> 1106</span> </div>
+<div class="foldopen" id="foldopen01107" data-start="{" data-end="};">
+<div class="line"><a id="l01107" name="l01107"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html"> 1107</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_hadamard.html">Hadamard</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01108" name="l01108"></a><span class="lineno"> 1108</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01109" data-start="{" data-end="}">
+<div class="line"><a id="l01109" name="l01109"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923"> 1109</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923">Hadamard</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">float</span> scale)</div>
+<div class="line"><a id="l01110" name="l01110"></a><span class="lineno"> 1110</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), scale_(scale) {}</div>
 </div>
-<div class="line"><a id="l01122" name="l01122"></a><span class="lineno"> 1122</span> </div>
-<div class="foldopen" id="foldopen01123" data-start="{" data-end="};">
-<div class="line"><a id="l01123" name="l01123"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html"> 1123</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_less.html">Less</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01124" name="l01124"></a><span class="lineno"> 1124</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01125" name="l01125"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7"> 1125</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7">Less</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01126" name="l01126"></a><span class="lineno"> 1126</span> </div>
-<div class="line"><a id="l01127" name="l01127"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef"> 1127</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01128" name="l01128"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917"> 1128</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01129" name="l01129"></a><span class="lineno"> 1129</span> </div>
-<div class="line"><a id="l01130" name="l01130"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e"> 1130</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01131" name="l01131"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce"> 1131</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01132" name="l01132"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78"> 1132</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_less.html">Less</a>)</div>
-<div class="line"><a id="l01133" name="l01133"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63"> 1133</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01134" name="l01134"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278"> 1134</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01135" name="l01135"></a><span class="lineno"> 1135</span> </div>
-<div class="line"><a id="l01136" name="l01136"></a><span class="lineno"> 1136</span> private:</div>
-<div class="line"><a id="l01137" name="l01137"></a><span class="lineno"> 1137</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01138" name="l01138"></a><span class="lineno"> 1138</span>};</div>
+<div class="line"><a id="l01111" name="l01111"></a><span class="lineno"> 1111</span> </div>
+<div class="line"><a id="l01112" name="l01112"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d"> 1112</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01113" name="l01113"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733"> 1113</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01114" name="l01114"></a><span class="lineno"> 1114</span> </div>
+<div class="line"><a id="l01115" name="l01115"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c"> 1115</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01116" name="l01116"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a"> 1116</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01117" name="l01117"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6"> 1117</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_hadamard.html">Hadamard</a>)</div>
+<div class="line"><a id="l01118" name="l01118"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde"> 1118</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01119" name="l01119"></a><span class="lineno"> 1119</span> </div>
+<div class="line"><a id="l01120" name="l01120"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8"> 1120</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01121" name="l01121"></a><span class="lineno"> 1121</span> </div>
+<div class="line"><a id="l01122" name="l01122"></a><span class="lineno"> 1122</span> private:</div>
+<div class="line"><a id="l01123" name="l01123"></a><span class="lineno"> 1123</span>  <span class="keywordtype">float</span> scale_;</div>
+<div class="line"><a id="l01124" name="l01124"></a><span class="lineno"> 1124</span> </div>
+<div class="line"><a id="l01125" name="l01125"></a><span class="lineno"> 1125</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01126" name="l01126"></a><span class="lineno"> 1126</span>};</div>
 </div>
-<div class="line"><a id="l01139" name="l01139"></a><span class="lineno"> 1139</span> </div>
-<div class="foldopen" id="foldopen01140" data-start="{" data-end="};">
-<div class="line"><a id="l01140" name="l01140"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html"> 1140</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_less_equal.html">LessEqual</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01141" name="l01141"></a><span class="lineno"> 1141</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01142" name="l01142"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc"> 1142</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc">LessEqual</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01143" name="l01143"></a><span class="lineno"> 1143</span> </div>
-<div class="line"><a id="l01144" name="l01144"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16"> 1144</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01145" name="l01145"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac"> 1145</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01146" name="l01146"></a><span class="lineno"> 1146</span> </div>
-<div class="line"><a id="l01147" name="l01147"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480"> 1147</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01148" name="l01148"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f"> 1148</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01149" name="l01149"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950"> 1149</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_less_equal.html">LessEqual</a>)</div>
-<div class="line"><a id="l01150" name="l01150"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af"> 1150</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01151" name="l01151"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f"> 1151</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01152" name="l01152"></a><span class="lineno"> 1152</span> </div>
-<div class="line"><a id="l01153" name="l01153"></a><span class="lineno"> 1153</span> private:</div>
-<div class="line"><a id="l01154" name="l01154"></a><span class="lineno"> 1154</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01155" name="l01155"></a><span class="lineno"> 1155</span>};</div>
+<div class="line"><a id="l01127" name="l01127"></a><span class="lineno"> 1127</span> </div>
+<div class="foldopen" id="foldopen01128" data-start="{" data-end="};">
+<div class="line"><a id="l01128" name="l01128"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html"> 1128</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_imag.html">Imag</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01129" name="l01129"></a><span class="lineno"> 1129</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01130" name="l01130"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a284b7de34a316110fdc98e7b753372b2"> 1130</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_imag.html#a284b7de34a316110fdc98e7b753372b2">Imag</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01131" name="l01131"></a><span class="lineno"> 1131</span> </div>
+<div class="line"><a id="l01132" name="l01132"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829"> 1132</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01133" name="l01133"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6"> 1133</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01134" name="l01134"></a><span class="lineno"> 1134</span> </div>
+<div class="line"><a id="l01135" name="l01135"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3"> 1135</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01136" name="l01136"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a"> 1136</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01137" name="l01137"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d"> 1137</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_imag.html">Imag</a>)</div>
+<div class="line"><a id="l01138" name="l01138"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5"> 1138</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01139" name="l01139"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48"> 1139</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01140" name="l01140"></a><span class="lineno"> 1140</span>};</div>
 </div>
-<div class="line"><a id="l01156" name="l01156"></a><span class="lineno"> 1156</span> </div>
-<div class="foldopen" id="foldopen01157" data-start="{" data-end="};">
-<div class="line"><a id="l01157" name="l01157"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html"> 1157</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_load.html">Load</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01158" name="l01158"></a><span class="lineno"> 1158</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01159" data-start="{" data-end="}">
-<div class="line"><a id="l01159" name="l01159"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a"> 1159</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a">Load</a>(</div>
-<div class="line"><a id="l01160" name="l01160"></a><span class="lineno"> 1160</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01161" name="l01161"></a><span class="lineno"> 1161</span>      std::shared_ptr&lt;io::Reader&gt; reader,</div>
-<div class="line"><a id="l01162" name="l01162"></a><span class="lineno"> 1162</span>      <span class="keywordtype">size_t</span> offset,</div>
-<div class="line"><a id="l01163" name="l01163"></a><span class="lineno"> 1163</span>      <span class="keywordtype">bool</span> swap_endianness = <span class="keyword">false</span>)</div>
-<div class="line"><a id="l01164" name="l01164"></a><span class="lineno"> 1164</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01165" name="l01165"></a><span class="lineno"> 1165</span>        reader_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(reader)),</div>
-<div class="line"><a id="l01166" name="l01166"></a><span class="lineno"> 1166</span>        offset_(offset),</div>
-<div class="line"><a id="l01167" name="l01167"></a><span class="lineno"> 1167</span>        swap_endianness_(swap_endianness) {</div>
-<div class="line"><a id="l01168" name="l01168"></a><span class="lineno"> 1168</span>    <span class="keywordflow">if</span> (stream.<a class="code hl_variable" href="structmlx_1_1core_1_1_stream.html#a406b1b0162287a4162fab1f70e2ff3bb">device</a> == Device::gpu) {</div>
-<div class="line"><a id="l01169" name="l01169"></a><span class="lineno"> 1169</span>      io_stream();</div>
-<div class="line"><a id="l01170" name="l01170"></a><span class="lineno"> 1170</span>    }</div>
-<div class="line"><a id="l01171" name="l01171"></a><span class="lineno"> 1171</span>  }</div>
+<div class="line"><a id="l01141" name="l01141"></a><span class="lineno"> 1141</span> </div>
+<div class="foldopen" id="foldopen01142" data-start="{" data-end="};">
+<div class="line"><a id="l01142" name="l01142"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html"> 1142</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_less.html">Less</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01143" name="l01143"></a><span class="lineno"> 1143</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01144" name="l01144"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7"> 1144</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7">Less</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01145" name="l01145"></a><span class="lineno"> 1145</span> </div>
+<div class="line"><a id="l01146" name="l01146"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef"> 1146</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01147" name="l01147"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917"> 1147</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01148" name="l01148"></a><span class="lineno"> 1148</span> </div>
+<div class="line"><a id="l01149" name="l01149"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e"> 1149</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01150" name="l01150"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce"> 1150</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01151" name="l01151"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78"> 1151</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_less.html">Less</a>)</div>
+<div class="line"><a id="l01152" name="l01152"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63"> 1152</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01153" name="l01153"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278"> 1153</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01154" name="l01154"></a><span class="lineno"> 1154</span> </div>
+<div class="line"><a id="l01155" name="l01155"></a><span class="lineno"> 1155</span> private:</div>
+<div class="line"><a id="l01156" name="l01156"></a><span class="lineno"> 1156</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01157" name="l01157"></a><span class="lineno"> 1157</span>};</div>
+</div>
+<div class="line"><a id="l01158" name="l01158"></a><span class="lineno"> 1158</span> </div>
+<div class="foldopen" id="foldopen01159" data-start="{" data-end="};">
+<div class="line"><a id="l01159" name="l01159"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html"> 1159</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_less_equal.html">LessEqual</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01160" name="l01160"></a><span class="lineno"> 1160</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01161" name="l01161"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc"> 1161</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc">LessEqual</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01162" name="l01162"></a><span class="lineno"> 1162</span> </div>
+<div class="line"><a id="l01163" name="l01163"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16"> 1163</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01164" name="l01164"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac"> 1164</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01165" name="l01165"></a><span class="lineno"> 1165</span> </div>
+<div class="line"><a id="l01166" name="l01166"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480"> 1166</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01167" name="l01167"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f"> 1167</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01168" name="l01168"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950"> 1168</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_less_equal.html">LessEqual</a>)</div>
+<div class="line"><a id="l01169" name="l01169"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af"> 1169</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01170" name="l01170"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f"> 1170</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01171" name="l01171"></a><span class="lineno"> 1171</span> </div>
+<div class="line"><a id="l01172" name="l01172"></a><span class="lineno"> 1172</span> private:</div>
+<div class="line"><a id="l01173" name="l01173"></a><span class="lineno"> 1173</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01174" name="l01174"></a><span class="lineno"> 1174</span>};</div>
 </div>
-<div class="line"><a id="l01172" name="l01172"></a><span class="lineno"> 1172</span> </div>
-<div class="line"><a id="l01173" name="l01173"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a"> 1173</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01174" name="l01174"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d"> 1174</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
 <div class="line"><a id="l01175" name="l01175"></a><span class="lineno"> 1175</span> </div>
-<div class="line"><a id="l01176" name="l01176"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa"> 1176</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_load.html">Load</a>)</div>
-<div class="line"><a id="l01177" name="l01177"></a><span class="lineno"> 1177</span> </div>
-<div class="line"><a id="l01178" name="l01178"></a><span class="lineno"> 1178</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l01179" name="l01179"></a><span class="lineno"> 1179</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a>&amp; io_stream() {</div>
-<div class="line"><a id="l01180" name="l01180"></a><span class="lineno"> 1180</span>    <span class="keyword">static</span> <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> io_stream = new_stream(Device::cpu);</div>
-<div class="line"><a id="l01181" name="l01181"></a><span class="lineno"> 1181</span>    <span class="keywordflow">return</span> io_stream;</div>
-<div class="line"><a id="l01182" name="l01182"></a><span class="lineno"> 1182</span>  };</div>
-<div class="line"><a id="l01183" name="l01183"></a><span class="lineno"> 1183</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01184" name="l01184"></a><span class="lineno"> 1184</span>  std::shared_ptr&lt;io::Reader&gt; reader_;</div>
-<div class="line"><a id="l01185" name="l01185"></a><span class="lineno"> 1185</span>  <span class="keywordtype">size_t</span> offset_;</div>
-<div class="line"><a id="l01186" name="l01186"></a><span class="lineno"> 1186</span>  <span class="keywordtype">bool</span> swap_endianness_;</div>
-<div class="line"><a id="l01187" name="l01187"></a><span class="lineno"> 1187</span>};</div>
+<div class="foldopen" id="foldopen01176" data-start="{" data-end="};">
+<div class="line"><a id="l01176" name="l01176"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html"> 1176</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_load.html">Load</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01177" name="l01177"></a><span class="lineno"> 1177</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01178" data-start="{" data-end="}">
+<div class="line"><a id="l01178" name="l01178"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a"> 1178</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a">Load</a>(</div>
+<div class="line"><a id="l01179" name="l01179"></a><span class="lineno"> 1179</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01180" name="l01180"></a><span class="lineno"> 1180</span>      std::shared_ptr&lt;io::Reader&gt; reader,</div>
+<div class="line"><a id="l01181" name="l01181"></a><span class="lineno"> 1181</span>      <span class="keywordtype">size_t</span> offset,</div>
+<div class="line"><a id="l01182" name="l01182"></a><span class="lineno"> 1182</span>      <span class="keywordtype">bool</span> swap_endianness = <span class="keyword">false</span>)</div>
+<div class="line"><a id="l01183" name="l01183"></a><span class="lineno"> 1183</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01184" name="l01184"></a><span class="lineno"> 1184</span>        reader_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(reader)),</div>
+<div class="line"><a id="l01185" name="l01185"></a><span class="lineno"> 1185</span>        offset_(offset),</div>
+<div class="line"><a id="l01186" name="l01186"></a><span class="lineno"> 1186</span>        swap_endianness_(swap_endianness) {</div>
+<div class="line"><a id="l01187" name="l01187"></a><span class="lineno"> 1187</span>    <span class="keywordflow">if</span> (stream.<a class="code hl_variable" href="structmlx_1_1core_1_1_stream.html#a406b1b0162287a4162fab1f70e2ff3bb">device</a> == Device::gpu) {</div>
+<div class="line"><a id="l01188" name="l01188"></a><span class="lineno"> 1188</span>      io_stream();</div>
+<div class="line"><a id="l01189" name="l01189"></a><span class="lineno"> 1189</span>    }</div>
+<div class="line"><a id="l01190" name="l01190"></a><span class="lineno"> 1190</span>  }</div>
 </div>
-<div class="line"><a id="l01188" name="l01188"></a><span class="lineno"> 1188</span> </div>
-<div class="foldopen" id="foldopen01189" data-start="{" data-end="};">
-<div class="line"><a id="l01189" name="l01189"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html"> 1189</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_log.html">Log</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01190" name="l01190"></a><span class="lineno"> 1190</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01191" name="l01191"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421"> 1191</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421">Base</a> { two, ten, e };</div>
-<div class="line"><a id="l01192" name="l01192"></a><span class="lineno"> 1192</span> </div>
-<div class="foldopen" id="foldopen01193" data-start="{" data-end="}">
-<div class="line"><a id="l01193" name="l01193"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9"> 1193</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9">Log</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <a class="code hl_enumeration" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421">Base</a> base)</div>
-<div class="line"><a id="l01194" name="l01194"></a><span class="lineno"> 1194</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), base_(base) {}</div>
+<div class="line"><a id="l01191" name="l01191"></a><span class="lineno"> 1191</span> </div>
+<div class="line"><a id="l01192" name="l01192"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a"> 1192</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01193" name="l01193"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d"> 1193</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01194" name="l01194"></a><span class="lineno"> 1194</span> </div>
+<div class="line"><a id="l01195" name="l01195"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa"> 1195</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_load.html">Load</a>)</div>
+<div class="line"><a id="l01196" name="l01196"></a><span class="lineno"> 1196</span> </div>
+<div class="line"><a id="l01197" name="l01197"></a><span class="lineno"> 1197</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l01198" name="l01198"></a><span class="lineno"> 1198</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a>&amp; io_stream() {</div>
+<div class="line"><a id="l01199" name="l01199"></a><span class="lineno"> 1199</span>    <span class="keyword">static</span> <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> io_stream = new_stream(Device::cpu);</div>
+<div class="line"><a id="l01200" name="l01200"></a><span class="lineno"> 1200</span>    <span class="keywordflow">return</span> io_stream;</div>
+<div class="line"><a id="l01201" name="l01201"></a><span class="lineno"> 1201</span>  };</div>
+<div class="line"><a id="l01202" name="l01202"></a><span class="lineno"> 1202</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01203" name="l01203"></a><span class="lineno"> 1203</span>  std::shared_ptr&lt;io::Reader&gt; reader_;</div>
+<div class="line"><a id="l01204" name="l01204"></a><span class="lineno"> 1204</span>  <span class="keywordtype">size_t</span> offset_;</div>
+<div class="line"><a id="l01205" name="l01205"></a><span class="lineno"> 1205</span>  <span class="keywordtype">bool</span> swap_endianness_;</div>
+<div class="line"><a id="l01206" name="l01206"></a><span class="lineno"> 1206</span>};</div>
 </div>
-<div class="line"><a id="l01195" name="l01195"></a><span class="lineno"> 1195</span> </div>
-<div class="line"><a id="l01196" name="l01196"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f"> 1196</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01197" name="l01197"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390"> 1197</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01198" name="l01198"></a><span class="lineno"> 1198</span> </div>
-<div class="line"><a id="l01199" name="l01199"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49"> 1199</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01200" name="l01200"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832"> 1200</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01201" name="l01201"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8"> 1201</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01202" name="l01202"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d"> 1202</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01203" name="l01203"></a><span class="lineno"> 1203</span> </div>
-<div class="foldopen" id="foldopen01204" data-start="{" data-end="}">
-<div class="line"><a id="l01204" name="l01204"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d"> 1204</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l01205" name="l01205"></a><span class="lineno"> 1205</span>    <span class="keywordflow">switch</span> (base_) {</div>
-<div class="line"><a id="l01206" name="l01206"></a><span class="lineno"> 1206</span>      <span class="keywordflow">case</span> e:</div>
-<div class="line"><a id="l01207" name="l01207"></a><span class="lineno"> 1207</span>        os &lt;&lt; <span class="stringliteral">&quot;Log&quot;</span>;</div>
-<div class="line"><a id="l01208" name="l01208"></a><span class="lineno"> 1208</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01209" name="l01209"></a><span class="lineno"> 1209</span>      <span class="keywordflow">case</span> two:</div>
-<div class="line"><a id="l01210" name="l01210"></a><span class="lineno"> 1210</span>        os &lt;&lt; <span class="stringliteral">&quot;Log2&quot;</span>;</div>
-<div class="line"><a id="l01211" name="l01211"></a><span class="lineno"> 1211</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01212" name="l01212"></a><span class="lineno"> 1212</span>      <span class="keywordflow">case</span> ten:</div>
-<div class="line"><a id="l01213" name="l01213"></a><span class="lineno"> 1213</span>        os &lt;&lt; <span class="stringliteral">&quot;Log10&quot;</span>;</div>
-<div class="line"><a id="l01214" name="l01214"></a><span class="lineno"> 1214</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01215" name="l01215"></a><span class="lineno"> 1215</span>    }</div>
-<div class="line"><a id="l01216" name="l01216"></a><span class="lineno"> 1216</span>  }</div>
+<div class="line"><a id="l01207" name="l01207"></a><span class="lineno"> 1207</span> </div>
+<div class="foldopen" id="foldopen01208" data-start="{" data-end="};">
+<div class="line"><a id="l01208" name="l01208"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html"> 1208</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_log.html">Log</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01209" name="l01209"></a><span class="lineno"> 1209</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01210" name="l01210"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421"> 1210</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421">Base</a> { two, ten, e };</div>
+<div class="line"><a id="l01211" name="l01211"></a><span class="lineno"> 1211</span> </div>
+<div class="foldopen" id="foldopen01212" data-start="{" data-end="}">
+<div class="line"><a id="l01212" name="l01212"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9"> 1212</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9">Log</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <a class="code hl_enumeration" href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421">Base</a> base)</div>
+<div class="line"><a id="l01213" name="l01213"></a><span class="lineno"> 1213</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), base_(base) {}</div>
 </div>
+<div class="line"><a id="l01214" name="l01214"></a><span class="lineno"> 1214</span> </div>
+<div class="line"><a id="l01215" name="l01215"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f"> 1215</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01216" name="l01216"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390"> 1216</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
 <div class="line"><a id="l01217" name="l01217"></a><span class="lineno"> 1217</span> </div>
-<div class="line"><a id="l01218" name="l01218"></a><span class="lineno"> 1218</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l01219" name="l01219"></a><span class="lineno"> 1219</span>  Base base_;</div>
-<div class="line"><a id="l01220" name="l01220"></a><span class="lineno"> 1220</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01221" name="l01221"></a><span class="lineno"> 1221</span>};</div>
-</div>
+<div class="line"><a id="l01218" name="l01218"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49"> 1218</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01219" name="l01219"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832"> 1219</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01220" name="l01220"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8"> 1220</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01221" name="l01221"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d"> 1221</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
 <div class="line"><a id="l01222" name="l01222"></a><span class="lineno"> 1222</span> </div>
-<div class="foldopen" id="foldopen01223" data-start="{" data-end="};">
-<div class="line"><a id="l01223" name="l01223"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html"> 1223</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_log1p.html">Log1p</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01224" name="l01224"></a><span class="lineno"> 1224</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01225" name="l01225"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a"> 1225</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a">Log1p</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01226" name="l01226"></a><span class="lineno"> 1226</span> </div>
-<div class="line"><a id="l01227" name="l01227"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23"> 1227</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01228" name="l01228"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431"> 1228</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01229" name="l01229"></a><span class="lineno"> 1229</span> </div>
-<div class="line"><a id="l01230" name="l01230"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71"> 1230</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01231" name="l01231"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2"> 1231</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01232" name="l01232"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4"> 1232</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_log1p.html">Log1p</a>)</div>
-<div class="line"><a id="l01233" name="l01233"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df"> 1233</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01234" name="l01234"></a><span class="lineno"> 1234</span> </div>
-<div class="line"><a id="l01235" name="l01235"></a><span class="lineno"> 1235</span> private:</div>
-<div class="line"><a id="l01236" name="l01236"></a><span class="lineno"> 1236</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01237" name="l01237"></a><span class="lineno"> 1237</span>};</div>
+<div class="foldopen" id="foldopen01223" data-start="{" data-end="}">
+<div class="line"><a id="l01223" name="l01223"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d"> 1223</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l01224" name="l01224"></a><span class="lineno"> 1224</span>    <span class="keywordflow">switch</span> (base_) {</div>
+<div class="line"><a id="l01225" name="l01225"></a><span class="lineno"> 1225</span>      <span class="keywordflow">case</span> e:</div>
+<div class="line"><a id="l01226" name="l01226"></a><span class="lineno"> 1226</span>        os &lt;&lt; <span class="stringliteral">&quot;Log&quot;</span>;</div>
+<div class="line"><a id="l01227" name="l01227"></a><span class="lineno"> 1227</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01228" name="l01228"></a><span class="lineno"> 1228</span>      <span class="keywordflow">case</span> two:</div>
+<div class="line"><a id="l01229" name="l01229"></a><span class="lineno"> 1229</span>        os &lt;&lt; <span class="stringliteral">&quot;Log2&quot;</span>;</div>
+<div class="line"><a id="l01230" name="l01230"></a><span class="lineno"> 1230</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01231" name="l01231"></a><span class="lineno"> 1231</span>      <span class="keywordflow">case</span> ten:</div>
+<div class="line"><a id="l01232" name="l01232"></a><span class="lineno"> 1232</span>        os &lt;&lt; <span class="stringliteral">&quot;Log10&quot;</span>;</div>
+<div class="line"><a id="l01233" name="l01233"></a><span class="lineno"> 1233</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01234" name="l01234"></a><span class="lineno"> 1234</span>    }</div>
+<div class="line"><a id="l01235" name="l01235"></a><span class="lineno"> 1235</span>  }</div>
 </div>
-<div class="line"><a id="l01238" name="l01238"></a><span class="lineno"> 1238</span> </div>
-<div class="foldopen" id="foldopen01239" data-start="{" data-end="};">
-<div class="line"><a id="l01239" name="l01239"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html"> 1239</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_logical_not.html">LogicalNot</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01240" name="l01240"></a><span class="lineno"> 1240</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01241" name="l01241"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7"> 1241</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7">LogicalNot</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01242" name="l01242"></a><span class="lineno"> 1242</span> </div>
-<div class="line"><a id="l01243" name="l01243"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3"> 1243</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01244" name="l01244"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a"> 1244</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01236" name="l01236"></a><span class="lineno"> 1236</span> </div>
+<div class="line"><a id="l01237" name="l01237"></a><span class="lineno"> 1237</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l01238" name="l01238"></a><span class="lineno"> 1238</span>  Base base_;</div>
+<div class="line"><a id="l01239" name="l01239"></a><span class="lineno"> 1239</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01240" name="l01240"></a><span class="lineno"> 1240</span>};</div>
+</div>
+<div class="line"><a id="l01241" name="l01241"></a><span class="lineno"> 1241</span> </div>
+<div class="foldopen" id="foldopen01242" data-start="{" data-end="};">
+<div class="line"><a id="l01242" name="l01242"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html"> 1242</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_log1p.html">Log1p</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01243" name="l01243"></a><span class="lineno"> 1243</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01244" name="l01244"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a"> 1244</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a">Log1p</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
 <div class="line"><a id="l01245" name="l01245"></a><span class="lineno"> 1245</span> </div>
-<div class="line"><a id="l01246" name="l01246"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d"> 1246</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01247" name="l01247"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c"> 1247</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01248" name="l01248"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c"> 1248</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_logical_not.html">LogicalNot</a>)</div>
-<div class="line"><a id="l01249" name="l01249"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99"> 1249</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01250" name="l01250"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c"> 1250</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01251" name="l01251"></a><span class="lineno"> 1251</span> </div>
-<div class="line"><a id="l01252" name="l01252"></a><span class="lineno"> 1252</span> private:</div>
-<div class="line"><a id="l01253" name="l01253"></a><span class="lineno"> 1253</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01254" name="l01254"></a><span class="lineno"> 1254</span>};</div>
+<div class="line"><a id="l01246" name="l01246"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23"> 1246</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01247" name="l01247"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431"> 1247</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01248" name="l01248"></a><span class="lineno"> 1248</span> </div>
+<div class="line"><a id="l01249" name="l01249"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71"> 1249</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01250" name="l01250"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2"> 1250</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01251" name="l01251"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4"> 1251</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_log1p.html">Log1p</a>)</div>
+<div class="line"><a id="l01252" name="l01252"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df"> 1252</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01253" name="l01253"></a><span class="lineno"> 1253</span> </div>
+<div class="line"><a id="l01254" name="l01254"></a><span class="lineno"> 1254</span> private:</div>
+<div class="line"><a id="l01255" name="l01255"></a><span class="lineno"> 1255</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01256" name="l01256"></a><span class="lineno"> 1256</span>};</div>
 </div>
-<div class="line"><a id="l01255" name="l01255"></a><span class="lineno"> 1255</span> </div>
-<div class="foldopen" id="foldopen01256" data-start="{" data-end="};">
-<div class="line"><a id="l01256" name="l01256"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html"> 1256</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_logical_and.html">LogicalAnd</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01257" name="l01257"></a><span class="lineno"> 1257</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01258" name="l01258"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3"> 1258</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3">LogicalAnd</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01259" name="l01259"></a><span class="lineno"> 1259</span> </div>
-<div class="line"><a id="l01260" name="l01260"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3"> 1260</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01261" name="l01261"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f"> 1261</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01262" name="l01262"></a><span class="lineno"> 1262</span> </div>
-<div class="line"><a id="l01263" name="l01263"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5"> 1263</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01264" name="l01264"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434"> 1264</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01265" name="l01265"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397"> 1265</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_logical_and.html">LogicalAnd</a>)</div>
-<div class="line"><a id="l01266" name="l01266"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be"> 1266</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01267" name="l01267"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617"> 1267</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01268" name="l01268"></a><span class="lineno"> 1268</span> </div>
-<div class="line"><a id="l01269" name="l01269"></a><span class="lineno"> 1269</span> private:</div>
-<div class="line"><a id="l01270" name="l01270"></a><span class="lineno"> 1270</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01271" name="l01271"></a><span class="lineno"> 1271</span>};</div>
+<div class="line"><a id="l01257" name="l01257"></a><span class="lineno"> 1257</span> </div>
+<div class="foldopen" id="foldopen01258" data-start="{" data-end="};">
+<div class="line"><a id="l01258" name="l01258"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html"> 1258</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_logical_not.html">LogicalNot</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01259" name="l01259"></a><span class="lineno"> 1259</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01260" name="l01260"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7"> 1260</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7">LogicalNot</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01261" name="l01261"></a><span class="lineno"> 1261</span> </div>
+<div class="line"><a id="l01262" name="l01262"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3"> 1262</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01263" name="l01263"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a"> 1263</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01264" name="l01264"></a><span class="lineno"> 1264</span> </div>
+<div class="line"><a id="l01265" name="l01265"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d"> 1265</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01266" name="l01266"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c"> 1266</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01267" name="l01267"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c"> 1267</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_logical_not.html">LogicalNot</a>)</div>
+<div class="line"><a id="l01268" name="l01268"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99"> 1268</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01269" name="l01269"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c"> 1269</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01270" name="l01270"></a><span class="lineno"> 1270</span> </div>
+<div class="line"><a id="l01271" name="l01271"></a><span class="lineno"> 1271</span> private:</div>
+<div class="line"><a id="l01272" name="l01272"></a><span class="lineno"> 1272</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01273" name="l01273"></a><span class="lineno"> 1273</span>};</div>
 </div>
-<div class="line"><a id="l01272" name="l01272"></a><span class="lineno"> 1272</span> </div>
-<div class="foldopen" id="foldopen01273" data-start="{" data-end="};">
-<div class="line"><a id="l01273" name="l01273"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html"> 1273</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_logical_or.html">LogicalOr</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01274" name="l01274"></a><span class="lineno"> 1274</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01275" name="l01275"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918"> 1275</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918">LogicalOr</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01276" name="l01276"></a><span class="lineno"> 1276</span> </div>
-<div class="line"><a id="l01277" name="l01277"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62"> 1277</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01278" name="l01278"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a"> 1278</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01279" name="l01279"></a><span class="lineno"> 1279</span> </div>
-<div class="line"><a id="l01280" name="l01280"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3"> 1280</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01281" name="l01281"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4"> 1281</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01282" name="l01282"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003"> 1282</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_logical_or.html">LogicalOr</a>)</div>
-<div class="line"><a id="l01283" name="l01283"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71"> 1283</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01284" name="l01284"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4"> 1284</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01285" name="l01285"></a><span class="lineno"> 1285</span> </div>
-<div class="line"><a id="l01286" name="l01286"></a><span class="lineno"> 1286</span> private:</div>
-<div class="line"><a id="l01287" name="l01287"></a><span class="lineno"> 1287</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01288" name="l01288"></a><span class="lineno"> 1288</span>};</div>
+<div class="line"><a id="l01274" name="l01274"></a><span class="lineno"> 1274</span> </div>
+<div class="foldopen" id="foldopen01275" data-start="{" data-end="};">
+<div class="line"><a id="l01275" name="l01275"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html"> 1275</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_logical_and.html">LogicalAnd</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01276" name="l01276"></a><span class="lineno"> 1276</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01277" name="l01277"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3"> 1277</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3">LogicalAnd</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01278" name="l01278"></a><span class="lineno"> 1278</span> </div>
+<div class="line"><a id="l01279" name="l01279"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3"> 1279</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01280" name="l01280"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f"> 1280</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01281" name="l01281"></a><span class="lineno"> 1281</span> </div>
+<div class="line"><a id="l01282" name="l01282"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5"> 1282</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01283" name="l01283"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434"> 1283</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01284" name="l01284"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397"> 1284</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_logical_and.html">LogicalAnd</a>)</div>
+<div class="line"><a id="l01285" name="l01285"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be"> 1285</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01286" name="l01286"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617"> 1286</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01287" name="l01287"></a><span class="lineno"> 1287</span> </div>
+<div class="line"><a id="l01288" name="l01288"></a><span class="lineno"> 1288</span> private:</div>
+<div class="line"><a id="l01289" name="l01289"></a><span class="lineno"> 1289</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01290" name="l01290"></a><span class="lineno"> 1290</span>};</div>
 </div>
-<div class="line"><a id="l01289" name="l01289"></a><span class="lineno"> 1289</span> </div>
-<div class="foldopen" id="foldopen01290" data-start="{" data-end="};">
-<div class="line"><a id="l01290" name="l01290"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html"> 1290</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_log_add_exp.html">LogAddExp</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01291" name="l01291"></a><span class="lineno"> 1291</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01292" name="l01292"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a"> 1292</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a">LogAddExp</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01293" name="l01293"></a><span class="lineno"> 1293</span> </div>
-<div class="line"><a id="l01294" name="l01294"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0"> 1294</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01295" name="l01295"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a"> 1295</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01296" name="l01296"></a><span class="lineno"> 1296</span> </div>
-<div class="line"><a id="l01297" name="l01297"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78"> 1297</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01298" name="l01298"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329"> 1298</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01299" name="l01299"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9"> 1299</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_log_add_exp.html">LogAddExp</a>)</div>
-<div class="line"><a id="l01300" name="l01300"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4"> 1300</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01301" name="l01301"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635"> 1301</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01302" name="l01302"></a><span class="lineno"> 1302</span> </div>
-<div class="line"><a id="l01303" name="l01303"></a><span class="lineno"> 1303</span> private:</div>
-<div class="line"><a id="l01304" name="l01304"></a><span class="lineno"> 1304</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01305" name="l01305"></a><span class="lineno"> 1305</span>};</div>
+<div class="line"><a id="l01291" name="l01291"></a><span class="lineno"> 1291</span> </div>
+<div class="foldopen" id="foldopen01292" data-start="{" data-end="};">
+<div class="line"><a id="l01292" name="l01292"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html"> 1292</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_logical_or.html">LogicalOr</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01293" name="l01293"></a><span class="lineno"> 1293</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01294" name="l01294"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918"> 1294</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918">LogicalOr</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01295" name="l01295"></a><span class="lineno"> 1295</span> </div>
+<div class="line"><a id="l01296" name="l01296"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62"> 1296</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01297" name="l01297"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a"> 1297</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01298" name="l01298"></a><span class="lineno"> 1298</span> </div>
+<div class="line"><a id="l01299" name="l01299"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3"> 1299</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01300" name="l01300"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4"> 1300</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01301" name="l01301"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003"> 1301</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_logical_or.html">LogicalOr</a>)</div>
+<div class="line"><a id="l01302" name="l01302"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71"> 1302</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01303" name="l01303"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4"> 1303</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01304" name="l01304"></a><span class="lineno"> 1304</span> </div>
+<div class="line"><a id="l01305" name="l01305"></a><span class="lineno"> 1305</span> private:</div>
+<div class="line"><a id="l01306" name="l01306"></a><span class="lineno"> 1306</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01307" name="l01307"></a><span class="lineno"> 1307</span>};</div>
 </div>
-<div class="line"><a id="l01306" name="l01306"></a><span class="lineno"> 1306</span> </div>
-<div class="foldopen" id="foldopen01307" data-start="{" data-end="};">
-<div class="line"><a id="l01307" name="l01307"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html"> 1307</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_matmul.html">Matmul</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01308" name="l01308"></a><span class="lineno"> 1308</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01309" name="l01309"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7"> 1309</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7">Matmul</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01310" name="l01310"></a><span class="lineno"> 1310</span> </div>
-<div class="line"><a id="l01311" name="l01311"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc"> 1311</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01312" name="l01312"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7"> 1312</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01313" name="l01313"></a><span class="lineno"> 1313</span> </div>
-<div class="line"><a id="l01314" name="l01314"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0"> 1314</a></span>  std::vector&lt;array&gt; <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">vjp</a>(</div>
-<div class="line"><a id="l01315" name="l01315"></a><span class="lineno"> 1315</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; primals,</div>
-<div class="line"><a id="l01316" name="l01316"></a><span class="lineno"> 1316</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; cotangents,</div>
-<div class="line"><a id="l01317" name="l01317"></a><span class="lineno"> 1317</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; argnums,</div>
-<div class="line"><a id="l01318" name="l01318"></a><span class="lineno"> 1318</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; outputs) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01319" name="l01319"></a><span class="lineno"> 1319</span> </div>
-<div class="line"><a id="l01320" name="l01320"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2"> 1320</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01321" name="l01321"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd"> 1321</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_matmul.html">Matmul</a>)</div>
-<div class="line"><a id="l01322" name="l01322"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630"> 1322</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01323" name="l01323"></a><span class="lineno"> 1323</span>};</div>
+<div class="line"><a id="l01308" name="l01308"></a><span class="lineno"> 1308</span> </div>
+<div class="foldopen" id="foldopen01309" data-start="{" data-end="};">
+<div class="line"><a id="l01309" name="l01309"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html"> 1309</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_log_add_exp.html">LogAddExp</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01310" name="l01310"></a><span class="lineno"> 1310</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01311" name="l01311"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a"> 1311</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a">LogAddExp</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01312" name="l01312"></a><span class="lineno"> 1312</span> </div>
+<div class="line"><a id="l01313" name="l01313"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0"> 1313</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01314" name="l01314"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a"> 1314</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01315" name="l01315"></a><span class="lineno"> 1315</span> </div>
+<div class="line"><a id="l01316" name="l01316"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78"> 1316</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01317" name="l01317"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329"> 1317</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01318" name="l01318"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9"> 1318</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_log_add_exp.html">LogAddExp</a>)</div>
+<div class="line"><a id="l01319" name="l01319"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4"> 1319</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01320" name="l01320"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635"> 1320</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01321" name="l01321"></a><span class="lineno"> 1321</span> </div>
+<div class="line"><a id="l01322" name="l01322"></a><span class="lineno"> 1322</span> private:</div>
+<div class="line"><a id="l01323" name="l01323"></a><span class="lineno"> 1323</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01324" name="l01324"></a><span class="lineno"> 1324</span>};</div>
 </div>
-<div class="line"><a id="l01324" name="l01324"></a><span class="lineno"> 1324</span> </div>
-<div class="foldopen" id="foldopen01325" data-start="{" data-end="};">
-<div class="line"><a id="l01325" name="l01325"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html"> 1325</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_maximum.html">Maximum</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01326" name="l01326"></a><span class="lineno"> 1326</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01327" name="l01327"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816"> 1327</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816">Maximum</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01328" name="l01328"></a><span class="lineno"> 1328</span> </div>
-<div class="line"><a id="l01329" name="l01329"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf"> 1329</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01330" name="l01330"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7"> 1330</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01331" name="l01331"></a><span class="lineno"> 1331</span> </div>
-<div class="line"><a id="l01332" name="l01332"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3"> 1332</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01333" name="l01333"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39"> 1333</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01334" name="l01334"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca"> 1334</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_maximum.html">Maximum</a>)</div>
-<div class="line"><a id="l01335" name="l01335"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46"> 1335</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01336" name="l01336"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b"> 1336</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01337" name="l01337"></a><span class="lineno"> 1337</span> </div>
-<div class="line"><a id="l01338" name="l01338"></a><span class="lineno"> 1338</span> private:</div>
-<div class="line"><a id="l01339" name="l01339"></a><span class="lineno"> 1339</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01340" name="l01340"></a><span class="lineno"> 1340</span>};</div>
+<div class="line"><a id="l01325" name="l01325"></a><span class="lineno"> 1325</span> </div>
+<div class="foldopen" id="foldopen01326" data-start="{" data-end="};">
+<div class="line"><a id="l01326" name="l01326"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html"> 1326</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_matmul.html">Matmul</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01327" name="l01327"></a><span class="lineno"> 1327</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01328" name="l01328"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7"> 1328</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7">Matmul</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01329" name="l01329"></a><span class="lineno"> 1329</span> </div>
+<div class="line"><a id="l01330" name="l01330"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc"> 1330</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01331" name="l01331"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7"> 1331</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01332" name="l01332"></a><span class="lineno"> 1332</span> </div>
+<div class="line"><a id="l01333" name="l01333"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0"> 1333</a></span>  std::vector&lt;array&gt; <a class="code hl_function" href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">vjp</a>(</div>
+<div class="line"><a id="l01334" name="l01334"></a><span class="lineno"> 1334</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; primals,</div>
+<div class="line"><a id="l01335" name="l01335"></a><span class="lineno"> 1335</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; cotangents,</div>
+<div class="line"><a id="l01336" name="l01336"></a><span class="lineno"> 1336</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; argnums,</div>
+<div class="line"><a id="l01337" name="l01337"></a><span class="lineno"> 1337</span>      <span class="keyword">const</span> std::vector&lt;array&gt;&amp; outputs) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01338" name="l01338"></a><span class="lineno"> 1338</span> </div>
+<div class="line"><a id="l01339" name="l01339"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2"> 1339</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01340" name="l01340"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd"> 1340</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_matmul.html">Matmul</a>)</div>
+<div class="line"><a id="l01341" name="l01341"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630"> 1341</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01342" name="l01342"></a><span class="lineno"> 1342</span>};</div>
 </div>
-<div class="line"><a id="l01341" name="l01341"></a><span class="lineno"> 1341</span> </div>
-<div class="foldopen" id="foldopen01342" data-start="{" data-end="};">
-<div class="line"><a id="l01342" name="l01342"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html"> 1342</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_minimum.html">Minimum</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01343" name="l01343"></a><span class="lineno"> 1343</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01344" name="l01344"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5"> 1344</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5">Minimum</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01345" name="l01345"></a><span class="lineno"> 1345</span> </div>
-<div class="line"><a id="l01346" name="l01346"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e"> 1346</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01347" name="l01347"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba"> 1347</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01348" name="l01348"></a><span class="lineno"> 1348</span> </div>
-<div class="line"><a id="l01349" name="l01349"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980"> 1349</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01350" name="l01350"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038"> 1350</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01351" name="l01351"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512"> 1351</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_minimum.html">Minimum</a>)</div>
-<div class="line"><a id="l01352" name="l01352"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4"> 1352</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01353" name="l01353"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70"> 1353</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01354" name="l01354"></a><span class="lineno"> 1354</span> </div>
-<div class="line"><a id="l01355" name="l01355"></a><span class="lineno"> 1355</span> private:</div>
-<div class="line"><a id="l01356" name="l01356"></a><span class="lineno"> 1356</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01357" name="l01357"></a><span class="lineno"> 1357</span>};</div>
+<div class="line"><a id="l01343" name="l01343"></a><span class="lineno"> 1343</span> </div>
+<div class="foldopen" id="foldopen01344" data-start="{" data-end="};">
+<div class="line"><a id="l01344" name="l01344"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html"> 1344</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_maximum.html">Maximum</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01345" name="l01345"></a><span class="lineno"> 1345</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01346" name="l01346"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816"> 1346</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816">Maximum</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01347" name="l01347"></a><span class="lineno"> 1347</span> </div>
+<div class="line"><a id="l01348" name="l01348"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf"> 1348</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01349" name="l01349"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7"> 1349</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01350" name="l01350"></a><span class="lineno"> 1350</span> </div>
+<div class="line"><a id="l01351" name="l01351"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3"> 1351</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01352" name="l01352"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39"> 1352</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01353" name="l01353"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca"> 1353</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_maximum.html">Maximum</a>)</div>
+<div class="line"><a id="l01354" name="l01354"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46"> 1354</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01355" name="l01355"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b"> 1355</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01356" name="l01356"></a><span class="lineno"> 1356</span> </div>
+<div class="line"><a id="l01357" name="l01357"></a><span class="lineno"> 1357</span> private:</div>
+<div class="line"><a id="l01358" name="l01358"></a><span class="lineno"> 1358</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01359" name="l01359"></a><span class="lineno"> 1359</span>};</div>
 </div>
-<div class="line"><a id="l01358" name="l01358"></a><span class="lineno"> 1358</span> </div>
-<div class="foldopen" id="foldopen01359" data-start="{" data-end="};">
-<div class="line"><a id="l01359" name="l01359"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html"> 1359</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_multiply.html">Multiply</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01360" name="l01360"></a><span class="lineno"> 1360</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01361" name="l01361"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c"> 1361</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c">Multiply</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01362" name="l01362"></a><span class="lineno"> 1362</span> </div>
-<div class="line"><a id="l01363" name="l01363"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34"> 1363</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01364" name="l01364"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0"> 1364</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01365" name="l01365"></a><span class="lineno"> 1365</span> </div>
-<div class="line"><a id="l01366" name="l01366"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf"> 1366</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01367" name="l01367"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4"> 1367</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01368" name="l01368"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909"> 1368</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_multiply.html">Multiply</a>)</div>
-<div class="line"><a id="l01369" name="l01369"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2"> 1369</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01370" name="l01370"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061"> 1370</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01371" name="l01371"></a><span class="lineno"> 1371</span> </div>
-<div class="line"><a id="l01372" name="l01372"></a><span class="lineno"> 1372</span> private:</div>
-<div class="line"><a id="l01373" name="l01373"></a><span class="lineno"> 1373</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01374" name="l01374"></a><span class="lineno"> 1374</span>};</div>
+<div class="line"><a id="l01360" name="l01360"></a><span class="lineno"> 1360</span> </div>
+<div class="foldopen" id="foldopen01361" data-start="{" data-end="};">
+<div class="line"><a id="l01361" name="l01361"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html"> 1361</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_minimum.html">Minimum</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01362" name="l01362"></a><span class="lineno"> 1362</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01363" name="l01363"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5"> 1363</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5">Minimum</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01364" name="l01364"></a><span class="lineno"> 1364</span> </div>
+<div class="line"><a id="l01365" name="l01365"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e"> 1365</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01366" name="l01366"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba"> 1366</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01367" name="l01367"></a><span class="lineno"> 1367</span> </div>
+<div class="line"><a id="l01368" name="l01368"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980"> 1368</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01369" name="l01369"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038"> 1369</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01370" name="l01370"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512"> 1370</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_minimum.html">Minimum</a>)</div>
+<div class="line"><a id="l01371" name="l01371"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4"> 1371</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01372" name="l01372"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70"> 1372</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01373" name="l01373"></a><span class="lineno"> 1373</span> </div>
+<div class="line"><a id="l01374" name="l01374"></a><span class="lineno"> 1374</span> private:</div>
+<div class="line"><a id="l01375" name="l01375"></a><span class="lineno"> 1375</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01376" name="l01376"></a><span class="lineno"> 1376</span>};</div>
 </div>
-<div class="line"><a id="l01375" name="l01375"></a><span class="lineno"> 1375</span> </div>
-<div class="foldopen" id="foldopen01376" data-start="{" data-end="};">
-<div class="line"><a id="l01376" name="l01376"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html"> 1376</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_negative.html">Negative</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01377" name="l01377"></a><span class="lineno"> 1377</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01378" name="l01378"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70"> 1378</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70">Negative</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01379" name="l01379"></a><span class="lineno"> 1379</span> </div>
-<div class="line"><a id="l01380" name="l01380"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b"> 1380</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01381" name="l01381"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b"> 1381</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01382" name="l01382"></a><span class="lineno"> 1382</span> </div>
-<div class="line"><a id="l01383" name="l01383"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0"> 1383</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01384" name="l01384"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979"> 1384</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01385" name="l01385"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91"> 1385</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_negative.html">Negative</a>)</div>
-<div class="line"><a id="l01386" name="l01386"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823"> 1386</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01387" name="l01387"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014"> 1387</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01388" name="l01388"></a><span class="lineno"> 1388</span> </div>
-<div class="line"><a id="l01389" name="l01389"></a><span class="lineno"> 1389</span> private:</div>
-<div class="line"><a id="l01390" name="l01390"></a><span class="lineno"> 1390</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01391" name="l01391"></a><span class="lineno"> 1391</span>};</div>
+<div class="line"><a id="l01377" name="l01377"></a><span class="lineno"> 1377</span> </div>
+<div class="foldopen" id="foldopen01378" data-start="{" data-end="};">
+<div class="line"><a id="l01378" name="l01378"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html"> 1378</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_multiply.html">Multiply</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01379" name="l01379"></a><span class="lineno"> 1379</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01380" name="l01380"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c"> 1380</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c">Multiply</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01381" name="l01381"></a><span class="lineno"> 1381</span> </div>
+<div class="line"><a id="l01382" name="l01382"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34"> 1382</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01383" name="l01383"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0"> 1383</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01384" name="l01384"></a><span class="lineno"> 1384</span> </div>
+<div class="line"><a id="l01385" name="l01385"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf"> 1385</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01386" name="l01386"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4"> 1386</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01387" name="l01387"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909"> 1387</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_multiply.html">Multiply</a>)</div>
+<div class="line"><a id="l01388" name="l01388"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2"> 1388</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01389" name="l01389"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061"> 1389</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01390" name="l01390"></a><span class="lineno"> 1390</span> </div>
+<div class="line"><a id="l01391" name="l01391"></a><span class="lineno"> 1391</span> private:</div>
+<div class="line"><a id="l01392" name="l01392"></a><span class="lineno"> 1392</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01393" name="l01393"></a><span class="lineno"> 1393</span>};</div>
 </div>
-<div class="line"><a id="l01392" name="l01392"></a><span class="lineno"> 1392</span> </div>
-<div class="foldopen" id="foldopen01393" data-start="{" data-end="};">
-<div class="line"><a id="l01393" name="l01393"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html"> 1393</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_not_equal.html">NotEqual</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01394" name="l01394"></a><span class="lineno"> 1394</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01395" name="l01395"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9"> 1395</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9">NotEqual</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01396" name="l01396"></a><span class="lineno"> 1396</span> </div>
-<div class="line"><a id="l01397" name="l01397"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047"> 1397</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01398" name="l01398"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2"> 1398</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01399" name="l01399"></a><span class="lineno"> 1399</span> </div>
-<div class="line"><a id="l01400" name="l01400"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5"> 1400</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01401" name="l01401"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17"> 1401</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01402" name="l01402"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09"> 1402</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_not_equal.html">NotEqual</a>)</div>
-<div class="line"><a id="l01403" name="l01403"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d"> 1403</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01404" name="l01404"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a"> 1404</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01405" name="l01405"></a><span class="lineno"> 1405</span> </div>
-<div class="line"><a id="l01406" name="l01406"></a><span class="lineno"> 1406</span> private:</div>
-<div class="line"><a id="l01407" name="l01407"></a><span class="lineno"> 1407</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01408" name="l01408"></a><span class="lineno"> 1408</span>};</div>
+<div class="line"><a id="l01394" name="l01394"></a><span class="lineno"> 1394</span> </div>
+<div class="foldopen" id="foldopen01395" data-start="{" data-end="};">
+<div class="line"><a id="l01395" name="l01395"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html"> 1395</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_negative.html">Negative</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01396" name="l01396"></a><span class="lineno"> 1396</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01397" name="l01397"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70"> 1397</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70">Negative</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01398" name="l01398"></a><span class="lineno"> 1398</span> </div>
+<div class="line"><a id="l01399" name="l01399"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b"> 1399</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01400" name="l01400"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b"> 1400</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01401" name="l01401"></a><span class="lineno"> 1401</span> </div>
+<div class="line"><a id="l01402" name="l01402"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0"> 1402</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01403" name="l01403"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979"> 1403</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01404" name="l01404"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91"> 1404</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_negative.html">Negative</a>)</div>
+<div class="line"><a id="l01405" name="l01405"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823"> 1405</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01406" name="l01406"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014"> 1406</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01407" name="l01407"></a><span class="lineno"> 1407</span> </div>
+<div class="line"><a id="l01408" name="l01408"></a><span class="lineno"> 1408</span> private:</div>
+<div class="line"><a id="l01409" name="l01409"></a><span class="lineno"> 1409</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01410" name="l01410"></a><span class="lineno"> 1410</span>};</div>
 </div>
-<div class="line"><a id="l01409" name="l01409"></a><span class="lineno"> 1409</span> </div>
-<div class="foldopen" id="foldopen01410" data-start="{" data-end="};">
-<div class="line"><a id="l01410" name="l01410"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html"> 1410</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_number_of_elements.html">NumberOfElements</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01411" name="l01411"></a><span class="lineno"> 1411</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01412" data-start="{" data-end="}">
-<div class="line"><a id="l01412" name="l01412"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06"> 1412</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06">NumberOfElements</a>(</div>
-<div class="line"><a id="l01413" name="l01413"></a><span class="lineno"> 1413</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01414" name="l01414"></a><span class="lineno"> 1414</span>      std::vector&lt;int&gt; axes,</div>
-<div class="line"><a id="l01415" name="l01415"></a><span class="lineno"> 1415</span>      <span class="keywordtype">bool</span> inverted,</div>
-<div class="line"><a id="l01416" name="l01416"></a><span class="lineno"> 1416</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype)</div>
-<div class="line"><a id="l01417" name="l01417"></a><span class="lineno"> 1417</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01418" name="l01418"></a><span class="lineno"> 1418</span>        axes_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(axes)),</div>
-<div class="line"><a id="l01419" name="l01419"></a><span class="lineno"> 1419</span>        inverted_(inverted),</div>
-<div class="line"><a id="l01420" name="l01420"></a><span class="lineno"> 1420</span>        dtype_(dtype) {}</div>
-</div>
-<div class="line"><a id="l01421" name="l01421"></a><span class="lineno"> 1421</span> </div>
-<div class="line"><a id="l01422" name="l01422"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f"> 1422</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01423" name="l01423"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5"> 1423</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01411" name="l01411"></a><span class="lineno"> 1411</span> </div>
+<div class="foldopen" id="foldopen01412" data-start="{" data-end="};">
+<div class="line"><a id="l01412" name="l01412"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html"> 1412</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_not_equal.html">NotEqual</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01413" name="l01413"></a><span class="lineno"> 1413</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01414" name="l01414"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9"> 1414</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9">NotEqual</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01415" name="l01415"></a><span class="lineno"> 1415</span> </div>
+<div class="line"><a id="l01416" name="l01416"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047"> 1416</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01417" name="l01417"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2"> 1417</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01418" name="l01418"></a><span class="lineno"> 1418</span> </div>
+<div class="line"><a id="l01419" name="l01419"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5"> 1419</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01420" name="l01420"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17"> 1420</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01421" name="l01421"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09"> 1421</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_not_equal.html">NotEqual</a>)</div>
+<div class="line"><a id="l01422" name="l01422"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d"> 1422</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01423" name="l01423"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a"> 1423</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
 <div class="line"><a id="l01424" name="l01424"></a><span class="lineno"> 1424</span> </div>
-<div class="line"><a id="l01425" name="l01425"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2"> 1425</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01426" name="l01426"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52"> 1426</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_number_of_elements.html">NumberOfElements</a>)</div>
-<div class="line"><a id="l01427" name="l01427"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f"> 1427</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="foldopen" id="foldopen01428" data-start="{" data-end="}">
-<div class="line"><a id="l01428" name="l01428"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8"> 1428</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
-<div class="line"><a id="l01429" name="l01429"></a><span class="lineno"> 1429</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l01430" name="l01430"></a><span class="lineno"> 1430</span>    <span class="keywordflow">return</span> {{}};</div>
-<div class="line"><a id="l01431" name="l01431"></a><span class="lineno"> 1431</span>  }</div>
+<div class="line"><a id="l01425" name="l01425"></a><span class="lineno"> 1425</span> private:</div>
+<div class="line"><a id="l01426" name="l01426"></a><span class="lineno"> 1426</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01427" name="l01427"></a><span class="lineno"> 1427</span>};</div>
 </div>
-<div class="line"><a id="l01432" name="l01432"></a><span class="lineno"> 1432</span> </div>
-<div class="line"><a id="l01433" name="l01433"></a><span class="lineno"> 1433</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l01434" name="l01434"></a><span class="lineno"> 1434</span>  std::vector&lt;int&gt; axes_;</div>
-<div class="line"><a id="l01435" name="l01435"></a><span class="lineno"> 1435</span>  <span class="keywordtype">bool</span> inverted_;</div>
-<div class="line"><a id="l01436" name="l01436"></a><span class="lineno"> 1436</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype_;</div>
-<div class="line"><a id="l01437" name="l01437"></a><span class="lineno"> 1437</span> </div>
-<div class="line"><a id="l01438" name="l01438"></a><span class="lineno"> 1438</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01439" name="l01439"></a><span class="lineno"> 1439</span>};</div>
+<div class="line"><a id="l01428" name="l01428"></a><span class="lineno"> 1428</span> </div>
+<div class="foldopen" id="foldopen01429" data-start="{" data-end="};">
+<div class="line"><a id="l01429" name="l01429"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html"> 1429</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_number_of_elements.html">NumberOfElements</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01430" name="l01430"></a><span class="lineno"> 1430</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01431" data-start="{" data-end="}">
+<div class="line"><a id="l01431" name="l01431"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06"> 1431</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06">NumberOfElements</a>(</div>
+<div class="line"><a id="l01432" name="l01432"></a><span class="lineno"> 1432</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01433" name="l01433"></a><span class="lineno"> 1433</span>      std::vector&lt;int&gt; axes,</div>
+<div class="line"><a id="l01434" name="l01434"></a><span class="lineno"> 1434</span>      <span class="keywordtype">bool</span> inverted,</div>
+<div class="line"><a id="l01435" name="l01435"></a><span class="lineno"> 1435</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype)</div>
+<div class="line"><a id="l01436" name="l01436"></a><span class="lineno"> 1436</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01437" name="l01437"></a><span class="lineno"> 1437</span>        axes_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(axes)),</div>
+<div class="line"><a id="l01438" name="l01438"></a><span class="lineno"> 1438</span>        inverted_(inverted),</div>
+<div class="line"><a id="l01439" name="l01439"></a><span class="lineno"> 1439</span>        dtype_(dtype) {}</div>
 </div>
 <div class="line"><a id="l01440" name="l01440"></a><span class="lineno"> 1440</span> </div>
-<div class="foldopen" id="foldopen01441" data-start="{" data-end="};">
-<div class="line"><a id="l01441" name="l01441"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html"> 1441</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_pad.html">Pad</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01442" name="l01442"></a><span class="lineno"> 1442</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01443" data-start="{" data-end="}">
-<div class="line"><a id="l01443" name="l01443"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#ad03da2c40b1e1f2fdf2649d00fa4ab43"> 1443</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_pad.html#ad03da2c40b1e1f2fdf2649d00fa4ab43">Pad</a>(</div>
-<div class="line"><a id="l01444" name="l01444"></a><span class="lineno"> 1444</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01445" name="l01445"></a><span class="lineno"> 1445</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes,</div>
-<div class="line"><a id="l01446" name="l01446"></a><span class="lineno"> 1446</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; low_pad_size,</div>
-<div class="line"><a id="l01447" name="l01447"></a><span class="lineno"> 1447</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; high_pad_size)</div>
-<div class="line"><a id="l01448" name="l01448"></a><span class="lineno"> 1448</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01449" name="l01449"></a><span class="lineno"> 1449</span>        axes_(axes),</div>
-<div class="line"><a id="l01450" name="l01450"></a><span class="lineno"> 1450</span>        low_pad_size_(low_pad_size),</div>
-<div class="line"><a id="l01451" name="l01451"></a><span class="lineno"> 1451</span>        high_pad_size_(high_pad_size) {}</div>
+<div class="line"><a id="l01441" name="l01441"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f"> 1441</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01442" name="l01442"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5"> 1442</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01443" name="l01443"></a><span class="lineno"> 1443</span> </div>
+<div class="line"><a id="l01444" name="l01444"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2"> 1444</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01445" name="l01445"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52"> 1445</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_number_of_elements.html">NumberOfElements</a>)</div>
+<div class="line"><a id="l01446" name="l01446"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f"> 1446</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="foldopen" id="foldopen01447" data-start="{" data-end="}">
+<div class="line"><a id="l01447" name="l01447"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8"> 1447</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
+<div class="line"><a id="l01448" name="l01448"></a><span class="lineno"> 1448</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l01449" name="l01449"></a><span class="lineno"> 1449</span>    <span class="keywordflow">return</span> {{}};</div>
+<div class="line"><a id="l01450" name="l01450"></a><span class="lineno"> 1450</span>  }</div>
 </div>
-<div class="line"><a id="l01452" name="l01452"></a><span class="lineno"> 1452</span> </div>
-<div class="line"><a id="l01453" name="l01453"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb"> 1453</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01454" name="l01454"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153"> 1454</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01455" name="l01455"></a><span class="lineno"> 1455</span> </div>
-<div class="line"><a id="l01456" name="l01456"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf"> 1456</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01457" name="l01457"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72"> 1457</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01458" name="l01458"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a"> 1458</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_pad.html">Pad</a>)</div>
-<div class="line"><a id="l01459" name="l01459"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b"> 1459</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01460" name="l01460"></a><span class="lineno"> 1460</span> </div>
-<div class="line"><a id="l01461" name="l01461"></a><span class="lineno"> 1461</span> private:</div>
-<div class="line"><a id="l01462" name="l01462"></a><span class="lineno"> 1462</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; axes_;</div>
-<div class="line"><a id="l01463" name="l01463"></a><span class="lineno"> 1463</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; low_pad_size_;</div>
-<div class="line"><a id="l01464" name="l01464"></a><span class="lineno"> 1464</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; high_pad_size_;</div>
-<div class="line"><a id="l01465" name="l01465"></a><span class="lineno"> 1465</span> </div>
-<div class="line"><a id="l01466" name="l01466"></a><span class="lineno"> 1466</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01467" name="l01467"></a><span class="lineno"> 1467</span>};</div>
+<div class="line"><a id="l01451" name="l01451"></a><span class="lineno"> 1451</span> </div>
+<div class="line"><a id="l01452" name="l01452"></a><span class="lineno"> 1452</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l01453" name="l01453"></a><span class="lineno"> 1453</span>  std::vector&lt;int&gt; axes_;</div>
+<div class="line"><a id="l01454" name="l01454"></a><span class="lineno"> 1454</span>  <span class="keywordtype">bool</span> inverted_;</div>
+<div class="line"><a id="l01455" name="l01455"></a><span class="lineno"> 1455</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype_;</div>
+<div class="line"><a id="l01456" name="l01456"></a><span class="lineno"> 1456</span> </div>
+<div class="line"><a id="l01457" name="l01457"></a><span class="lineno"> 1457</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01458" name="l01458"></a><span class="lineno"> 1458</span>};</div>
 </div>
-<div class="line"><a id="l01468" name="l01468"></a><span class="lineno"> 1468</span> </div>
-<div class="foldopen" id="foldopen01469" data-start="{" data-end="};">
-<div class="line"><a id="l01469" name="l01469"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html"> 1469</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_partition.html">Partition</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01470" name="l01470"></a><span class="lineno"> 1470</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01471" data-start="{" data-end="}">
-<div class="line"><a id="l01471" name="l01471"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a7b82ca3895b6654308fac566b277ac0d"> 1471</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_partition.html#a7b82ca3895b6654308fac566b277ac0d">Partition</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">int</span> kth, <span class="keywordtype">int</span> axis)</div>
-<div class="line"><a id="l01472" name="l01472"></a><span class="lineno"> 1472</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), kth_(kth), axis_(axis) {}</div>
+<div class="line"><a id="l01459" name="l01459"></a><span class="lineno"> 1459</span> </div>
+<div class="foldopen" id="foldopen01460" data-start="{" data-end="};">
+<div class="line"><a id="l01460" name="l01460"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html"> 1460</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_pad.html">Pad</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01461" name="l01461"></a><span class="lineno"> 1461</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01462" data-start="{" data-end="}">
+<div class="line"><a id="l01462" name="l01462"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#ad03da2c40b1e1f2fdf2649d00fa4ab43"> 1462</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_pad.html#ad03da2c40b1e1f2fdf2649d00fa4ab43">Pad</a>(</div>
+<div class="line"><a id="l01463" name="l01463"></a><span class="lineno"> 1463</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01464" name="l01464"></a><span class="lineno"> 1464</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes,</div>
+<div class="line"><a id="l01465" name="l01465"></a><span class="lineno"> 1465</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; low_pad_size,</div>
+<div class="line"><a id="l01466" name="l01466"></a><span class="lineno"> 1466</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; high_pad_size)</div>
+<div class="line"><a id="l01467" name="l01467"></a><span class="lineno"> 1467</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01468" name="l01468"></a><span class="lineno"> 1468</span>        axes_(axes),</div>
+<div class="line"><a id="l01469" name="l01469"></a><span class="lineno"> 1469</span>        low_pad_size_(low_pad_size),</div>
+<div class="line"><a id="l01470" name="l01470"></a><span class="lineno"> 1470</span>        high_pad_size_(high_pad_size) {}</div>
 </div>
-<div class="line"><a id="l01473" name="l01473"></a><span class="lineno"> 1473</span> </div>
-<div class="line"><a id="l01474" name="l01474"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8"> 1474</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01475" name="l01475"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef"> 1475</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01476" name="l01476"></a><span class="lineno"> 1476</span> </div>
-<div class="line"><a id="l01477" name="l01477"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c"> 1477</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01478" name="l01478"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a"> 1478</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01479" name="l01479"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0"> 1479</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_partition.html">Partition</a>)</div>
-<div class="line"><a id="l01480" name="l01480"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf"> 1480</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01481" name="l01481"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8"> 1481</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01482" name="l01482"></a><span class="lineno"> 1482</span> </div>
-<div class="line"><a id="l01483" name="l01483"></a><span class="lineno"> 1483</span> private:</div>
-<div class="line"><a id="l01484" name="l01484"></a><span class="lineno"> 1484</span>  <span class="keywordtype">int</span> kth_;</div>
-<div class="line"><a id="l01485" name="l01485"></a><span class="lineno"> 1485</span>  <span class="keywordtype">int</span> axis_;</div>
-<div class="line"><a id="l01486" name="l01486"></a><span class="lineno"> 1486</span> </div>
-<div class="line"><a id="l01487" name="l01487"></a><span class="lineno"> 1487</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01488" name="l01488"></a><span class="lineno"> 1488</span>};</div>
+<div class="line"><a id="l01471" name="l01471"></a><span class="lineno"> 1471</span> </div>
+<div class="line"><a id="l01472" name="l01472"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb"> 1472</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01473" name="l01473"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153"> 1473</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01474" name="l01474"></a><span class="lineno"> 1474</span> </div>
+<div class="line"><a id="l01475" name="l01475"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf"> 1475</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01476" name="l01476"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72"> 1476</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01477" name="l01477"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a"> 1477</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_pad.html">Pad</a>)</div>
+<div class="line"><a id="l01478" name="l01478"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b"> 1478</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01479" name="l01479"></a><span class="lineno"> 1479</span> </div>
+<div class="line"><a id="l01480" name="l01480"></a><span class="lineno"> 1480</span> private:</div>
+<div class="line"><a id="l01481" name="l01481"></a><span class="lineno"> 1481</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; axes_;</div>
+<div class="line"><a id="l01482" name="l01482"></a><span class="lineno"> 1482</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; low_pad_size_;</div>
+<div class="line"><a id="l01483" name="l01483"></a><span class="lineno"> 1483</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; high_pad_size_;</div>
+<div class="line"><a id="l01484" name="l01484"></a><span class="lineno"> 1484</span> </div>
+<div class="line"><a id="l01485" name="l01485"></a><span class="lineno"> 1485</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01486" name="l01486"></a><span class="lineno"> 1486</span>};</div>
 </div>
-<div class="line"><a id="l01489" name="l01489"></a><span class="lineno"> 1489</span> </div>
-<div class="foldopen" id="foldopen01490" data-start="{" data-end="};">
-<div class="line"><a id="l01490" name="l01490"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html"> 1490</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_power.html">Power</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01491" name="l01491"></a><span class="lineno"> 1491</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01492" name="l01492"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a7bc6c64179b7a2aef56fe1dafb6459b2"> 1492</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_power.html#a7bc6c64179b7a2aef56fe1dafb6459b2">Power</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01493" name="l01493"></a><span class="lineno"> 1493</span> </div>
-<div class="line"><a id="l01494" name="l01494"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206"> 1494</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01495" name="l01495"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11"> 1495</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01496" name="l01496"></a><span class="lineno"> 1496</span> </div>
-<div class="line"><a id="l01497" name="l01497"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f"> 1497</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01498" name="l01498"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a"> 1498</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01499" name="l01499"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60"> 1499</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_power.html">Power</a>)</div>
-<div class="line"><a id="l01500" name="l01500"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68"> 1500</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01501" name="l01501"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1"> 1501</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01502" name="l01502"></a><span class="lineno"> 1502</span> </div>
-<div class="line"><a id="l01503" name="l01503"></a><span class="lineno"> 1503</span> private:</div>
-<div class="line"><a id="l01504" name="l01504"></a><span class="lineno"> 1504</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01505" name="l01505"></a><span class="lineno"> 1505</span>};</div>
+<div class="line"><a id="l01487" name="l01487"></a><span class="lineno"> 1487</span> </div>
+<div class="foldopen" id="foldopen01488" data-start="{" data-end="};">
+<div class="line"><a id="l01488" name="l01488"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html"> 1488</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_partition.html">Partition</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01489" name="l01489"></a><span class="lineno"> 1489</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01490" data-start="{" data-end="}">
+<div class="line"><a id="l01490" name="l01490"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a7b82ca3895b6654308fac566b277ac0d"> 1490</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_partition.html#a7b82ca3895b6654308fac566b277ac0d">Partition</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">int</span> kth, <span class="keywordtype">int</span> axis)</div>
+<div class="line"><a id="l01491" name="l01491"></a><span class="lineno"> 1491</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), kth_(kth), axis_(axis) {}</div>
 </div>
-<div class="line"><a id="l01506" name="l01506"></a><span class="lineno"> 1506</span> </div>
-<div class="foldopen" id="foldopen01507" data-start="{" data-end="};">
-<div class="line"><a id="l01507" name="l01507"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html"> 1507</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_quantized_matmul.html">QuantizedMatmul</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01508" name="l01508"></a><span class="lineno"> 1508</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01509" data-start="{" data-end="}">
-<div class="line"><a id="l01509" name="l01509"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c"> 1509</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c">QuantizedMatmul</a>(</div>
-<div class="line"><a id="l01510" name="l01510"></a><span class="lineno"> 1510</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01511" name="l01511"></a><span class="lineno"> 1511</span>      <span class="keywordtype">int</span> group_size,</div>
-<div class="line"><a id="l01512" name="l01512"></a><span class="lineno"> 1512</span>      <span class="keywordtype">int</span> bits,</div>
-<div class="line"><a id="l01513" name="l01513"></a><span class="lineno"> 1513</span>      <span class="keywordtype">bool</span> transpose)</div>
-<div class="line"><a id="l01514" name="l01514"></a><span class="lineno"> 1514</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01515" name="l01515"></a><span class="lineno"> 1515</span>        group_size_(group_size),</div>
-<div class="line"><a id="l01516" name="l01516"></a><span class="lineno"> 1516</span>        bits_(bits),</div>
-<div class="line"><a id="l01517" name="l01517"></a><span class="lineno"> 1517</span>        transpose_(<a class="code hl_function" href="group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b">transpose</a>) {}</div>
+<div class="line"><a id="l01492" name="l01492"></a><span class="lineno"> 1492</span> </div>
+<div class="line"><a id="l01493" name="l01493"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8"> 1493</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01494" name="l01494"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef"> 1494</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01495" name="l01495"></a><span class="lineno"> 1495</span> </div>
+<div class="line"><a id="l01496" name="l01496"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c"> 1496</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01497" name="l01497"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a"> 1497</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01498" name="l01498"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0"> 1498</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_partition.html">Partition</a>)</div>
+<div class="line"><a id="l01499" name="l01499"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf"> 1499</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01500" name="l01500"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8"> 1500</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01501" name="l01501"></a><span class="lineno"> 1501</span> </div>
+<div class="line"><a id="l01502" name="l01502"></a><span class="lineno"> 1502</span> private:</div>
+<div class="line"><a id="l01503" name="l01503"></a><span class="lineno"> 1503</span>  <span class="keywordtype">int</span> kth_;</div>
+<div class="line"><a id="l01504" name="l01504"></a><span class="lineno"> 1504</span>  <span class="keywordtype">int</span> axis_;</div>
+<div class="line"><a id="l01505" name="l01505"></a><span class="lineno"> 1505</span> </div>
+<div class="line"><a id="l01506" name="l01506"></a><span class="lineno"> 1506</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01507" name="l01507"></a><span class="lineno"> 1507</span>};</div>
 </div>
-<div class="line"><a id="l01518" name="l01518"></a><span class="lineno"> 1518</span> </div>
-<div class="line"><a id="l01519" name="l01519"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3"> 1519</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01520" name="l01520"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3"> 1520</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01508" name="l01508"></a><span class="lineno"> 1508</span> </div>
+<div class="foldopen" id="foldopen01509" data-start="{" data-end="};">
+<div class="line"><a id="l01509" name="l01509"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html"> 1509</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_power.html">Power</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01510" name="l01510"></a><span class="lineno"> 1510</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01511" name="l01511"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a7bc6c64179b7a2aef56fe1dafb6459b2"> 1511</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_power.html#a7bc6c64179b7a2aef56fe1dafb6459b2">Power</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01512" name="l01512"></a><span class="lineno"> 1512</span> </div>
+<div class="line"><a id="l01513" name="l01513"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206"> 1513</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01514" name="l01514"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11"> 1514</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01515" name="l01515"></a><span class="lineno"> 1515</span> </div>
+<div class="line"><a id="l01516" name="l01516"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f"> 1516</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01517" name="l01517"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a"> 1517</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01518" name="l01518"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60"> 1518</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_power.html">Power</a>)</div>
+<div class="line"><a id="l01519" name="l01519"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68"> 1519</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01520" name="l01520"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1"> 1520</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
 <div class="line"><a id="l01521" name="l01521"></a><span class="lineno"> 1521</span> </div>
-<div class="line"><a id="l01522" name="l01522"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763"> 1522</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01523" name="l01523"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23"> 1523</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01524" name="l01524"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db"> 1524</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_quantized_matmul.html">QuantizedMatmul</a>)</div>
-<div class="line"><a id="l01525" name="l01525"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1"> 1525</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01526" name="l01526"></a><span class="lineno"> 1526</span> </div>
-<div class="line"><a id="l01527" name="l01527"></a><span class="lineno"> 1527</span> private:</div>
-<div class="line"><a id="l01528" name="l01528"></a><span class="lineno"> 1528</span>  <span class="keywordtype">int</span> group_size_;</div>
-<div class="line"><a id="l01529" name="l01529"></a><span class="lineno"> 1529</span>  <span class="keywordtype">int</span> bits_;</div>
-<div class="line"><a id="l01530" name="l01530"></a><span class="lineno"> 1530</span>  <span class="keywordtype">bool</span> transpose_;</div>
-<div class="line"><a id="l01531" name="l01531"></a><span class="lineno"> 1531</span> </div>
-<div class="line"><a id="l01532" name="l01532"></a><span class="lineno"> 1532</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01533" name="l01533"></a><span class="lineno"> 1533</span>};</div>
+<div class="line"><a id="l01522" name="l01522"></a><span class="lineno"> 1522</span> private:</div>
+<div class="line"><a id="l01523" name="l01523"></a><span class="lineno"> 1523</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01524" name="l01524"></a><span class="lineno"> 1524</span>};</div>
 </div>
-<div class="line"><a id="l01534" name="l01534"></a><span class="lineno"> 1534</span> </div>
-<div class="foldopen" id="foldopen01535" data-start="{" data-end="};">
-<div class="line"><a id="l01535" name="l01535"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html"> 1535</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_gather_q_m_m.html">GatherQMM</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01536" name="l01536"></a><span class="lineno"> 1536</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01537" data-start="{" data-end="}">
-<div class="line"><a id="l01537" name="l01537"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360"> 1537</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360">GatherQMM</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> transpose)</div>
-<div class="line"><a id="l01538" name="l01538"></a><span class="lineno"> 1538</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01539" name="l01539"></a><span class="lineno"> 1539</span>        group_size_(group_size),</div>
-<div class="line"><a id="l01540" name="l01540"></a><span class="lineno"> 1540</span>        bits_(bits),</div>
-<div class="line"><a id="l01541" name="l01541"></a><span class="lineno"> 1541</span>        transpose_(<a class="code hl_function" href="group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b">transpose</a>) {}</div>
+<div class="line"><a id="l01525" name="l01525"></a><span class="lineno"> 1525</span> </div>
+<div class="foldopen" id="foldopen01526" data-start="{" data-end="};">
+<div class="line"><a id="l01526" name="l01526"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html"> 1526</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_quantized_matmul.html">QuantizedMatmul</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01527" name="l01527"></a><span class="lineno"> 1527</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01528" data-start="{" data-end="}">
+<div class="line"><a id="l01528" name="l01528"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c"> 1528</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c">QuantizedMatmul</a>(</div>
+<div class="line"><a id="l01529" name="l01529"></a><span class="lineno"> 1529</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01530" name="l01530"></a><span class="lineno"> 1530</span>      <span class="keywordtype">int</span> group_size,</div>
+<div class="line"><a id="l01531" name="l01531"></a><span class="lineno"> 1531</span>      <span class="keywordtype">int</span> bits,</div>
+<div class="line"><a id="l01532" name="l01532"></a><span class="lineno"> 1532</span>      <span class="keywordtype">bool</span> transpose)</div>
+<div class="line"><a id="l01533" name="l01533"></a><span class="lineno"> 1533</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01534" name="l01534"></a><span class="lineno"> 1534</span>        group_size_(group_size),</div>
+<div class="line"><a id="l01535" name="l01535"></a><span class="lineno"> 1535</span>        bits_(bits),</div>
+<div class="line"><a id="l01536" name="l01536"></a><span class="lineno"> 1536</span>        transpose_(<a class="code hl_function" href="group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b">transpose</a>) {}</div>
 </div>
-<div class="line"><a id="l01542" name="l01542"></a><span class="lineno"> 1542</span> </div>
-<div class="line"><a id="l01543" name="l01543"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c"> 1543</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01544" name="l01544"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887"> 1544</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01537" name="l01537"></a><span class="lineno"> 1537</span> </div>
+<div class="line"><a id="l01538" name="l01538"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3"> 1538</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01539" name="l01539"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3"> 1539</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01540" name="l01540"></a><span class="lineno"> 1540</span> </div>
+<div class="line"><a id="l01541" name="l01541"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763"> 1541</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01542" name="l01542"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23"> 1542</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01543" name="l01543"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db"> 1543</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_quantized_matmul.html">QuantizedMatmul</a>)</div>
+<div class="line"><a id="l01544" name="l01544"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1"> 1544</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
 <div class="line"><a id="l01545" name="l01545"></a><span class="lineno"> 1545</span> </div>
-<div class="line"><a id="l01546" name="l01546"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f"> 1546</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01547" name="l01547"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0"> 1547</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01548" name="l01548"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0"> 1548</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_gather_q_m_m.html">GatherQMM</a>)</div>
-<div class="line"><a id="l01549" name="l01549"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11"> 1549</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01546" name="l01546"></a><span class="lineno"> 1546</span> private:</div>
+<div class="line"><a id="l01547" name="l01547"></a><span class="lineno"> 1547</span>  <span class="keywordtype">int</span> group_size_;</div>
+<div class="line"><a id="l01548" name="l01548"></a><span class="lineno"> 1548</span>  <span class="keywordtype">int</span> bits_;</div>
+<div class="line"><a id="l01549" name="l01549"></a><span class="lineno"> 1549</span>  <span class="keywordtype">bool</span> transpose_;</div>
 <div class="line"><a id="l01550" name="l01550"></a><span class="lineno"> 1550</span> </div>
-<div class="line"><a id="l01551" name="l01551"></a><span class="lineno"> 1551</span> private:</div>
-<div class="line"><a id="l01552" name="l01552"></a><span class="lineno"> 1552</span>  <span class="keywordtype">int</span> group_size_;</div>
-<div class="line"><a id="l01553" name="l01553"></a><span class="lineno"> 1553</span>  <span class="keywordtype">int</span> bits_;</div>
-<div class="line"><a id="l01554" name="l01554"></a><span class="lineno"> 1554</span>  <span class="keywordtype">bool</span> transpose_;</div>
-<div class="line"><a id="l01555" name="l01555"></a><span class="lineno"> 1555</span> </div>
-<div class="line"><a id="l01556" name="l01556"></a><span class="lineno"> 1556</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01557" name="l01557"></a><span class="lineno"> 1557</span>};</div>
+<div class="line"><a id="l01551" name="l01551"></a><span class="lineno"> 1551</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01552" name="l01552"></a><span class="lineno"> 1552</span>};</div>
 </div>
-<div class="line"><a id="l01558" name="l01558"></a><span class="lineno"> 1558</span> </div>
-<div class="foldopen" id="foldopen01559" data-start="{" data-end="};">
-<div class="line"><a id="l01559" name="l01559"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html"> 1559</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_random_bits.html">RandomBits</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01560" name="l01560"></a><span class="lineno"> 1560</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01561" data-start="{" data-end="}">
-<div class="line"><a id="l01561" name="l01561"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a7dd5ed03f2a4ab45d1d5e8e2b587de6b"> 1561</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_random_bits.html#a7dd5ed03f2a4ab45d1d5e8e2b587de6b">RandomBits</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape, <span class="keywordtype">int</span> width)</div>
-<div class="line"><a id="l01562" name="l01562"></a><span class="lineno"> 1562</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), shape_(shape), width_(width) {}</div>
+<div class="line"><a id="l01553" name="l01553"></a><span class="lineno"> 1553</span> </div>
+<div class="foldopen" id="foldopen01554" data-start="{" data-end="};">
+<div class="line"><a id="l01554" name="l01554"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html"> 1554</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_gather_q_m_m.html">GatherQMM</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01555" name="l01555"></a><span class="lineno"> 1555</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01556" data-start="{" data-end="}">
+<div class="line"><a id="l01556" name="l01556"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360"> 1556</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360">GatherQMM</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> transpose)</div>
+<div class="line"><a id="l01557" name="l01557"></a><span class="lineno"> 1557</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01558" name="l01558"></a><span class="lineno"> 1558</span>        group_size_(group_size),</div>
+<div class="line"><a id="l01559" name="l01559"></a><span class="lineno"> 1559</span>        bits_(bits),</div>
+<div class="line"><a id="l01560" name="l01560"></a><span class="lineno"> 1560</span>        transpose_(<a class="code hl_function" href="group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b">transpose</a>) {}</div>
 </div>
-<div class="line"><a id="l01563" name="l01563"></a><span class="lineno"> 1563</span> </div>
-<div class="line"><a id="l01564" name="l01564"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2"> 1564</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01565" name="l01565"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a"> 1565</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01566" name="l01566"></a><span class="lineno"> 1566</span> </div>
-<div class="line"><a id="l01567" name="l01567"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415"> 1567</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01568" name="l01568"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271"> 1568</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_random_bits.html">RandomBits</a>)</div>
-<div class="line"><a id="l01569" name="l01569"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6"> 1569</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01570" name="l01570"></a><span class="lineno"> 1570</span> </div>
-<div class="line"><a id="l01571" name="l01571"></a><span class="lineno"> 1571</span> private:</div>
-<div class="line"><a id="l01572" name="l01572"></a><span class="lineno"> 1572</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; shape_;</div>
-<div class="line"><a id="l01573" name="l01573"></a><span class="lineno"> 1573</span>  <span class="keywordtype">int</span> width_;</div>
+<div class="line"><a id="l01561" name="l01561"></a><span class="lineno"> 1561</span> </div>
+<div class="line"><a id="l01562" name="l01562"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c"> 1562</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01563" name="l01563"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887"> 1563</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01564" name="l01564"></a><span class="lineno"> 1564</span> </div>
+<div class="line"><a id="l01565" name="l01565"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f"> 1565</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01566" name="l01566"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0"> 1566</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01567" name="l01567"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0"> 1567</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_gather_q_m_m.html">GatherQMM</a>)</div>
+<div class="line"><a id="l01568" name="l01568"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11"> 1568</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01569" name="l01569"></a><span class="lineno"> 1569</span> </div>
+<div class="line"><a id="l01570" name="l01570"></a><span class="lineno"> 1570</span> private:</div>
+<div class="line"><a id="l01571" name="l01571"></a><span class="lineno"> 1571</span>  <span class="keywordtype">int</span> group_size_;</div>
+<div class="line"><a id="l01572" name="l01572"></a><span class="lineno"> 1572</span>  <span class="keywordtype">int</span> bits_;</div>
+<div class="line"><a id="l01573" name="l01573"></a><span class="lineno"> 1573</span>  <span class="keywordtype">bool</span> transpose_;</div>
 <div class="line"><a id="l01574" name="l01574"></a><span class="lineno"> 1574</span> </div>
 <div class="line"><a id="l01575" name="l01575"></a><span class="lineno"> 1575</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
 <div class="line"><a id="l01576" name="l01576"></a><span class="lineno"> 1576</span>};</div>
 </div>
 <div class="line"><a id="l01577" name="l01577"></a><span class="lineno"> 1577</span> </div>
 <div class="foldopen" id="foldopen01578" data-start="{" data-end="};">
-<div class="line"><a id="l01578" name="l01578"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html"> 1578</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_real.html">Real</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01578" name="l01578"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html"> 1578</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_random_bits.html">RandomBits</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
 <div class="line"><a id="l01579" name="l01579"></a><span class="lineno"> 1579</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01580" name="l01580"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#acd4480e3f0834d70ff6b5f1ecef17892"> 1580</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_real.html#acd4480e3f0834d70ff6b5f1ecef17892">Real</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01581" name="l01581"></a><span class="lineno"> 1581</span> </div>
-<div class="line"><a id="l01582" name="l01582"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934"> 1582</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01583" name="l01583"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2"> 1583</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01584" name="l01584"></a><span class="lineno"> 1584</span> </div>
-<div class="line"><a id="l01585" name="l01585"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6"> 1585</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01586" name="l01586"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526"> 1586</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01587" name="l01587"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b"> 1587</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_real.html">Real</a>)</div>
-<div class="line"><a id="l01588" name="l01588"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239"> 1588</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01589" name="l01589"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5"> 1589</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01590" name="l01590"></a><span class="lineno"> 1590</span>};</div>
+<div class="foldopen" id="foldopen01580" data-start="{" data-end="}">
+<div class="line"><a id="l01580" name="l01580"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a7dd5ed03f2a4ab45d1d5e8e2b587de6b"> 1580</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_random_bits.html#a7dd5ed03f2a4ab45d1d5e8e2b587de6b">RandomBits</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape, <span class="keywordtype">int</span> width)</div>
+<div class="line"><a id="l01581" name="l01581"></a><span class="lineno"> 1581</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), shape_(shape), width_(width) {}</div>
 </div>
-<div class="line"><a id="l01591" name="l01591"></a><span class="lineno"> 1591</span> </div>
-<div class="foldopen" id="foldopen01592" data-start="{" data-end="};">
-<div class="line"><a id="l01592" name="l01592"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html"> 1592</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_reshape.html">Reshape</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01593" name="l01593"></a><span class="lineno"> 1593</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01594" data-start="{" data-end="}">
-<div class="line"><a id="l01594" name="l01594"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#aa7c5a6e18d0615ad36102de01929eb26"> 1594</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reshape.html#aa7c5a6e18d0615ad36102de01929eb26">Reshape</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape)</div>
-<div class="line"><a id="l01595" name="l01595"></a><span class="lineno"> 1595</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), shape_(shape) {}</div>
+<div class="line"><a id="l01582" name="l01582"></a><span class="lineno"> 1582</span> </div>
+<div class="line"><a id="l01583" name="l01583"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2"> 1583</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01584" name="l01584"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a"> 1584</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01585" name="l01585"></a><span class="lineno"> 1585</span> </div>
+<div class="line"><a id="l01586" name="l01586"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415"> 1586</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01587" name="l01587"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271"> 1587</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_random_bits.html">RandomBits</a>)</div>
+<div class="line"><a id="l01588" name="l01588"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6"> 1588</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01589" name="l01589"></a><span class="lineno"> 1589</span> </div>
+<div class="line"><a id="l01590" name="l01590"></a><span class="lineno"> 1590</span> private:</div>
+<div class="line"><a id="l01591" name="l01591"></a><span class="lineno"> 1591</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; shape_;</div>
+<div class="line"><a id="l01592" name="l01592"></a><span class="lineno"> 1592</span>  <span class="keywordtype">int</span> width_;</div>
+<div class="line"><a id="l01593" name="l01593"></a><span class="lineno"> 1593</span> </div>
+<div class="line"><a id="l01594" name="l01594"></a><span class="lineno"> 1594</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01595" name="l01595"></a><span class="lineno"> 1595</span>};</div>
 </div>
 <div class="line"><a id="l01596" name="l01596"></a><span class="lineno"> 1596</span> </div>
-<div class="line"><a id="l01597" name="l01597"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f"> 1597</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01598" name="l01598"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059"> 1598</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01599" name="l01599"></a><span class="lineno"> 1599</span> </div>
-<div class="line"><a id="l01600" name="l01600"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d"> 1600</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01601" name="l01601"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5"> 1601</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01602" name="l01602"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862"> 1602</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_reshape.html">Reshape</a>)</div>
-<div class="line"><a id="l01603" name="l01603"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3"> 1603</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01604" name="l01604"></a><span class="lineno"> 1604</span> </div>
-<div class="line"><a id="l01605" name="l01605"></a><span class="lineno"> 1605</span> private:</div>
-<div class="line"><a id="l01606" name="l01606"></a><span class="lineno"> 1606</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; shape_;</div>
-<div class="line"><a id="l01607" name="l01607"></a><span class="lineno"> 1607</span> </div>
-<div class="line"><a id="l01608" name="l01608"></a><span class="lineno"> 1608</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01609" name="l01609"></a><span class="lineno"> 1609</span> </div>
-<div class="line"><a id="l01610" name="l01610"></a><span class="lineno"> 1610</span>  std::pair&lt;<span class="keywordtype">bool</span>, std::vector&lt;<span class="keywordtype">size_t</span>&gt;&gt; prepare_reshape(</div>
-<div class="line"><a id="l01611" name="l01611"></a><span class="lineno"> 1611</span>      const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
-<div class="line"><a id="l01612" name="l01612"></a><span class="lineno"> 1612</span>      const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01613" name="l01613"></a><span class="lineno"> 1613</span>  <span class="keywordtype">void</span> shared_buffer_reshape(</div>
-<div class="line"><a id="l01614" name="l01614"></a><span class="lineno"> 1614</span>      const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
-<div class="line"><a id="l01615" name="l01615"></a><span class="lineno"> 1615</span>      const std::vector&lt;<span class="keywordtype">size_t</span>&gt;&amp; out_strides,</div>
-<div class="line"><a id="l01616" name="l01616"></a><span class="lineno"> 1616</span>      <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01617" name="l01617"></a><span class="lineno"> 1617</span>};</div>
+<div class="foldopen" id="foldopen01597" data-start="{" data-end="};">
+<div class="line"><a id="l01597" name="l01597"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html"> 1597</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_real.html">Real</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01598" name="l01598"></a><span class="lineno"> 1598</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01599" name="l01599"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#acd4480e3f0834d70ff6b5f1ecef17892"> 1599</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_real.html#acd4480e3f0834d70ff6b5f1ecef17892">Real</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01600" name="l01600"></a><span class="lineno"> 1600</span> </div>
+<div class="line"><a id="l01601" name="l01601"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934"> 1601</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01602" name="l01602"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2"> 1602</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01603" name="l01603"></a><span class="lineno"> 1603</span> </div>
+<div class="line"><a id="l01604" name="l01604"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6"> 1604</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01605" name="l01605"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526"> 1605</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01606" name="l01606"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b"> 1606</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_real.html">Real</a>)</div>
+<div class="line"><a id="l01607" name="l01607"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239"> 1607</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01608" name="l01608"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5"> 1608</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01609" name="l01609"></a><span class="lineno"> 1609</span>};</div>
 </div>
+<div class="line"><a id="l01610" name="l01610"></a><span class="lineno"> 1610</span> </div>
+<div class="foldopen" id="foldopen01611" data-start="{" data-end="};">
+<div class="line"><a id="l01611" name="l01611"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html"> 1611</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_reshape.html">Reshape</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01612" name="l01612"></a><span class="lineno"> 1612</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01613" data-start="{" data-end="}">
+<div class="line"><a id="l01613" name="l01613"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#aa7c5a6e18d0615ad36102de01929eb26"> 1613</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reshape.html#aa7c5a6e18d0615ad36102de01929eb26">Reshape</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; shape)</div>
+<div class="line"><a id="l01614" name="l01614"></a><span class="lineno"> 1614</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), shape_(shape) {}</div>
+</div>
+<div class="line"><a id="l01615" name="l01615"></a><span class="lineno"> 1615</span> </div>
+<div class="line"><a id="l01616" name="l01616"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f"> 1616</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01617" name="l01617"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059"> 1617</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
 <div class="line"><a id="l01618" name="l01618"></a><span class="lineno"> 1618</span> </div>
-<div class="foldopen" id="foldopen01619" data-start="{" data-end="};">
-<div class="line"><a id="l01619" name="l01619"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html"> 1619</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_reduce.html">Reduce</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01620" name="l01620"></a><span class="lineno"> 1620</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01621" name="l01621"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93"> 1621</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">ReduceType</a> { <a class="code hl_enumvalue" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93">And</a>, <a class="code hl_struct" href="struct_or.html">Or</a>, <a class="code hl_struct" href="struct_sum.html">Sum</a>, <a class="code hl_struct" href="struct_prod.html">Prod</a>, <a class="code hl_struct" href="struct_min.html">Min</a>, <a class="code hl_struct" href="struct_max.html">Max</a> };</div>
-<div class="line"><a id="l01622" name="l01622"></a><span class="lineno"> 1622</span> </div>
-<div class="foldopen" id="foldopen01623" data-start="{" data-end="}">
-<div class="line"><a id="l01623" name="l01623"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a055368c1d036fb953a23ef230e33dcbf"> 1623</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#a055368c1d036fb953a23ef230e33dcbf">Reduce</a>(</div>
-<div class="line"><a id="l01624" name="l01624"></a><span class="lineno"> 1624</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01625" name="l01625"></a><span class="lineno"> 1625</span>      <a class="code hl_enumeration" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">ReduceType</a> reduce_type,</div>
-<div class="line"><a id="l01626" name="l01626"></a><span class="lineno"> 1626</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes)</div>
-<div class="line"><a id="l01627" name="l01627"></a><span class="lineno"> 1627</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), reduce_type_(reduce_type), axes_(axes) {}</div>
-</div>
+<div class="line"><a id="l01619" name="l01619"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d"> 1619</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01620" name="l01620"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5"> 1620</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01621" name="l01621"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862"> 1621</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_reshape.html">Reshape</a>)</div>
+<div class="line"><a id="l01622" name="l01622"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3"> 1622</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01623" name="l01623"></a><span class="lineno"> 1623</span> </div>
+<div class="line"><a id="l01624" name="l01624"></a><span class="lineno"> 1624</span> private:</div>
+<div class="line"><a id="l01625" name="l01625"></a><span class="lineno"> 1625</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; shape_;</div>
+<div class="line"><a id="l01626" name="l01626"></a><span class="lineno"> 1626</span> </div>
+<div class="line"><a id="l01627" name="l01627"></a><span class="lineno"> 1627</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
 <div class="line"><a id="l01628" name="l01628"></a><span class="lineno"> 1628</span> </div>
-<div class="line"><a id="l01629" name="l01629"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa"> 1629</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01630" name="l01630"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f"> 1630</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01631" name="l01631"></a><span class="lineno"> 1631</span> </div>
-<div class="line"><a id="l01632" name="l01632"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38"> 1632</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01633" name="l01633"></a><span class="lineno"> 1633</span> </div>
-<div class="line"><a id="l01634" name="l01634"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e"> 1634</a></span>  std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt; vjp(</div>
-<div class="line"><a id="l01635" name="l01635"></a><span class="lineno"> 1635</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; primals,</div>
-<div class="line"><a id="l01636" name="l01636"></a><span class="lineno"> 1636</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; cotangents,</div>
-<div class="line"><a id="l01637" name="l01637"></a><span class="lineno"> 1637</span>      const std::vector&lt;<span class="keywordtype">int</span>&gt;&amp; argnums,</div>
-<div class="line"><a id="l01638" name="l01638"></a><span class="lineno"> 1638</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; outputs) override;</div>
-<div class="line"><a id="l01639" name="l01639"></a><span class="lineno"> 1639</span> </div>
-<div class="line"><a id="l01640" name="l01640"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65"> 1640</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
-<div class="line"><a id="l01641" name="l01641"></a><span class="lineno"> 1641</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs) override;</div>
-<div class="line"><a id="l01642" name="l01642"></a><span class="lineno"> 1642</span> </div>
-<div class="foldopen" id="foldopen01643" data-start="{" data-end="}">
-<div class="line"><a id="l01643" name="l01643"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd"> 1643</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l01644" name="l01644"></a><span class="lineno"> 1644</span>    <span class="keywordflow">switch</span> (reduce_type_) {</div>
-<div class="line"><a id="l01645" name="l01645"></a><span class="lineno"> 1645</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_and.html">And</a>:</div>
-<div class="line"><a id="l01646" name="l01646"></a><span class="lineno"> 1646</span>        os &lt;&lt; <span class="stringliteral">&quot;And&quot;</span>;</div>
-<div class="line"><a id="l01647" name="l01647"></a><span class="lineno"> 1647</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01648" name="l01648"></a><span class="lineno"> 1648</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_or.html">Or</a>:</div>
-<div class="line"><a id="l01649" name="l01649"></a><span class="lineno"> 1649</span>        os &lt;&lt; <span class="stringliteral">&quot;Or&quot;</span>;</div>
-<div class="line"><a id="l01650" name="l01650"></a><span class="lineno"> 1650</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01651" name="l01651"></a><span class="lineno"> 1651</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_sum.html">Sum</a>:</div>
-<div class="line"><a id="l01652" name="l01652"></a><span class="lineno"> 1652</span>        os &lt;&lt; <span class="stringliteral">&quot;Sum&quot;</span>;</div>
-<div class="line"><a id="l01653" name="l01653"></a><span class="lineno"> 1653</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01654" name="l01654"></a><span class="lineno"> 1654</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_prod.html">Prod</a>:</div>
-<div class="line"><a id="l01655" name="l01655"></a><span class="lineno"> 1655</span>        os &lt;&lt; <span class="stringliteral">&quot;Prod&quot;</span>;</div>
-<div class="line"><a id="l01656" name="l01656"></a><span class="lineno"> 1656</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01657" name="l01657"></a><span class="lineno"> 1657</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_min.html">Min</a>:</div>
-<div class="line"><a id="l01658" name="l01658"></a><span class="lineno"> 1658</span>        os &lt;&lt; <span class="stringliteral">&quot;Min&quot;</span>;</div>
-<div class="line"><a id="l01659" name="l01659"></a><span class="lineno"> 1659</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01660" name="l01660"></a><span class="lineno"> 1660</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_max.html">Max</a>:</div>
-<div class="line"><a id="l01661" name="l01661"></a><span class="lineno"> 1661</span>        os &lt;&lt; <span class="stringliteral">&quot;Max&quot;</span>;</div>
-<div class="line"><a id="l01662" name="l01662"></a><span class="lineno"> 1662</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01663" name="l01663"></a><span class="lineno"> 1663</span>    }</div>
-<div class="line"><a id="l01664" name="l01664"></a><span class="lineno"> 1664</span>  }</div>
+<div class="line"><a id="l01629" name="l01629"></a><span class="lineno"> 1629</span>  std::pair&lt;<span class="keywordtype">bool</span>, std::vector&lt;<span class="keywordtype">size_t</span>&gt;&gt; prepare_reshape(</div>
+<div class="line"><a id="l01630" name="l01630"></a><span class="lineno"> 1630</span>      const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
+<div class="line"><a id="l01631" name="l01631"></a><span class="lineno"> 1631</span>      const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01632" name="l01632"></a><span class="lineno"> 1632</span>  <span class="keywordtype">void</span> shared_buffer_reshape(</div>
+<div class="line"><a id="l01633" name="l01633"></a><span class="lineno"> 1633</span>      const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in,</div>
+<div class="line"><a id="l01634" name="l01634"></a><span class="lineno"> 1634</span>      const std::vector&lt;<span class="keywordtype">size_t</span>&gt;&amp; out_strides,</div>
+<div class="line"><a id="l01635" name="l01635"></a><span class="lineno"> 1635</span>      <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01636" name="l01636"></a><span class="lineno"> 1636</span>};</div>
 </div>
-<div class="line"><a id="l01665" name="l01665"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e"> 1665</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) <span class="keyword">const override</span>;</div>
-<div class="line"><a id="l01666" name="l01666"></a><span class="lineno"> 1666</span> </div>
-<div class="line"><a id="l01667" name="l01667"></a><span class="lineno"> 1667</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l01668" name="l01668"></a><span class="lineno"> 1668</span>  <a class="code hl_enumeration" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">ReduceType</a> reduce_type_;</div>
-<div class="line"><a id="l01669" name="l01669"></a><span class="lineno"> 1669</span>  std::vector&lt;int&gt; axes_;</div>
-<div class="line"><a id="l01670" name="l01670"></a><span class="lineno"> 1670</span> </div>
-<div class="line"><a id="l01671" name="l01671"></a><span class="lineno"> 1671</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01672" name="l01672"></a><span class="lineno"> 1672</span>};</div>
+<div class="line"><a id="l01637" name="l01637"></a><span class="lineno"> 1637</span> </div>
+<div class="foldopen" id="foldopen01638" data-start="{" data-end="};">
+<div class="line"><a id="l01638" name="l01638"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html"> 1638</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_reduce.html">Reduce</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01639" name="l01639"></a><span class="lineno"> 1639</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01640" name="l01640"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93"> 1640</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">ReduceType</a> { <a class="code hl_enumvalue" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93">And</a>, <a class="code hl_struct" href="struct_or.html">Or</a>, <a class="code hl_struct" href="struct_sum.html">Sum</a>, <a class="code hl_struct" href="struct_prod.html">Prod</a>, <a class="code hl_struct" href="struct_min.html">Min</a>, <a class="code hl_struct" href="struct_max.html">Max</a> };</div>
+<div class="line"><a id="l01641" name="l01641"></a><span class="lineno"> 1641</span> </div>
+<div class="foldopen" id="foldopen01642" data-start="{" data-end="}">
+<div class="line"><a id="l01642" name="l01642"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a055368c1d036fb953a23ef230e33dcbf"> 1642</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#a055368c1d036fb953a23ef230e33dcbf">Reduce</a>(</div>
+<div class="line"><a id="l01643" name="l01643"></a><span class="lineno"> 1643</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01644" name="l01644"></a><span class="lineno"> 1644</span>      <a class="code hl_enumeration" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">ReduceType</a> reduce_type,</div>
+<div class="line"><a id="l01645" name="l01645"></a><span class="lineno"> 1645</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes)</div>
+<div class="line"><a id="l01646" name="l01646"></a><span class="lineno"> 1646</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), reduce_type_(reduce_type), axes_(axes) {}</div>
 </div>
-<div class="line"><a id="l01673" name="l01673"></a><span class="lineno"> 1673</span> </div>
-<div class="foldopen" id="foldopen01674" data-start="{" data-end="};">
-<div class="line"><a id="l01674" name="l01674"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html"> 1674</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_round.html">Round</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01675" name="l01675"></a><span class="lineno"> 1675</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01676" name="l01676"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde"> 1676</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde">Round</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01677" name="l01677"></a><span class="lineno"> 1677</span> </div>
-<div class="line"><a id="l01678" name="l01678"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007"> 1678</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01679" name="l01679"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec"> 1679</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01680" name="l01680"></a><span class="lineno"> 1680</span> </div>
-<div class="line"><a id="l01681" name="l01681"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd"> 1681</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01682" name="l01682"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7"> 1682</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01683" name="l01683"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72"> 1683</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_round.html">Round</a>)</div>
-<div class="line"><a id="l01684" name="l01684"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927"> 1684</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01685" name="l01685"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047"> 1685</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01686" name="l01686"></a><span class="lineno"> 1686</span> </div>
-<div class="line"><a id="l01687" name="l01687"></a><span class="lineno"> 1687</span> private:</div>
-<div class="line"><a id="l01688" name="l01688"></a><span class="lineno"> 1688</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01689" name="l01689"></a><span class="lineno"> 1689</span>};</div>
+<div class="line"><a id="l01647" name="l01647"></a><span class="lineno"> 1647</span> </div>
+<div class="line"><a id="l01648" name="l01648"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa"> 1648</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01649" name="l01649"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f"> 1649</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01650" name="l01650"></a><span class="lineno"> 1650</span> </div>
+<div class="line"><a id="l01651" name="l01651"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38"> 1651</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01652" name="l01652"></a><span class="lineno"> 1652</span> </div>
+<div class="line"><a id="l01653" name="l01653"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e"> 1653</a></span>  std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt; vjp(</div>
+<div class="line"><a id="l01654" name="l01654"></a><span class="lineno"> 1654</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; primals,</div>
+<div class="line"><a id="l01655" name="l01655"></a><span class="lineno"> 1655</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; cotangents,</div>
+<div class="line"><a id="l01656" name="l01656"></a><span class="lineno"> 1656</span>      const std::vector&lt;<span class="keywordtype">int</span>&gt;&amp; argnums,</div>
+<div class="line"><a id="l01657" name="l01657"></a><span class="lineno"> 1657</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; outputs) override;</div>
+<div class="line"><a id="l01658" name="l01658"></a><span class="lineno"> 1658</span> </div>
+<div class="line"><a id="l01659" name="l01659"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65"> 1659</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
+<div class="line"><a id="l01660" name="l01660"></a><span class="lineno"> 1660</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs) override;</div>
+<div class="line"><a id="l01661" name="l01661"></a><span class="lineno"> 1661</span> </div>
+<div class="foldopen" id="foldopen01662" data-start="{" data-end="}">
+<div class="line"><a id="l01662" name="l01662"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd"> 1662</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l01663" name="l01663"></a><span class="lineno"> 1663</span>    <span class="keywordflow">switch</span> (reduce_type_) {</div>
+<div class="line"><a id="l01664" name="l01664"></a><span class="lineno"> 1664</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_and.html">And</a>:</div>
+<div class="line"><a id="l01665" name="l01665"></a><span class="lineno"> 1665</span>        os &lt;&lt; <span class="stringliteral">&quot;And&quot;</span>;</div>
+<div class="line"><a id="l01666" name="l01666"></a><span class="lineno"> 1666</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01667" name="l01667"></a><span class="lineno"> 1667</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_or.html">Or</a>:</div>
+<div class="line"><a id="l01668" name="l01668"></a><span class="lineno"> 1668</span>        os &lt;&lt; <span class="stringliteral">&quot;Or&quot;</span>;</div>
+<div class="line"><a id="l01669" name="l01669"></a><span class="lineno"> 1669</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01670" name="l01670"></a><span class="lineno"> 1670</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_sum.html">Sum</a>:</div>
+<div class="line"><a id="l01671" name="l01671"></a><span class="lineno"> 1671</span>        os &lt;&lt; <span class="stringliteral">&quot;Sum&quot;</span>;</div>
+<div class="line"><a id="l01672" name="l01672"></a><span class="lineno"> 1672</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01673" name="l01673"></a><span class="lineno"> 1673</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_prod.html">Prod</a>:</div>
+<div class="line"><a id="l01674" name="l01674"></a><span class="lineno"> 1674</span>        os &lt;&lt; <span class="stringliteral">&quot;Prod&quot;</span>;</div>
+<div class="line"><a id="l01675" name="l01675"></a><span class="lineno"> 1675</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01676" name="l01676"></a><span class="lineno"> 1676</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_min.html">Min</a>:</div>
+<div class="line"><a id="l01677" name="l01677"></a><span class="lineno"> 1677</span>        os &lt;&lt; <span class="stringliteral">&quot;Min&quot;</span>;</div>
+<div class="line"><a id="l01678" name="l01678"></a><span class="lineno"> 1678</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01679" name="l01679"></a><span class="lineno"> 1679</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_max.html">Max</a>:</div>
+<div class="line"><a id="l01680" name="l01680"></a><span class="lineno"> 1680</span>        os &lt;&lt; <span class="stringliteral">&quot;Max&quot;</span>;</div>
+<div class="line"><a id="l01681" name="l01681"></a><span class="lineno"> 1681</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01682" name="l01682"></a><span class="lineno"> 1682</span>    }</div>
+<div class="line"><a id="l01683" name="l01683"></a><span class="lineno"> 1683</span>  }</div>
 </div>
-<div class="line"><a id="l01690" name="l01690"></a><span class="lineno"> 1690</span> </div>
-<div class="foldopen" id="foldopen01691" data-start="{" data-end="};">
-<div class="line"><a id="l01691" name="l01691"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html"> 1691</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_scan.html">Scan</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01692" name="l01692"></a><span class="lineno"> 1692</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01693" name="l01693"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d"> 1693</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">ReduceType</a> { <a class="code hl_enumvalue" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d">Max</a>, <a class="code hl_struct" href="struct_min.html">Min</a>, <a class="code hl_struct" href="struct_sum.html">Sum</a>, <a class="code hl_struct" href="struct_prod.html">Prod</a> };</div>
-<div class="line"><a id="l01694" name="l01694"></a><span class="lineno"> 1694</span> </div>
-<div class="foldopen" id="foldopen01695" data-start="{" data-end="}">
-<div class="line"><a id="l01695" name="l01695"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087"> 1695</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087">Scan</a>(</div>
-<div class="line"><a id="l01696" name="l01696"></a><span class="lineno"> 1696</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01697" name="l01697"></a><span class="lineno"> 1697</span>      <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">ReduceType</a> reduce_type,</div>
-<div class="line"><a id="l01698" name="l01698"></a><span class="lineno"> 1698</span>      <span class="keywordtype">int</span> axis,</div>
-<div class="line"><a id="l01699" name="l01699"></a><span class="lineno"> 1699</span>      <span class="keywordtype">bool</span> reverse,</div>
-<div class="line"><a id="l01700" name="l01700"></a><span class="lineno"> 1700</span>      <span class="keywordtype">bool</span> inclusive)</div>
-<div class="line"><a id="l01701" name="l01701"></a><span class="lineno"> 1701</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01702" name="l01702"></a><span class="lineno"> 1702</span>        reduce_type_(reduce_type),</div>
-<div class="line"><a id="l01703" name="l01703"></a><span class="lineno"> 1703</span>        axis_(axis),</div>
-<div class="line"><a id="l01704" name="l01704"></a><span class="lineno"> 1704</span>        reverse_(reverse),</div>
-<div class="line"><a id="l01705" name="l01705"></a><span class="lineno"> 1705</span>        inclusive_(inclusive) {}</div>
+<div class="line"><a id="l01684" name="l01684"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e"> 1684</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) <span class="keyword">const override</span>;</div>
+<div class="line"><a id="l01685" name="l01685"></a><span class="lineno"> 1685</span> </div>
+<div class="line"><a id="l01686" name="l01686"></a><span class="lineno"> 1686</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l01687" name="l01687"></a><span class="lineno"> 1687</span>  <a class="code hl_enumeration" href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">ReduceType</a> reduce_type_;</div>
+<div class="line"><a id="l01688" name="l01688"></a><span class="lineno"> 1688</span>  std::vector&lt;int&gt; axes_;</div>
+<div class="line"><a id="l01689" name="l01689"></a><span class="lineno"> 1689</span> </div>
+<div class="line"><a id="l01690" name="l01690"></a><span class="lineno"> 1690</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01691" name="l01691"></a><span class="lineno"> 1691</span>};</div>
+</div>
+<div class="line"><a id="l01692" name="l01692"></a><span class="lineno"> 1692</span> </div>
+<div class="foldopen" id="foldopen01693" data-start="{" data-end="};">
+<div class="line"><a id="l01693" name="l01693"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html"> 1693</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_round.html">Round</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01694" name="l01694"></a><span class="lineno"> 1694</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01695" name="l01695"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde"> 1695</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde">Round</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01696" name="l01696"></a><span class="lineno"> 1696</span> </div>
+<div class="line"><a id="l01697" name="l01697"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007"> 1697</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01698" name="l01698"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec"> 1698</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01699" name="l01699"></a><span class="lineno"> 1699</span> </div>
+<div class="line"><a id="l01700" name="l01700"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd"> 1700</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01701" name="l01701"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7"> 1701</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01702" name="l01702"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72"> 1702</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_round.html">Round</a>)</div>
+<div class="line"><a id="l01703" name="l01703"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927"> 1703</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01704" name="l01704"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047"> 1704</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01705" name="l01705"></a><span class="lineno"> 1705</span> </div>
+<div class="line"><a id="l01706" name="l01706"></a><span class="lineno"> 1706</span> private:</div>
+<div class="line"><a id="l01707" name="l01707"></a><span class="lineno"> 1707</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01708" name="l01708"></a><span class="lineno"> 1708</span>};</div>
 </div>
-<div class="line"><a id="l01706" name="l01706"></a><span class="lineno"> 1706</span> </div>
-<div class="line"><a id="l01707" name="l01707"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b"> 1707</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01708" name="l01708"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde"> 1708</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
 <div class="line"><a id="l01709" name="l01709"></a><span class="lineno"> 1709</span> </div>
-<div class="line"><a id="l01710" name="l01710"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804"> 1710</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01711" name="l01711"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee"> 1711</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>();</div>
-<div class="line"><a id="l01712" name="l01712"></a><span class="lineno"> 1712</span> </div>
-<div class="foldopen" id="foldopen01713" data-start="{" data-end="}">
-<div class="line"><a id="l01713" name="l01713"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22"> 1713</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l01714" name="l01714"></a><span class="lineno"> 1714</span>    os &lt;&lt; <span class="stringliteral">&quot;Cum&quot;</span>;</div>
-<div class="line"><a id="l01715" name="l01715"></a><span class="lineno"> 1715</span>    <span class="keywordflow">switch</span> (reduce_type_) {</div>
-<div class="line"><a id="l01716" name="l01716"></a><span class="lineno"> 1716</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_sum.html">Sum</a>:</div>
-<div class="line"><a id="l01717" name="l01717"></a><span class="lineno"> 1717</span>        os &lt;&lt; <span class="stringliteral">&quot;Sum&quot;</span>;</div>
-<div class="line"><a id="l01718" name="l01718"></a><span class="lineno"> 1718</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01719" name="l01719"></a><span class="lineno"> 1719</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_prod.html">Prod</a>:</div>
-<div class="line"><a id="l01720" name="l01720"></a><span class="lineno"> 1720</span>        os &lt;&lt; <span class="stringliteral">&quot;Prod&quot;</span>;</div>
-<div class="line"><a id="l01721" name="l01721"></a><span class="lineno"> 1721</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01722" name="l01722"></a><span class="lineno"> 1722</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_min.html">Min</a>:</div>
-<div class="line"><a id="l01723" name="l01723"></a><span class="lineno"> 1723</span>        os &lt;&lt; <span class="stringliteral">&quot;Min&quot;</span>;</div>
-<div class="line"><a id="l01724" name="l01724"></a><span class="lineno"> 1724</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01725" name="l01725"></a><span class="lineno"> 1725</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_max.html">Max</a>:</div>
-<div class="line"><a id="l01726" name="l01726"></a><span class="lineno"> 1726</span>        os &lt;&lt; <span class="stringliteral">&quot;Max&quot;</span>;</div>
-<div class="line"><a id="l01727" name="l01727"></a><span class="lineno"> 1727</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01728" name="l01728"></a><span class="lineno"> 1728</span>    }</div>
-<div class="line"><a id="l01729" name="l01729"></a><span class="lineno"> 1729</span>  }</div>
+<div class="foldopen" id="foldopen01710" data-start="{" data-end="};">
+<div class="line"><a id="l01710" name="l01710"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html"> 1710</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_scan.html">Scan</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01711" name="l01711"></a><span class="lineno"> 1711</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01712" name="l01712"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d"> 1712</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">ReduceType</a> { <a class="code hl_enumvalue" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d">Max</a>, <a class="code hl_struct" href="struct_min.html">Min</a>, <a class="code hl_struct" href="struct_sum.html">Sum</a>, <a class="code hl_struct" href="struct_prod.html">Prod</a> };</div>
+<div class="line"><a id="l01713" name="l01713"></a><span class="lineno"> 1713</span> </div>
+<div class="foldopen" id="foldopen01714" data-start="{" data-end="}">
+<div class="line"><a id="l01714" name="l01714"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087"> 1714</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087">Scan</a>(</div>
+<div class="line"><a id="l01715" name="l01715"></a><span class="lineno"> 1715</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01716" name="l01716"></a><span class="lineno"> 1716</span>      <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">ReduceType</a> reduce_type,</div>
+<div class="line"><a id="l01717" name="l01717"></a><span class="lineno"> 1717</span>      <span class="keywordtype">int</span> axis,</div>
+<div class="line"><a id="l01718" name="l01718"></a><span class="lineno"> 1718</span>      <span class="keywordtype">bool</span> reverse,</div>
+<div class="line"><a id="l01719" name="l01719"></a><span class="lineno"> 1719</span>      <span class="keywordtype">bool</span> inclusive)</div>
+<div class="line"><a id="l01720" name="l01720"></a><span class="lineno"> 1720</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01721" name="l01721"></a><span class="lineno"> 1721</span>        reduce_type_(reduce_type),</div>
+<div class="line"><a id="l01722" name="l01722"></a><span class="lineno"> 1722</span>        axis_(axis),</div>
+<div class="line"><a id="l01723" name="l01723"></a><span class="lineno"> 1723</span>        reverse_(reverse),</div>
+<div class="line"><a id="l01724" name="l01724"></a><span class="lineno"> 1724</span>        inclusive_(inclusive) {}</div>
 </div>
-<div class="line"><a id="l01730" name="l01730"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6"> 1730</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) <span class="keyword">const override</span>;</div>
+<div class="line"><a id="l01725" name="l01725"></a><span class="lineno"> 1725</span> </div>
+<div class="line"><a id="l01726" name="l01726"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b"> 1726</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01727" name="l01727"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde"> 1727</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01728" name="l01728"></a><span class="lineno"> 1728</span> </div>
+<div class="line"><a id="l01729" name="l01729"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804"> 1729</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01730" name="l01730"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee"> 1730</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>();</div>
 <div class="line"><a id="l01731" name="l01731"></a><span class="lineno"> 1731</span> </div>
-<div class="line"><a id="l01732" name="l01732"></a><span class="lineno"> 1732</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l01733" name="l01733"></a><span class="lineno"> 1733</span>  <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">ReduceType</a> reduce_type_;</div>
-<div class="line"><a id="l01734" name="l01734"></a><span class="lineno"> 1734</span>  <span class="keywordtype">int</span> axis_;</div>
-<div class="line"><a id="l01735" name="l01735"></a><span class="lineno"> 1735</span>  <span class="keywordtype">bool</span> reverse_;</div>
-<div class="line"><a id="l01736" name="l01736"></a><span class="lineno"> 1736</span>  <span class="keywordtype">bool</span> inclusive_;</div>
-<div class="line"><a id="l01737" name="l01737"></a><span class="lineno"> 1737</span> </div>
-<div class="line"><a id="l01738" name="l01738"></a><span class="lineno"> 1738</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01739" name="l01739"></a><span class="lineno"> 1739</span>};</div>
-</div>
-<div class="line"><a id="l01740" name="l01740"></a><span class="lineno"> 1740</span> </div>
-<div class="foldopen" id="foldopen01741" data-start="{" data-end="};">
-<div class="line"><a id="l01741" name="l01741"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html"> 1741</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_scatter.html">Scatter</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01742" name="l01742"></a><span class="lineno"> 1742</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01743" name="l01743"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16"> 1743</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">ReduceType</a> { <a class="code hl_enumvalue" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16">Max</a>, <a class="code hl_struct" href="struct_min.html">Min</a>, <a class="code hl_struct" href="struct_sum.html">Sum</a>, <a class="code hl_struct" href="struct_prod.html">Prod</a>, <a class="code hl_struct" href="struct_none.html">None</a> };</div>
-<div class="line"><a id="l01744" name="l01744"></a><span class="lineno"> 1744</span> </div>
-<div class="foldopen" id="foldopen01745" data-start="{" data-end="}">
-<div class="line"><a id="l01745" name="l01745"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3"> 1745</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3">Scatter</a>(</div>
-<div class="line"><a id="l01746" name="l01746"></a><span class="lineno"> 1746</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01747" name="l01747"></a><span class="lineno"> 1747</span>      <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">ReduceType</a> reduce_type,</div>
-<div class="line"><a id="l01748" name="l01748"></a><span class="lineno"> 1748</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes)</div>
-<div class="line"><a id="l01749" name="l01749"></a><span class="lineno"> 1749</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), reduce_type_(reduce_type), axes_(axes) {}</div>
+<div class="foldopen" id="foldopen01732" data-start="{" data-end="}">
+<div class="line"><a id="l01732" name="l01732"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22"> 1732</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l01733" name="l01733"></a><span class="lineno"> 1733</span>    os &lt;&lt; <span class="stringliteral">&quot;Cum&quot;</span>;</div>
+<div class="line"><a id="l01734" name="l01734"></a><span class="lineno"> 1734</span>    <span class="keywordflow">switch</span> (reduce_type_) {</div>
+<div class="line"><a id="l01735" name="l01735"></a><span class="lineno"> 1735</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_sum.html">Sum</a>:</div>
+<div class="line"><a id="l01736" name="l01736"></a><span class="lineno"> 1736</span>        os &lt;&lt; <span class="stringliteral">&quot;Sum&quot;</span>;</div>
+<div class="line"><a id="l01737" name="l01737"></a><span class="lineno"> 1737</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01738" name="l01738"></a><span class="lineno"> 1738</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_prod.html">Prod</a>:</div>
+<div class="line"><a id="l01739" name="l01739"></a><span class="lineno"> 1739</span>        os &lt;&lt; <span class="stringliteral">&quot;Prod&quot;</span>;</div>
+<div class="line"><a id="l01740" name="l01740"></a><span class="lineno"> 1740</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01741" name="l01741"></a><span class="lineno"> 1741</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_min.html">Min</a>:</div>
+<div class="line"><a id="l01742" name="l01742"></a><span class="lineno"> 1742</span>        os &lt;&lt; <span class="stringliteral">&quot;Min&quot;</span>;</div>
+<div class="line"><a id="l01743" name="l01743"></a><span class="lineno"> 1743</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01744" name="l01744"></a><span class="lineno"> 1744</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_max.html">Max</a>:</div>
+<div class="line"><a id="l01745" name="l01745"></a><span class="lineno"> 1745</span>        os &lt;&lt; <span class="stringliteral">&quot;Max&quot;</span>;</div>
+<div class="line"><a id="l01746" name="l01746"></a><span class="lineno"> 1746</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01747" name="l01747"></a><span class="lineno"> 1747</span>    }</div>
+<div class="line"><a id="l01748" name="l01748"></a><span class="lineno"> 1748</span>  }</div>
 </div>
+<div class="line"><a id="l01749" name="l01749"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6"> 1749</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) <span class="keyword">const override</span>;</div>
 <div class="line"><a id="l01750" name="l01750"></a><span class="lineno"> 1750</span> </div>
-<div class="line"><a id="l01751" name="l01751"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97"> 1751</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01752" name="l01752"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678"> 1752</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01753" name="l01753"></a><span class="lineno"> 1753</span> </div>
-<div class="line"><a id="l01754" name="l01754"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322"> 1754</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>();</div>
-<div class="line"><a id="l01755" name="l01755"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934"> 1755</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>();</div>
+<div class="line"><a id="l01751" name="l01751"></a><span class="lineno"> 1751</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l01752" name="l01752"></a><span class="lineno"> 1752</span>  <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">ReduceType</a> reduce_type_;</div>
+<div class="line"><a id="l01753" name="l01753"></a><span class="lineno"> 1753</span>  <span class="keywordtype">int</span> axis_;</div>
+<div class="line"><a id="l01754" name="l01754"></a><span class="lineno"> 1754</span>  <span class="keywordtype">bool</span> reverse_;</div>
+<div class="line"><a id="l01755" name="l01755"></a><span class="lineno"> 1755</span>  <span class="keywordtype">bool</span> inclusive_;</div>
 <div class="line"><a id="l01756" name="l01756"></a><span class="lineno"> 1756</span> </div>
-<div class="foldopen" id="foldopen01757" data-start="{" data-end="}">
-<div class="line"><a id="l01757" name="l01757"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa"> 1757</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">print</a>(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l01758" name="l01758"></a><span class="lineno"> 1758</span>    os &lt;&lt; <span class="stringliteral">&quot;Scatter&quot;</span>;</div>
-<div class="line"><a id="l01759" name="l01759"></a><span class="lineno"> 1759</span>    <span class="keywordflow">switch</span> (reduce_type_) {</div>
-<div class="line"><a id="l01760" name="l01760"></a><span class="lineno"> 1760</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_sum.html">Sum</a>:</div>
-<div class="line"><a id="l01761" name="l01761"></a><span class="lineno"> 1761</span>        os &lt;&lt; <span class="stringliteral">&quot; Sum&quot;</span>;</div>
-<div class="line"><a id="l01762" name="l01762"></a><span class="lineno"> 1762</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01763" name="l01763"></a><span class="lineno"> 1763</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_prod.html">Prod</a>:</div>
-<div class="line"><a id="l01764" name="l01764"></a><span class="lineno"> 1764</span>        os &lt;&lt; <span class="stringliteral">&quot; Prod&quot;</span>;</div>
-<div class="line"><a id="l01765" name="l01765"></a><span class="lineno"> 1765</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01766" name="l01766"></a><span class="lineno"> 1766</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_min.html">Min</a>:</div>
-<div class="line"><a id="l01767" name="l01767"></a><span class="lineno"> 1767</span>        os &lt;&lt; <span class="stringliteral">&quot; Min&quot;</span>;</div>
-<div class="line"><a id="l01768" name="l01768"></a><span class="lineno"> 1768</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01769" name="l01769"></a><span class="lineno"> 1769</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_max.html">Max</a>:</div>
-<div class="line"><a id="l01770" name="l01770"></a><span class="lineno"> 1770</span>        os &lt;&lt; <span class="stringliteral">&quot; Max&quot;</span>;</div>
-<div class="line"><a id="l01771" name="l01771"></a><span class="lineno"> 1771</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01772" name="l01772"></a><span class="lineno"> 1772</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_none.html">None</a>:</div>
-<div class="line"><a id="l01773" name="l01773"></a><span class="lineno"> 1773</span>        <span class="keywordflow">break</span>;</div>
-<div class="line"><a id="l01774" name="l01774"></a><span class="lineno"> 1774</span>    }</div>
-<div class="line"><a id="l01775" name="l01775"></a><span class="lineno"> 1775</span>  }</div>
+<div class="line"><a id="l01757" name="l01757"></a><span class="lineno"> 1757</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01758" name="l01758"></a><span class="lineno"> 1758</span>};</div>
 </div>
-<div class="line"><a id="l01776" name="l01776"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f"> 1776</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) <span class="keyword">const override</span>;</div>
-<div class="line"><a id="l01777" name="l01777"></a><span class="lineno"> 1777</span> </div>
-<div class="line"><a id="l01778" name="l01778"></a><span class="lineno"> 1778</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l01779" name="l01779"></a><span class="lineno"> 1779</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01780" name="l01780"></a><span class="lineno"> 1780</span>  <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">ReduceType</a> reduce_type_;</div>
-<div class="line"><a id="l01781" name="l01781"></a><span class="lineno"> 1781</span>  std::vector&lt;int&gt; axes_;</div>
-<div class="line"><a id="l01782" name="l01782"></a><span class="lineno"> 1782</span>};</div>
+<div class="line"><a id="l01759" name="l01759"></a><span class="lineno"> 1759</span> </div>
+<div class="foldopen" id="foldopen01760" data-start="{" data-end="};">
+<div class="line"><a id="l01760" name="l01760"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html"> 1760</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_scatter.html">Scatter</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01761" name="l01761"></a><span class="lineno"> 1761</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01762" name="l01762"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16"> 1762</a></span>  <span class="keyword">enum</span> <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">ReduceType</a> { <a class="code hl_enumvalue" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16">Max</a>, <a class="code hl_struct" href="struct_min.html">Min</a>, <a class="code hl_struct" href="struct_sum.html">Sum</a>, <a class="code hl_struct" href="struct_prod.html">Prod</a>, <a class="code hl_struct" href="struct_none.html">None</a> };</div>
+<div class="line"><a id="l01763" name="l01763"></a><span class="lineno"> 1763</span> </div>
+<div class="foldopen" id="foldopen01764" data-start="{" data-end="}">
+<div class="line"><a id="l01764" name="l01764"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3"> 1764</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3">Scatter</a>(</div>
+<div class="line"><a id="l01765" name="l01765"></a><span class="lineno"> 1765</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01766" name="l01766"></a><span class="lineno"> 1766</span>      <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">ReduceType</a> reduce_type,</div>
+<div class="line"><a id="l01767" name="l01767"></a><span class="lineno"> 1767</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes)</div>
+<div class="line"><a id="l01768" name="l01768"></a><span class="lineno"> 1768</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), reduce_type_(reduce_type), axes_(axes) {}</div>
 </div>
-<div class="line"><a id="l01783" name="l01783"></a><span class="lineno"> 1783</span> </div>
-<div class="foldopen" id="foldopen01784" data-start="{" data-end="};">
-<div class="line"><a id="l01784" name="l01784"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html"> 1784</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_sigmoid.html">Sigmoid</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01785" name="l01785"></a><span class="lineno"> 1785</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01786" name="l01786"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b"> 1786</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b">Sigmoid</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01787" name="l01787"></a><span class="lineno"> 1787</span> </div>
-<div class="line"><a id="l01788" name="l01788"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255"> 1788</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01789" name="l01789"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca"> 1789</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01790" name="l01790"></a><span class="lineno"> 1790</span> </div>
-<div class="line"><a id="l01791" name="l01791"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85"> 1791</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01792" name="l01792"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db"> 1792</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01793" name="l01793"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2"> 1793</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sigmoid.html">Sigmoid</a>)</div>
-<div class="line"><a id="l01794" name="l01794"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e"> 1794</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01795" name="l01795"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43"> 1795</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01769" name="l01769"></a><span class="lineno"> 1769</span> </div>
+<div class="line"><a id="l01770" name="l01770"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97"> 1770</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01771" name="l01771"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678"> 1771</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01772" name="l01772"></a><span class="lineno"> 1772</span> </div>
+<div class="line"><a id="l01773" name="l01773"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322"> 1773</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>();</div>
+<div class="line"><a id="l01774" name="l01774"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934"> 1774</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>();</div>
+<div class="line"><a id="l01775" name="l01775"></a><span class="lineno"> 1775</span> </div>
+<div class="foldopen" id="foldopen01776" data-start="{" data-end="}">
+<div class="line"><a id="l01776" name="l01776"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa"> 1776</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">print</a>(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l01777" name="l01777"></a><span class="lineno"> 1777</span>    os &lt;&lt; <span class="stringliteral">&quot;Scatter&quot;</span>;</div>
+<div class="line"><a id="l01778" name="l01778"></a><span class="lineno"> 1778</span>    <span class="keywordflow">switch</span> (reduce_type_) {</div>
+<div class="line"><a id="l01779" name="l01779"></a><span class="lineno"> 1779</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_sum.html">Sum</a>:</div>
+<div class="line"><a id="l01780" name="l01780"></a><span class="lineno"> 1780</span>        os &lt;&lt; <span class="stringliteral">&quot; Sum&quot;</span>;</div>
+<div class="line"><a id="l01781" name="l01781"></a><span class="lineno"> 1781</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01782" name="l01782"></a><span class="lineno"> 1782</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_prod.html">Prod</a>:</div>
+<div class="line"><a id="l01783" name="l01783"></a><span class="lineno"> 1783</span>        os &lt;&lt; <span class="stringliteral">&quot; Prod&quot;</span>;</div>
+<div class="line"><a id="l01784" name="l01784"></a><span class="lineno"> 1784</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01785" name="l01785"></a><span class="lineno"> 1785</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_min.html">Min</a>:</div>
+<div class="line"><a id="l01786" name="l01786"></a><span class="lineno"> 1786</span>        os &lt;&lt; <span class="stringliteral">&quot; Min&quot;</span>;</div>
+<div class="line"><a id="l01787" name="l01787"></a><span class="lineno"> 1787</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01788" name="l01788"></a><span class="lineno"> 1788</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_max.html">Max</a>:</div>
+<div class="line"><a id="l01789" name="l01789"></a><span class="lineno"> 1789</span>        os &lt;&lt; <span class="stringliteral">&quot; Max&quot;</span>;</div>
+<div class="line"><a id="l01790" name="l01790"></a><span class="lineno"> 1790</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01791" name="l01791"></a><span class="lineno"> 1791</span>      <span class="keywordflow">case</span> <a class="code hl_struct" href="struct_none.html">None</a>:</div>
+<div class="line"><a id="l01792" name="l01792"></a><span class="lineno"> 1792</span>        <span class="keywordflow">break</span>;</div>
+<div class="line"><a id="l01793" name="l01793"></a><span class="lineno"> 1793</span>    }</div>
+<div class="line"><a id="l01794" name="l01794"></a><span class="lineno"> 1794</span>  }</div>
+</div>
+<div class="line"><a id="l01795" name="l01795"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f"> 1795</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) <span class="keyword">const override</span>;</div>
 <div class="line"><a id="l01796" name="l01796"></a><span class="lineno"> 1796</span> </div>
-<div class="line"><a id="l01797" name="l01797"></a><span class="lineno"> 1797</span> private:</div>
-<div class="line"><a id="l01798" name="l01798"></a><span class="lineno"> 1798</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01799" name="l01799"></a><span class="lineno"> 1799</span>};</div>
+<div class="line"><a id="l01797" name="l01797"></a><span class="lineno"> 1797</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l01798" name="l01798"></a><span class="lineno"> 1798</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01799" name="l01799"></a><span class="lineno"> 1799</span>  <a class="code hl_enumeration" href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">ReduceType</a> reduce_type_;</div>
+<div class="line"><a id="l01800" name="l01800"></a><span class="lineno"> 1800</span>  std::vector&lt;int&gt; axes_;</div>
+<div class="line"><a id="l01801" name="l01801"></a><span class="lineno"> 1801</span>};</div>
 </div>
-<div class="line"><a id="l01800" name="l01800"></a><span class="lineno"> 1800</span> </div>
-<div class="foldopen" id="foldopen01801" data-start="{" data-end="};">
-<div class="line"><a id="l01801" name="l01801"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html"> 1801</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sign.html">Sign</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01802" name="l01802"></a><span class="lineno"> 1802</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01803" name="l01803"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763"> 1803</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763">Sign</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01804" name="l01804"></a><span class="lineno"> 1804</span> </div>
-<div class="line"><a id="l01805" name="l01805"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97"> 1805</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01806" name="l01806"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b"> 1806</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01807" name="l01807"></a><span class="lineno"> 1807</span> </div>
-<div class="line"><a id="l01808" name="l01808"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295"> 1808</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01809" name="l01809"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b"> 1809</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01810" name="l01810"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a"> 1810</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sign.html">Sign</a>)</div>
-<div class="line"><a id="l01811" name="l01811"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb"> 1811</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01812" name="l01812"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67"> 1812</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01813" name="l01813"></a><span class="lineno"> 1813</span> </div>
-<div class="line"><a id="l01814" name="l01814"></a><span class="lineno"> 1814</span> private:</div>
-<div class="line"><a id="l01815" name="l01815"></a><span class="lineno"> 1815</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01816" name="l01816"></a><span class="lineno"> 1816</span>};</div>
+<div class="line"><a id="l01802" name="l01802"></a><span class="lineno"> 1802</span> </div>
+<div class="foldopen" id="foldopen01803" data-start="{" data-end="};">
+<div class="line"><a id="l01803" name="l01803"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html"> 1803</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_sigmoid.html">Sigmoid</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01804" name="l01804"></a><span class="lineno"> 1804</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01805" name="l01805"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b"> 1805</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b">Sigmoid</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01806" name="l01806"></a><span class="lineno"> 1806</span> </div>
+<div class="line"><a id="l01807" name="l01807"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255"> 1807</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01808" name="l01808"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca"> 1808</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01809" name="l01809"></a><span class="lineno"> 1809</span> </div>
+<div class="line"><a id="l01810" name="l01810"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85"> 1810</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01811" name="l01811"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db"> 1811</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01812" name="l01812"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2"> 1812</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sigmoid.html">Sigmoid</a>)</div>
+<div class="line"><a id="l01813" name="l01813"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e"> 1813</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01814" name="l01814"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43"> 1814</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01815" name="l01815"></a><span class="lineno"> 1815</span> </div>
+<div class="line"><a id="l01816" name="l01816"></a><span class="lineno"> 1816</span> private:</div>
+<div class="line"><a id="l01817" name="l01817"></a><span class="lineno"> 1817</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01818" name="l01818"></a><span class="lineno"> 1818</span>};</div>
 </div>
-<div class="line"><a id="l01817" name="l01817"></a><span class="lineno"> 1817</span> </div>
-<div class="foldopen" id="foldopen01818" data-start="{" data-end="};">
-<div class="line"><a id="l01818" name="l01818"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html"> 1818</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sin.html">Sin</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01819" name="l01819"></a><span class="lineno"> 1819</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01820" name="l01820"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea"> 1820</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea">Sin</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01821" name="l01821"></a><span class="lineno"> 1821</span> </div>
-<div class="line"><a id="l01822" name="l01822"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5"> 1822</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01823" name="l01823"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e"> 1823</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01824" name="l01824"></a><span class="lineno"> 1824</span> </div>
-<div class="line"><a id="l01825" name="l01825"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba"> 1825</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01826" name="l01826"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de"> 1826</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01827" name="l01827"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4"> 1827</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sin.html">Sin</a>)</div>
-<div class="line"><a id="l01828" name="l01828"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a"> 1828</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01829" name="l01829"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a"> 1829</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01830" name="l01830"></a><span class="lineno"> 1830</span> </div>
-<div class="line"><a id="l01831" name="l01831"></a><span class="lineno"> 1831</span> private:</div>
-<div class="line"><a id="l01832" name="l01832"></a><span class="lineno"> 1832</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01833" name="l01833"></a><span class="lineno"> 1833</span>};</div>
+<div class="line"><a id="l01819" name="l01819"></a><span class="lineno"> 1819</span> </div>
+<div class="foldopen" id="foldopen01820" data-start="{" data-end="};">
+<div class="line"><a id="l01820" name="l01820"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html"> 1820</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sign.html">Sign</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01821" name="l01821"></a><span class="lineno"> 1821</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01822" name="l01822"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763"> 1822</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763">Sign</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01823" name="l01823"></a><span class="lineno"> 1823</span> </div>
+<div class="line"><a id="l01824" name="l01824"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97"> 1824</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01825" name="l01825"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b"> 1825</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01826" name="l01826"></a><span class="lineno"> 1826</span> </div>
+<div class="line"><a id="l01827" name="l01827"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295"> 1827</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01828" name="l01828"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b"> 1828</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01829" name="l01829"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a"> 1829</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sign.html">Sign</a>)</div>
+<div class="line"><a id="l01830" name="l01830"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb"> 1830</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01831" name="l01831"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67"> 1831</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01832" name="l01832"></a><span class="lineno"> 1832</span> </div>
+<div class="line"><a id="l01833" name="l01833"></a><span class="lineno"> 1833</span> private:</div>
+<div class="line"><a id="l01834" name="l01834"></a><span class="lineno"> 1834</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01835" name="l01835"></a><span class="lineno"> 1835</span>};</div>
 </div>
-<div class="line"><a id="l01834" name="l01834"></a><span class="lineno"> 1834</span> </div>
-<div class="foldopen" id="foldopen01835" data-start="{" data-end="};">
-<div class="line"><a id="l01835" name="l01835"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html"> 1835</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sinh.html">Sinh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01836" name="l01836"></a><span class="lineno"> 1836</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01837" name="l01837"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96"> 1837</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96">Sinh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01838" name="l01838"></a><span class="lineno"> 1838</span> </div>
-<div class="line"><a id="l01839" name="l01839"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd"> 1839</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01840" name="l01840"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75"> 1840</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01841" name="l01841"></a><span class="lineno"> 1841</span> </div>
-<div class="line"><a id="l01842" name="l01842"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788"> 1842</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01843" name="l01843"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c"> 1843</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01844" name="l01844"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77"> 1844</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sinh.html">Sinh</a>)</div>
-<div class="line"><a id="l01845" name="l01845"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d"> 1845</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01846" name="l01846"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28"> 1846</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01847" name="l01847"></a><span class="lineno"> 1847</span> </div>
-<div class="line"><a id="l01848" name="l01848"></a><span class="lineno"> 1848</span> private:</div>
-<div class="line"><a id="l01849" name="l01849"></a><span class="lineno"> 1849</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01850" name="l01850"></a><span class="lineno"> 1850</span>};</div>
+<div class="line"><a id="l01836" name="l01836"></a><span class="lineno"> 1836</span> </div>
+<div class="foldopen" id="foldopen01837" data-start="{" data-end="};">
+<div class="line"><a id="l01837" name="l01837"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html"> 1837</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sin.html">Sin</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01838" name="l01838"></a><span class="lineno"> 1838</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01839" name="l01839"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea"> 1839</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea">Sin</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01840" name="l01840"></a><span class="lineno"> 1840</span> </div>
+<div class="line"><a id="l01841" name="l01841"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5"> 1841</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01842" name="l01842"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e"> 1842</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01843" name="l01843"></a><span class="lineno"> 1843</span> </div>
+<div class="line"><a id="l01844" name="l01844"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba"> 1844</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01845" name="l01845"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de"> 1845</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01846" name="l01846"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4"> 1846</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sin.html">Sin</a>)</div>
+<div class="line"><a id="l01847" name="l01847"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a"> 1847</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01848" name="l01848"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a"> 1848</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01849" name="l01849"></a><span class="lineno"> 1849</span> </div>
+<div class="line"><a id="l01850" name="l01850"></a><span class="lineno"> 1850</span> private:</div>
+<div class="line"><a id="l01851" name="l01851"></a><span class="lineno"> 1851</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01852" name="l01852"></a><span class="lineno"> 1852</span>};</div>
 </div>
-<div class="line"><a id="l01851" name="l01851"></a><span class="lineno"> 1851</span> </div>
-<div class="foldopen" id="foldopen01852" data-start="{" data-end="};">
-<div class="line"><a id="l01852" name="l01852"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html"> 1852</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_slice.html">Slice</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01853" name="l01853"></a><span class="lineno"> 1853</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01854" data-start="{" data-end="}">
-<div class="line"><a id="l01854" name="l01854"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f"> 1854</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f">Slice</a>(</div>
-<div class="line"><a id="l01855" name="l01855"></a><span class="lineno"> 1855</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01856" name="l01856"></a><span class="lineno"> 1856</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; start_indices,</div>
-<div class="line"><a id="l01857" name="l01857"></a><span class="lineno"> 1857</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; end_indices,</div>
-<div class="line"><a id="l01858" name="l01858"></a><span class="lineno"> 1858</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; strides)</div>
-<div class="line"><a id="l01859" name="l01859"></a><span class="lineno"> 1859</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01860" name="l01860"></a><span class="lineno"> 1860</span>        start_indices_(start_indices),</div>
-<div class="line"><a id="l01861" name="l01861"></a><span class="lineno"> 1861</span>        end_indices_(end_indices),</div>
-<div class="line"><a id="l01862" name="l01862"></a><span class="lineno"> 1862</span>        strides_(strides) {}</div>
-</div>
-<div class="line"><a id="l01863" name="l01863"></a><span class="lineno"> 1863</span> </div>
-<div class="line"><a id="l01864" name="l01864"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2"> 1864</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01865" name="l01865"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a"> 1865</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01853" name="l01853"></a><span class="lineno"> 1853</span> </div>
+<div class="foldopen" id="foldopen01854" data-start="{" data-end="};">
+<div class="line"><a id="l01854" name="l01854"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html"> 1854</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sinh.html">Sinh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01855" name="l01855"></a><span class="lineno"> 1855</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01856" name="l01856"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96"> 1856</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96">Sinh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01857" name="l01857"></a><span class="lineno"> 1857</span> </div>
+<div class="line"><a id="l01858" name="l01858"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd"> 1858</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01859" name="l01859"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75"> 1859</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01860" name="l01860"></a><span class="lineno"> 1860</span> </div>
+<div class="line"><a id="l01861" name="l01861"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788"> 1861</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01862" name="l01862"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c"> 1862</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01863" name="l01863"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77"> 1863</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sinh.html">Sinh</a>)</div>
+<div class="line"><a id="l01864" name="l01864"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d"> 1864</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l01865" name="l01865"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28"> 1865</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
 <div class="line"><a id="l01866" name="l01866"></a><span class="lineno"> 1866</span> </div>
-<div class="line"><a id="l01867" name="l01867"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2"> 1867</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01868" name="l01868"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36"> 1868</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01869" name="l01869"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504"> 1869</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_slice.html">Slice</a>)</div>
-<div class="line"><a id="l01870" name="l01870"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0"> 1870</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01871" name="l01871"></a><span class="lineno"> 1871</span> </div>
-<div class="line"><a id="l01872" name="l01872"></a><span class="lineno"> 1872</span> private:</div>
-<div class="line"><a id="l01873" name="l01873"></a><span class="lineno"> 1873</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; start_indices_;</div>
-<div class="line"><a id="l01874" name="l01874"></a><span class="lineno"> 1874</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; end_indices_;</div>
-<div class="line"><a id="l01875" name="l01875"></a><span class="lineno"> 1875</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; strides_;</div>
-<div class="line"><a id="l01876" name="l01876"></a><span class="lineno"> 1876</span> </div>
-<div class="line"><a id="l01877" name="l01877"></a><span class="lineno"> 1877</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01878" name="l01878"></a><span class="lineno"> 1878</span>};</div>
+<div class="line"><a id="l01867" name="l01867"></a><span class="lineno"> 1867</span> private:</div>
+<div class="line"><a id="l01868" name="l01868"></a><span class="lineno"> 1868</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01869" name="l01869"></a><span class="lineno"> 1869</span>};</div>
 </div>
-<div class="line"><a id="l01879" name="l01879"></a><span class="lineno"> 1879</span> </div>
-<div class="foldopen" id="foldopen01880" data-start="{" data-end="};">
-<div class="line"><a id="l01880" name="l01880"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html"> 1880</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_slice_update.html">SliceUpdate</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01881" name="l01881"></a><span class="lineno"> 1881</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01882" data-start="{" data-end="}">
-<div class="line"><a id="l01882" name="l01882"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990"> 1882</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990">SliceUpdate</a>(</div>
-<div class="line"><a id="l01883" name="l01883"></a><span class="lineno"> 1883</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
-<div class="line"><a id="l01884" name="l01884"></a><span class="lineno"> 1884</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; start_indices,</div>
-<div class="line"><a id="l01885" name="l01885"></a><span class="lineno"> 1885</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; end_indices,</div>
-<div class="line"><a id="l01886" name="l01886"></a><span class="lineno"> 1886</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; strides)</div>
-<div class="line"><a id="l01887" name="l01887"></a><span class="lineno"> 1887</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
-<div class="line"><a id="l01888" name="l01888"></a><span class="lineno"> 1888</span>        start_indices_(start_indices),</div>
-<div class="line"><a id="l01889" name="l01889"></a><span class="lineno"> 1889</span>        end_indices_(end_indices),</div>
-<div class="line"><a id="l01890" name="l01890"></a><span class="lineno"> 1890</span>        strides_(strides) {}</div>
+<div class="line"><a id="l01870" name="l01870"></a><span class="lineno"> 1870</span> </div>
+<div class="foldopen" id="foldopen01871" data-start="{" data-end="};">
+<div class="line"><a id="l01871" name="l01871"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html"> 1871</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_slice.html">Slice</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01872" name="l01872"></a><span class="lineno"> 1872</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01873" data-start="{" data-end="}">
+<div class="line"><a id="l01873" name="l01873"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f"> 1873</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f">Slice</a>(</div>
+<div class="line"><a id="l01874" name="l01874"></a><span class="lineno"> 1874</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01875" name="l01875"></a><span class="lineno"> 1875</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; start_indices,</div>
+<div class="line"><a id="l01876" name="l01876"></a><span class="lineno"> 1876</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; end_indices,</div>
+<div class="line"><a id="l01877" name="l01877"></a><span class="lineno"> 1877</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; strides)</div>
+<div class="line"><a id="l01878" name="l01878"></a><span class="lineno"> 1878</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01879" name="l01879"></a><span class="lineno"> 1879</span>        start_indices_(start_indices),</div>
+<div class="line"><a id="l01880" name="l01880"></a><span class="lineno"> 1880</span>        end_indices_(end_indices),</div>
+<div class="line"><a id="l01881" name="l01881"></a><span class="lineno"> 1881</span>        strides_(strides) {}</div>
 </div>
-<div class="line"><a id="l01891" name="l01891"></a><span class="lineno"> 1891</span> </div>
-<div class="line"><a id="l01892" name="l01892"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b"> 1892</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01893" name="l01893"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b"> 1893</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01894" name="l01894"></a><span class="lineno"> 1894</span> </div>
-<div class="line"><a id="l01895" name="l01895"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3"> 1895</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01896" name="l01896"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611"> 1896</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01897" name="l01897"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b"> 1897</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_slice_update.html">SliceUpdate</a>)</div>
-<div class="line"><a id="l01898" name="l01898"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119"> 1898</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01899" name="l01899"></a><span class="lineno"> 1899</span> </div>
-<div class="line"><a id="l01900" name="l01900"></a><span class="lineno"> 1900</span> private:</div>
-<div class="line"><a id="l01901" name="l01901"></a><span class="lineno"> 1901</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; start_indices_;</div>
-<div class="line"><a id="l01902" name="l01902"></a><span class="lineno"> 1902</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; end_indices_;</div>
-<div class="line"><a id="l01903" name="l01903"></a><span class="lineno"> 1903</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; strides_;</div>
-<div class="line"><a id="l01904" name="l01904"></a><span class="lineno"> 1904</span> </div>
-<div class="line"><a id="l01905" name="l01905"></a><span class="lineno"> 1905</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01906" name="l01906"></a><span class="lineno"> 1906</span> </div>
-<div class="line"><a id="l01907" name="l01907"></a><span class="lineno"> 1907</span>  std::tuple&lt;int64_t, std::vector&lt;int64_t&gt;&gt; prepare_slice(const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in);</div>
-<div class="line"><a id="l01908" name="l01908"></a><span class="lineno"> 1908</span>};</div>
+<div class="line"><a id="l01882" name="l01882"></a><span class="lineno"> 1882</span> </div>
+<div class="line"><a id="l01883" name="l01883"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2"> 1883</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01884" name="l01884"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a"> 1884</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01885" name="l01885"></a><span class="lineno"> 1885</span> </div>
+<div class="line"><a id="l01886" name="l01886"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2"> 1886</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01887" name="l01887"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36"> 1887</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01888" name="l01888"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504"> 1888</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_slice.html">Slice</a>)</div>
+<div class="line"><a id="l01889" name="l01889"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0"> 1889</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01890" name="l01890"></a><span class="lineno"> 1890</span> </div>
+<div class="line"><a id="l01891" name="l01891"></a><span class="lineno"> 1891</span> private:</div>
+<div class="line"><a id="l01892" name="l01892"></a><span class="lineno"> 1892</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; start_indices_;</div>
+<div class="line"><a id="l01893" name="l01893"></a><span class="lineno"> 1893</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; end_indices_;</div>
+<div class="line"><a id="l01894" name="l01894"></a><span class="lineno"> 1894</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; strides_;</div>
+<div class="line"><a id="l01895" name="l01895"></a><span class="lineno"> 1895</span> </div>
+<div class="line"><a id="l01896" name="l01896"></a><span class="lineno"> 1896</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01897" name="l01897"></a><span class="lineno"> 1897</span>};</div>
 </div>
-<div class="line"><a id="l01909" name="l01909"></a><span class="lineno"> 1909</span> </div>
-<div class="foldopen" id="foldopen01910" data-start="{" data-end="};">
-<div class="line"><a id="l01910" name="l01910"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html"> 1910</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_softmax.html">Softmax</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01911" name="l01911"></a><span class="lineno"> 1911</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01912" data-start="{" data-end="}">
-<div class="line"><a id="l01912" name="l01912"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb"> 1912</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb">Softmax</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> precise)</div>
-<div class="line"><a id="l01913" name="l01913"></a><span class="lineno"> 1913</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), precise_(precise) {}</div>
+<div class="line"><a id="l01898" name="l01898"></a><span class="lineno"> 1898</span> </div>
+<div class="foldopen" id="foldopen01899" data-start="{" data-end="};">
+<div class="line"><a id="l01899" name="l01899"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html"> 1899</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_slice_update.html">SliceUpdate</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01900" name="l01900"></a><span class="lineno"> 1900</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01901" data-start="{" data-end="}">
+<div class="line"><a id="l01901" name="l01901"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990"> 1901</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990">SliceUpdate</a>(</div>
+<div class="line"><a id="l01902" name="l01902"></a><span class="lineno"> 1902</span>      <a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream,</div>
+<div class="line"><a id="l01903" name="l01903"></a><span class="lineno"> 1903</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; start_indices,</div>
+<div class="line"><a id="l01904" name="l01904"></a><span class="lineno"> 1904</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; end_indices,</div>
+<div class="line"><a id="l01905" name="l01905"></a><span class="lineno"> 1905</span>      <span class="keyword">const</span> std::vector&lt;int&gt;&amp; strides)</div>
+<div class="line"><a id="l01906" name="l01906"></a><span class="lineno"> 1906</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream),</div>
+<div class="line"><a id="l01907" name="l01907"></a><span class="lineno"> 1907</span>        start_indices_(start_indices),</div>
+<div class="line"><a id="l01908" name="l01908"></a><span class="lineno"> 1908</span>        end_indices_(end_indices),</div>
+<div class="line"><a id="l01909" name="l01909"></a><span class="lineno"> 1909</span>        strides_(strides) {}</div>
 </div>
-<div class="line"><a id="l01914" name="l01914"></a><span class="lineno"> 1914</span> </div>
-<div class="line"><a id="l01915" name="l01915"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79"> 1915</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01916" name="l01916"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af"> 1916</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01917" name="l01917"></a><span class="lineno"> 1917</span> </div>
-<div class="line"><a id="l01918" name="l01918"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19"> 1918</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01919" name="l01919"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f"> 1919</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01920" name="l01920"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83"> 1920</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_softmax.html">Softmax</a>)</div>
-<div class="line"><a id="l01921" name="l01921"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35"> 1921</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01922" name="l01922"></a><span class="lineno"> 1922</span> </div>
-<div class="line"><a id="l01923" name="l01923"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728"> 1923</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01924" name="l01924"></a><span class="lineno"> 1924</span> </div>
-<div class="line"><a id="l01925" name="l01925"></a><span class="lineno"> 1925</span> private:</div>
-<div class="line"><a id="l01926" name="l01926"></a><span class="lineno"> 1926</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01927" name="l01927"></a><span class="lineno"> 1927</span>  <span class="keywordtype">bool</span> precise_;</div>
-<div class="line"><a id="l01928" name="l01928"></a><span class="lineno"> 1928</span>};</div>
+<div class="line"><a id="l01910" name="l01910"></a><span class="lineno"> 1910</span> </div>
+<div class="line"><a id="l01911" name="l01911"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b"> 1911</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01912" name="l01912"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b"> 1912</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01913" name="l01913"></a><span class="lineno"> 1913</span> </div>
+<div class="line"><a id="l01914" name="l01914"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3"> 1914</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01915" name="l01915"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611"> 1915</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01916" name="l01916"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b"> 1916</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_slice_update.html">SliceUpdate</a>)</div>
+<div class="line"><a id="l01917" name="l01917"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119"> 1917</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01918" name="l01918"></a><span class="lineno"> 1918</span> </div>
+<div class="line"><a id="l01919" name="l01919"></a><span class="lineno"> 1919</span> private:</div>
+<div class="line"><a id="l01920" name="l01920"></a><span class="lineno"> 1920</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; start_indices_;</div>
+<div class="line"><a id="l01921" name="l01921"></a><span class="lineno"> 1921</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; end_indices_;</div>
+<div class="line"><a id="l01922" name="l01922"></a><span class="lineno"> 1922</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; strides_;</div>
+<div class="line"><a id="l01923" name="l01923"></a><span class="lineno"> 1923</span> </div>
+<div class="line"><a id="l01924" name="l01924"></a><span class="lineno"> 1924</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01925" name="l01925"></a><span class="lineno"> 1925</span> </div>
+<div class="line"><a id="l01926" name="l01926"></a><span class="lineno"> 1926</span>  std::tuple&lt;int64_t, std::vector&lt;int64_t&gt;&gt; prepare_slice(const <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; in);</div>
+<div class="line"><a id="l01927" name="l01927"></a><span class="lineno"> 1927</span>};</div>
 </div>
-<div class="line"><a id="l01929" name="l01929"></a><span class="lineno"> 1929</span> </div>
-<div class="foldopen" id="foldopen01930" data-start="{" data-end="};">
-<div class="line"><a id="l01930" name="l01930"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html"> 1930</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sort.html">Sort</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01931" name="l01931"></a><span class="lineno"> 1931</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01932" data-start="{" data-end="}">
-<div class="line"><a id="l01932" name="l01932"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44"> 1932</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44">Sort</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">int</span> axis)</div>
-<div class="line"><a id="l01933" name="l01933"></a><span class="lineno"> 1933</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axis_(axis) {}</div>
+<div class="line"><a id="l01928" name="l01928"></a><span class="lineno"> 1928</span> </div>
+<div class="foldopen" id="foldopen01929" data-start="{" data-end="};">
+<div class="line"><a id="l01929" name="l01929"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html"> 1929</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_softmax.html">Softmax</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01930" name="l01930"></a><span class="lineno"> 1930</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01931" data-start="{" data-end="}">
+<div class="line"><a id="l01931" name="l01931"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb"> 1931</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb">Softmax</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> precise)</div>
+<div class="line"><a id="l01932" name="l01932"></a><span class="lineno"> 1932</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), precise_(precise) {}</div>
 </div>
-<div class="line"><a id="l01934" name="l01934"></a><span class="lineno"> 1934</span> </div>
-<div class="line"><a id="l01935" name="l01935"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd"> 1935</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01936" name="l01936"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382"> 1936</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01937" name="l01937"></a><span class="lineno"> 1937</span> </div>
-<div class="line"><a id="l01938" name="l01938"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c"> 1938</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01939" name="l01939"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62"> 1939</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01940" name="l01940"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2"> 1940</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sort.html">Sort</a>)</div>
-<div class="line"><a id="l01941" name="l01941"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d"> 1941</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01942" name="l01942"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511"> 1942</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01933" name="l01933"></a><span class="lineno"> 1933</span> </div>
+<div class="line"><a id="l01934" name="l01934"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79"> 1934</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01935" name="l01935"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af"> 1935</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01936" name="l01936"></a><span class="lineno"> 1936</span> </div>
+<div class="line"><a id="l01937" name="l01937"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19"> 1937</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01938" name="l01938"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f"> 1938</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01939" name="l01939"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83"> 1939</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_softmax.html">Softmax</a>)</div>
+<div class="line"><a id="l01940" name="l01940"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35"> 1940</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01941" name="l01941"></a><span class="lineno"> 1941</span> </div>
+<div class="line"><a id="l01942" name="l01942"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728"> 1942</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
 <div class="line"><a id="l01943" name="l01943"></a><span class="lineno"> 1943</span> </div>
 <div class="line"><a id="l01944" name="l01944"></a><span class="lineno"> 1944</span> private:</div>
-<div class="line"><a id="l01945" name="l01945"></a><span class="lineno"> 1945</span>  <span class="keywordtype">int</span> axis_;</div>
-<div class="line"><a id="l01946" name="l01946"></a><span class="lineno"> 1946</span> </div>
-<div class="line"><a id="l01947" name="l01947"></a><span class="lineno"> 1947</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01948" name="l01948"></a><span class="lineno"> 1948</span>};</div>
+<div class="line"><a id="l01945" name="l01945"></a><span class="lineno"> 1945</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01946" name="l01946"></a><span class="lineno"> 1946</span>  <span class="keywordtype">bool</span> precise_;</div>
+<div class="line"><a id="l01947" name="l01947"></a><span class="lineno"> 1947</span>};</div>
 </div>
-<div class="line"><a id="l01949" name="l01949"></a><span class="lineno"> 1949</span> </div>
-<div class="foldopen" id="foldopen01950" data-start="{" data-end="};">
-<div class="line"><a id="l01950" name="l01950"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html"> 1950</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_split.html">Split</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l01951" name="l01951"></a><span class="lineno"> 1951</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01952" data-start="{" data-end="}">
-<div class="line"><a id="l01952" name="l01952"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385"> 1952</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385">Split</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; indices, <span class="keywordtype">int</span> axis)</div>
-<div class="line"><a id="l01953" name="l01953"></a><span class="lineno"> 1953</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream), indices_(indices), axis_(axis) {}</div>
+<div class="line"><a id="l01948" name="l01948"></a><span class="lineno"> 1948</span> </div>
+<div class="foldopen" id="foldopen01949" data-start="{" data-end="};">
+<div class="line"><a id="l01949" name="l01949"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html"> 1949</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sort.html">Sort</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01950" name="l01950"></a><span class="lineno"> 1950</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01951" data-start="{" data-end="}">
+<div class="line"><a id="l01951" name="l01951"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44"> 1951</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44">Sort</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">int</span> axis)</div>
+<div class="line"><a id="l01952" name="l01952"></a><span class="lineno"> 1952</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axis_(axis) {}</div>
 </div>
-<div class="line"><a id="l01954" name="l01954"></a><span class="lineno"> 1954</span> </div>
-<div class="line"><a id="l01955" name="l01955"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4"> 1955</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l01956" name="l01956"></a><span class="lineno"> 1956</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01957" name="l01957"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df"> 1957</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l01958" name="l01958"></a><span class="lineno"> 1958</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01959" name="l01959"></a><span class="lineno"> 1959</span> </div>
-<div class="line"><a id="l01960" name="l01960"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6"> 1960</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01961" name="l01961"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282"> 1961</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01962" name="l01962"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2"> 1962</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_split.html">Split</a>)</div>
-<div class="line"><a id="l01963" name="l01963"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345"> 1963</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l01964" name="l01964"></a><span class="lineno"> 1964</span> </div>
-<div class="line"><a id="l01965" name="l01965"></a><span class="lineno"> 1965</span> private:</div>
-<div class="line"><a id="l01966" name="l01966"></a><span class="lineno"> 1966</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; outputs);</div>
-<div class="line"><a id="l01967" name="l01967"></a><span class="lineno"> 1967</span> </div>
-<div class="line"><a id="l01968" name="l01968"></a><span class="lineno"> 1968</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; indices_;</div>
-<div class="line"><a id="l01969" name="l01969"></a><span class="lineno"> 1969</span>  <span class="keywordtype">int</span> axis_;</div>
-<div class="line"><a id="l01970" name="l01970"></a><span class="lineno"> 1970</span>};</div>
+<div class="line"><a id="l01953" name="l01953"></a><span class="lineno"> 1953</span> </div>
+<div class="line"><a id="l01954" name="l01954"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd"> 1954</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01955" name="l01955"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382"> 1955</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01956" name="l01956"></a><span class="lineno"> 1956</span> </div>
+<div class="line"><a id="l01957" name="l01957"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c"> 1957</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01958" name="l01958"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62"> 1958</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01959" name="l01959"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2"> 1959</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_sort.html">Sort</a>)</div>
+<div class="line"><a id="l01960" name="l01960"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d"> 1960</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l01961" name="l01961"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511"> 1961</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01962" name="l01962"></a><span class="lineno"> 1962</span> </div>
+<div class="line"><a id="l01963" name="l01963"></a><span class="lineno"> 1963</span> private:</div>
+<div class="line"><a id="l01964" name="l01964"></a><span class="lineno"> 1964</span>  <span class="keywordtype">int</span> axis_;</div>
+<div class="line"><a id="l01965" name="l01965"></a><span class="lineno"> 1965</span> </div>
+<div class="line"><a id="l01966" name="l01966"></a><span class="lineno"> 1966</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l01967" name="l01967"></a><span class="lineno"> 1967</span>};</div>
 </div>
-<div class="line"><a id="l01971" name="l01971"></a><span class="lineno"> 1971</span> </div>
-<div class="foldopen" id="foldopen01972" data-start="{" data-end="};">
-<div class="line"><a id="l01972" name="l01972"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html"> 1972</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_square.html">Square</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01973" name="l01973"></a><span class="lineno"> 1973</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l01974" name="l01974"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4"> 1974</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4">Square</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l01975" name="l01975"></a><span class="lineno"> 1975</span> </div>
-<div class="line"><a id="l01976" name="l01976"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59"> 1976</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01977" name="l01977"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045"> 1977</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01968" name="l01968"></a><span class="lineno"> 1968</span> </div>
+<div class="foldopen" id="foldopen01969" data-start="{" data-end="};">
+<div class="line"><a id="l01969" name="l01969"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html"> 1969</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_split.html">Split</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l01970" name="l01970"></a><span class="lineno"> 1970</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen01971" data-start="{" data-end="}">
+<div class="line"><a id="l01971" name="l01971"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385"> 1971</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385">Split</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; indices, <span class="keywordtype">int</span> axis)</div>
+<div class="line"><a id="l01972" name="l01972"></a><span class="lineno"> 1972</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream), indices_(indices), axis_(axis) {}</div>
+</div>
+<div class="line"><a id="l01973" name="l01973"></a><span class="lineno"> 1973</span> </div>
+<div class="line"><a id="l01974" name="l01974"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4"> 1974</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l01975" name="l01975"></a><span class="lineno"> 1975</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01976" name="l01976"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df"> 1976</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l01977" name="l01977"></a><span class="lineno"> 1977</span>      <span class="keyword">override</span>;</div>
 <div class="line"><a id="l01978" name="l01978"></a><span class="lineno"> 1978</span> </div>
-<div class="line"><a id="l01979" name="l01979"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5"> 1979</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01980" name="l01980"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d"> 1980</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01981" name="l01981"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384"> 1981</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_square.html">Square</a>)</div>
-<div class="line"><a id="l01982" name="l01982"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2"> 1982</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l01983" name="l01983"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02"> 1983</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l01984" name="l01984"></a><span class="lineno"> 1984</span> </div>
-<div class="line"><a id="l01985" name="l01985"></a><span class="lineno"> 1985</span> private:</div>
-<div class="line"><a id="l01986" name="l01986"></a><span class="lineno"> 1986</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l01987" name="l01987"></a><span class="lineno"> 1987</span>};</div>
+<div class="line"><a id="l01979" name="l01979"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6"> 1979</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01980" name="l01980"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282"> 1980</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l01981" name="l01981"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2"> 1981</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_split.html">Split</a>)</div>
+<div class="line"><a id="l01982" name="l01982"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345"> 1982</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l01983" name="l01983"></a><span class="lineno"> 1983</span> </div>
+<div class="line"><a id="l01984" name="l01984"></a><span class="lineno"> 1984</span> private:</div>
+<div class="line"><a id="l01985" name="l01985"></a><span class="lineno"> 1985</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; outputs);</div>
+<div class="line"><a id="l01986" name="l01986"></a><span class="lineno"> 1986</span> </div>
+<div class="line"><a id="l01987" name="l01987"></a><span class="lineno"> 1987</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; indices_;</div>
+<div class="line"><a id="l01988" name="l01988"></a><span class="lineno"> 1988</span>  <span class="keywordtype">int</span> axis_;</div>
+<div class="line"><a id="l01989" name="l01989"></a><span class="lineno"> 1989</span>};</div>
 </div>
-<div class="line"><a id="l01988" name="l01988"></a><span class="lineno"> 1988</span> </div>
-<div class="foldopen" id="foldopen01989" data-start="{" data-end="};">
-<div class="line"><a id="l01989" name="l01989"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html"> 1989</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sqrt.html">Sqrt</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l01990" name="l01990"></a><span class="lineno"> 1990</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen01991" data-start="{" data-end="}">
-<div class="line"><a id="l01991" name="l01991"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29"> 1991</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29">Sqrt</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> recip = <span class="keyword">false</span>)</div>
-<div class="line"><a id="l01992" name="l01992"></a><span class="lineno"> 1992</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), recip_(recip) {}</div>
+<div class="line"><a id="l01990" name="l01990"></a><span class="lineno"> 1990</span> </div>
+<div class="foldopen" id="foldopen01991" data-start="{" data-end="};">
+<div class="line"><a id="l01991" name="l01991"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html"> 1991</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_square.html">Square</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l01992" name="l01992"></a><span class="lineno"> 1992</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l01993" name="l01993"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4"> 1993</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4">Square</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l01994" name="l01994"></a><span class="lineno"> 1994</span> </div>
+<div class="line"><a id="l01995" name="l01995"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59"> 1995</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01996" name="l01996"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045"> 1996</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l01997" name="l01997"></a><span class="lineno"> 1997</span> </div>
+<div class="line"><a id="l01998" name="l01998"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5"> 1998</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l01999" name="l01999"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d"> 1999</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l02000" name="l02000"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384"> 2000</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_square.html">Square</a>)</div>
+<div class="line"><a id="l02001" name="l02001"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2"> 2001</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l02002" name="l02002"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02"> 2002</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l02003" name="l02003"></a><span class="lineno"> 2003</span> </div>
+<div class="line"><a id="l02004" name="l02004"></a><span class="lineno"> 2004</span> private:</div>
+<div class="line"><a id="l02005" name="l02005"></a><span class="lineno"> 2005</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02006" name="l02006"></a><span class="lineno"> 2006</span>};</div>
 </div>
-<div class="line"><a id="l01993" name="l01993"></a><span class="lineno"> 1993</span> </div>
-<div class="line"><a id="l01994" name="l01994"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5"> 1994</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01995" name="l01995"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501"> 1995</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l01996" name="l01996"></a><span class="lineno"> 1996</span> </div>
-<div class="line"><a id="l01997" name="l01997"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e"> 1997</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l01998" name="l01998"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818"> 1998</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l01999" name="l01999"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5"> 1999</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l02000" name="l02000"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46"> 2000</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l02001" name="l02001"></a><span class="lineno"> 2001</span> </div>
-<div class="foldopen" id="foldopen02002" data-start="{" data-end="}">
-<div class="line"><a id="l02002" name="l02002"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f"> 2002</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l02003" name="l02003"></a><span class="lineno"> 2003</span>    <span class="keywordflow">if</span> (recip_) {</div>
-<div class="line"><a id="l02004" name="l02004"></a><span class="lineno"> 2004</span>      os &lt;&lt; <span class="stringliteral">&quot;Rsqrt&quot;</span>;</div>
-<div class="line"><a id="l02005" name="l02005"></a><span class="lineno"> 2005</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l02006" name="l02006"></a><span class="lineno"> 2006</span>      os &lt;&lt; <span class="stringliteral">&quot;Sqrt&quot;</span>;</div>
-<div class="line"><a id="l02007" name="l02007"></a><span class="lineno"> 2007</span>    }</div>
-<div class="line"><a id="l02008" name="l02008"></a><span class="lineno"> 2008</span>  }</div>
+<div class="line"><a id="l02007" name="l02007"></a><span class="lineno"> 2007</span> </div>
+<div class="foldopen" id="foldopen02008" data-start="{" data-end="};">
+<div class="line"><a id="l02008" name="l02008"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html"> 2008</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_sqrt.html">Sqrt</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02009" name="l02009"></a><span class="lineno"> 2009</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen02010" data-start="{" data-end="}">
+<div class="line"><a id="l02010" name="l02010"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29"> 2010</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29">Sqrt</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> recip = <span class="keyword">false</span>)</div>
+<div class="line"><a id="l02011" name="l02011"></a><span class="lineno"> 2011</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), recip_(recip) {}</div>
 </div>
-<div class="line"><a id="l02009" name="l02009"></a><span class="lineno"> 2009</span> </div>
-<div class="line"><a id="l02010" name="l02010"></a><span class="lineno"> 2010</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l02011" name="l02011"></a><span class="lineno"> 2011</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l02012" name="l02012"></a><span class="lineno"> 2012</span>  <span class="keywordtype">bool</span> recip_;</div>
-<div class="line"><a id="l02013" name="l02013"></a><span class="lineno"> 2013</span>};</div>
+<div class="line"><a id="l02012" name="l02012"></a><span class="lineno"> 2012</span> </div>
+<div class="line"><a id="l02013" name="l02013"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5"> 2013</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02014" name="l02014"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501"> 2014</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02015" name="l02015"></a><span class="lineno"> 2015</span> </div>
+<div class="line"><a id="l02016" name="l02016"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e"> 2016</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02017" name="l02017"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818"> 2017</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l02018" name="l02018"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5"> 2018</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l02019" name="l02019"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46"> 2019</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l02020" name="l02020"></a><span class="lineno"> 2020</span> </div>
+<div class="foldopen" id="foldopen02021" data-start="{" data-end="}">
+<div class="line"><a id="l02021" name="l02021"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f"> 2021</a></span>  <span class="keywordtype">void</span> print(std::ostream&amp; os)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l02022" name="l02022"></a><span class="lineno"> 2022</span>    <span class="keywordflow">if</span> (recip_) {</div>
+<div class="line"><a id="l02023" name="l02023"></a><span class="lineno"> 2023</span>      os &lt;&lt; <span class="stringliteral">&quot;Rsqrt&quot;</span>;</div>
+<div class="line"><a id="l02024" name="l02024"></a><span class="lineno"> 2024</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l02025" name="l02025"></a><span class="lineno"> 2025</span>      os &lt;&lt; <span class="stringliteral">&quot;Sqrt&quot;</span>;</div>
+<div class="line"><a id="l02026" name="l02026"></a><span class="lineno"> 2026</span>    }</div>
+<div class="line"><a id="l02027" name="l02027"></a><span class="lineno"> 2027</span>  }</div>
 </div>
-<div class="line"><a id="l02014" name="l02014"></a><span class="lineno"> 2014</span> </div>
-<div class="foldopen" id="foldopen02015" data-start="{" data-end="};">
-<div class="line"><a id="l02015" name="l02015"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html"> 2015</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_stop_gradient.html">StopGradient</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02016" name="l02016"></a><span class="lineno"> 2016</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l02017" name="l02017"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f"> 2017</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f">StopGradient</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l02018" name="l02018"></a><span class="lineno"> 2018</span> </div>
-<div class="line"><a id="l02019" name="l02019"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2"> 2019</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02020" name="l02020"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89"> 2020</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02021" name="l02021"></a><span class="lineno"> 2021</span> </div>
-<div class="line"><a id="l02022" name="l02022"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0"> 2022</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02023" name="l02023"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50"> 2023</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_stop_gradient.html">StopGradient</a>)</div>
-<div class="line"><a id="l02024" name="l02024"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3"> 2024</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l02025" name="l02025"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e"> 2025</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l02026" name="l02026"></a><span class="lineno"> 2026</span> </div>
-<div class="line"><a id="l02027" name="l02027"></a><span class="lineno"> 2027</span> private:</div>
-<div class="line"><a id="l02028" name="l02028"></a><span class="lineno"> 2028</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l02029" name="l02029"></a><span class="lineno"> 2029</span>};</div>
+<div class="line"><a id="l02028" name="l02028"></a><span class="lineno"> 2028</span> </div>
+<div class="line"><a id="l02029" name="l02029"></a><span class="lineno"> 2029</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l02030" name="l02030"></a><span class="lineno"> 2030</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02031" name="l02031"></a><span class="lineno"> 2031</span>  <span class="keywordtype">bool</span> recip_;</div>
+<div class="line"><a id="l02032" name="l02032"></a><span class="lineno"> 2032</span>};</div>
 </div>
-<div class="line"><a id="l02030" name="l02030"></a><span class="lineno"> 2030</span> </div>
-<div class="foldopen" id="foldopen02031" data-start="{" data-end="};">
-<div class="line"><a id="l02031" name="l02031"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html"> 2031</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_subtract.html">Subtract</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02032" name="l02032"></a><span class="lineno"> 2032</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l02033" name="l02033"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c"> 2033</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c">Subtract</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l02034" name="l02034"></a><span class="lineno"> 2034</span> </div>
-<div class="line"><a id="l02035" name="l02035"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12"> 2035</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02036" name="l02036"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c"> 2036</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02033" name="l02033"></a><span class="lineno"> 2033</span> </div>
+<div class="foldopen" id="foldopen02034" data-start="{" data-end="};">
+<div class="line"><a id="l02034" name="l02034"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html"> 2034</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_stop_gradient.html">StopGradient</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02035" name="l02035"></a><span class="lineno"> 2035</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l02036" name="l02036"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f"> 2036</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f">StopGradient</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
 <div class="line"><a id="l02037" name="l02037"></a><span class="lineno"> 2037</span> </div>
-<div class="line"><a id="l02038" name="l02038"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098"> 2038</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02039" name="l02039"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220"> 2039</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l02040" name="l02040"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b"> 2040</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_subtract.html">Subtract</a>)</div>
-<div class="line"><a id="l02041" name="l02041"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b"> 2041</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l02042" name="l02042"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc"> 2042</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l02043" name="l02043"></a><span class="lineno"> 2043</span> </div>
-<div class="line"><a id="l02044" name="l02044"></a><span class="lineno"> 2044</span> private:</div>
-<div class="line"><a id="l02045" name="l02045"></a><span class="lineno"> 2045</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l02046" name="l02046"></a><span class="lineno"> 2046</span>};</div>
+<div class="line"><a id="l02038" name="l02038"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2"> 2038</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02039" name="l02039"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89"> 2039</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02040" name="l02040"></a><span class="lineno"> 2040</span> </div>
+<div class="line"><a id="l02041" name="l02041"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0"> 2041</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02042" name="l02042"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50"> 2042</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_stop_gradient.html">StopGradient</a>)</div>
+<div class="line"><a id="l02043" name="l02043"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3"> 2043</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l02044" name="l02044"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e"> 2044</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l02045" name="l02045"></a><span class="lineno"> 2045</span> </div>
+<div class="line"><a id="l02046" name="l02046"></a><span class="lineno"> 2046</span> private:</div>
+<div class="line"><a id="l02047" name="l02047"></a><span class="lineno"> 2047</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02048" name="l02048"></a><span class="lineno"> 2048</span>};</div>
 </div>
-<div class="line"><a id="l02047" name="l02047"></a><span class="lineno"> 2047</span> </div>
-<div class="foldopen" id="foldopen02048" data-start="{" data-end="};">
-<div class="line"><a id="l02048" name="l02048"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html"> 2048</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_tan.html">Tan</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02049" name="l02049"></a><span class="lineno"> 2049</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l02050" name="l02050"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a8dcc9ff660210ccf05134dd95f47de08"> 2050</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tan.html#a8dcc9ff660210ccf05134dd95f47de08">Tan</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l02051" name="l02051"></a><span class="lineno"> 2051</span> </div>
-<div class="line"><a id="l02052" name="l02052"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9"> 2052</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02053" name="l02053"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f"> 2053</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02054" name="l02054"></a><span class="lineno"> 2054</span> </div>
-<div class="line"><a id="l02055" name="l02055"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7"> 2055</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02056" name="l02056"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2"> 2056</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l02057" name="l02057"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f"> 2057</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_tan.html">Tan</a>)</div>
-<div class="line"><a id="l02058" name="l02058"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4"> 2058</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l02059" name="l02059"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37"> 2059</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l02060" name="l02060"></a><span class="lineno"> 2060</span> </div>
-<div class="line"><a id="l02061" name="l02061"></a><span class="lineno"> 2061</span> private:</div>
-<div class="line"><a id="l02062" name="l02062"></a><span class="lineno"> 2062</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l02063" name="l02063"></a><span class="lineno"> 2063</span>};</div>
+<div class="line"><a id="l02049" name="l02049"></a><span class="lineno"> 2049</span> </div>
+<div class="foldopen" id="foldopen02050" data-start="{" data-end="};">
+<div class="line"><a id="l02050" name="l02050"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html"> 2050</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_subtract.html">Subtract</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02051" name="l02051"></a><span class="lineno"> 2051</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l02052" name="l02052"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c"> 2052</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c">Subtract</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l02053" name="l02053"></a><span class="lineno"> 2053</span> </div>
+<div class="line"><a id="l02054" name="l02054"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12"> 2054</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02055" name="l02055"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c"> 2055</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02056" name="l02056"></a><span class="lineno"> 2056</span> </div>
+<div class="line"><a id="l02057" name="l02057"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098"> 2057</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02058" name="l02058"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220"> 2058</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l02059" name="l02059"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b"> 2059</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_subtract.html">Subtract</a>)</div>
+<div class="line"><a id="l02060" name="l02060"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b"> 2060</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l02061" name="l02061"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc"> 2061</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l02062" name="l02062"></a><span class="lineno"> 2062</span> </div>
+<div class="line"><a id="l02063" name="l02063"></a><span class="lineno"> 2063</span> private:</div>
+<div class="line"><a id="l02064" name="l02064"></a><span class="lineno"> 2064</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02065" name="l02065"></a><span class="lineno"> 2065</span>};</div>
 </div>
-<div class="line"><a id="l02064" name="l02064"></a><span class="lineno"> 2064</span> </div>
-<div class="foldopen" id="foldopen02065" data-start="{" data-end="};">
-<div class="line"><a id="l02065" name="l02065"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html"> 2065</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_tanh.html">Tanh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02066" name="l02066"></a><span class="lineno"> 2066</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l02067" name="l02067"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#ae551297bf573e1802fb831440276dee4"> 2067</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tanh.html#ae551297bf573e1802fb831440276dee4">Tanh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l02068" name="l02068"></a><span class="lineno"> 2068</span> </div>
-<div class="line"><a id="l02069" name="l02069"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5"> 2069</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02070" name="l02070"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761"> 2070</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02071" name="l02071"></a><span class="lineno"> 2071</span> </div>
-<div class="line"><a id="l02072" name="l02072"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f"> 2072</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02073" name="l02073"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a"> 2073</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l02074" name="l02074"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e"> 2074</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_tanh.html">Tanh</a>)</div>
-<div class="line"><a id="l02075" name="l02075"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda"> 2075</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l02076" name="l02076"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325"> 2076</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
-<div class="line"><a id="l02077" name="l02077"></a><span class="lineno"> 2077</span> </div>
-<div class="line"><a id="l02078" name="l02078"></a><span class="lineno"> 2078</span> private:</div>
-<div class="line"><a id="l02079" name="l02079"></a><span class="lineno"> 2079</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l02080" name="l02080"></a><span class="lineno"> 2080</span>};</div>
-</div>
-<div class="line"><a id="l02081" name="l02081"></a><span class="lineno"> 2081</span> </div>
-<div class="foldopen" id="foldopen02082" data-start="{" data-end="};">
-<div class="line"><a id="l02082" name="l02082"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html"> 2082</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_uniform.html">Uniform</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02083" name="l02083"></a><span class="lineno"> 2083</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l02084" name="l02084"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1"> 2084</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1">Uniform</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
-<div class="line"><a id="l02085" name="l02085"></a><span class="lineno"> 2085</span> </div>
-<div class="line"><a id="l02086" name="l02086"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f"> 2086</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02087" name="l02087"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0"> 2087</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02088" name="l02088"></a><span class="lineno"> 2088</span> </div>
-<div class="line"><a id="l02089" name="l02089"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926"> 2089</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02090" name="l02090"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d"> 2090</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_uniform.html">Uniform</a>)</div>
-<div class="line"><a id="l02091" name="l02091"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b"> 2091</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
-<div class="line"><a id="l02092" name="l02092"></a><span class="lineno"> 2092</span> </div>
-<div class="line"><a id="l02093" name="l02093"></a><span class="lineno"> 2093</span> private:</div>
-<div class="line"><a id="l02094" name="l02094"></a><span class="lineno"> 2094</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
-<div class="line"><a id="l02095" name="l02095"></a><span class="lineno"> 2095</span>};</div>
+<div class="line"><a id="l02066" name="l02066"></a><span class="lineno"> 2066</span> </div>
+<div class="foldopen" id="foldopen02067" data-start="{" data-end="};">
+<div class="line"><a id="l02067" name="l02067"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html"> 2067</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_tan.html">Tan</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02068" name="l02068"></a><span class="lineno"> 2068</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l02069" name="l02069"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a8dcc9ff660210ccf05134dd95f47de08"> 2069</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tan.html#a8dcc9ff660210ccf05134dd95f47de08">Tan</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l02070" name="l02070"></a><span class="lineno"> 2070</span> </div>
+<div class="line"><a id="l02071" name="l02071"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9"> 2071</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02072" name="l02072"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f"> 2072</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02073" name="l02073"></a><span class="lineno"> 2073</span> </div>
+<div class="line"><a id="l02074" name="l02074"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7"> 2074</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02075" name="l02075"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2"> 2075</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l02076" name="l02076"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f"> 2076</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_tan.html">Tan</a>)</div>
+<div class="line"><a id="l02077" name="l02077"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4"> 2077</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l02078" name="l02078"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37"> 2078</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
+<div class="line"><a id="l02079" name="l02079"></a><span class="lineno"> 2079</span> </div>
+<div class="line"><a id="l02080" name="l02080"></a><span class="lineno"> 2080</span> private:</div>
+<div class="line"><a id="l02081" name="l02081"></a><span class="lineno"> 2081</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02082" name="l02082"></a><span class="lineno"> 2082</span>};</div>
 </div>
+<div class="line"><a id="l02083" name="l02083"></a><span class="lineno"> 2083</span> </div>
+<div class="foldopen" id="foldopen02084" data-start="{" data-end="};">
+<div class="line"><a id="l02084" name="l02084"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html"> 2084</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_tanh.html">Tanh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02085" name="l02085"></a><span class="lineno"> 2085</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l02086" name="l02086"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#ae551297bf573e1802fb831440276dee4"> 2086</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tanh.html#ae551297bf573e1802fb831440276dee4">Tanh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
+<div class="line"><a id="l02087" name="l02087"></a><span class="lineno"> 2087</span> </div>
+<div class="line"><a id="l02088" name="l02088"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5"> 2088</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02089" name="l02089"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761"> 2089</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02090" name="l02090"></a><span class="lineno"> 2090</span> </div>
+<div class="line"><a id="l02091" name="l02091"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f"> 2091</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02092" name="l02092"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a"> 2092</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l02093" name="l02093"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e"> 2093</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_tanh.html">Tanh</a>)</div>
+<div class="line"><a id="l02094" name="l02094"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda"> 2094</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l02095" name="l02095"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325"> 2095</a></span>  <a class="code hl_define" href="primitives_8h.html#a649a06267b75e007224ea4ddefedb999">DEFINE_INPUT_OUTPUT_SHAPE</a>()</div>
 <div class="line"><a id="l02096" name="l02096"></a><span class="lineno"> 2096</span> </div>
-<div class="foldopen" id="foldopen02097" data-start="{" data-end="};">
-<div class="line"><a id="l02097" name="l02097"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html"> 2097</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_view.html">View</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02098" name="l02098"></a><span class="lineno"> 2098</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen02099" data-start="{" data-end="}">
-<div class="line"><a id="l02099" name="l02099"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e"> 2099</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e">View</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype)</div>
-<div class="line"><a id="l02100" name="l02100"></a><span class="lineno"> 2100</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), dtype_(dtype) {}</div>
+<div class="line"><a id="l02097" name="l02097"></a><span class="lineno"> 2097</span> private:</div>
+<div class="line"><a id="l02098" name="l02098"></a><span class="lineno"> 2098</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02099" name="l02099"></a><span class="lineno"> 2099</span>};</div>
 </div>
-<div class="line"><a id="l02101" name="l02101"></a><span class="lineno"> 2101</span> </div>
-<div class="line"><a id="l02102" name="l02102"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497"> 2102</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02103" name="l02103"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075"> 2103</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02100" name="l02100"></a><span class="lineno"> 2100</span> </div>
+<div class="foldopen" id="foldopen02101" data-start="{" data-end="};">
+<div class="line"><a id="l02101" name="l02101"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html"> 2101</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_uniform.html">Uniform</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02102" name="l02102"></a><span class="lineno"> 2102</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l02103" name="l02103"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1"> 2103</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1">Uniform</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream) {}</div>
 <div class="line"><a id="l02104" name="l02104"></a><span class="lineno"> 2104</span> </div>
-<div class="line"><a id="l02105" name="l02105"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121"> 2105</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02106" name="l02106"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c"> 2106</a></span>  void print(std::ostream&amp; os) override;</div>
-<div class="line"><a id="l02107" name="l02107"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64"> 2107</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l02108" name="l02108"></a><span class="lineno"> 2108</span> </div>
-<div class="line"><a id="l02109" name="l02109"></a><span class="lineno"> 2109</span> private:</div>
-<div class="line"><a id="l02110" name="l02110"></a><span class="lineno"> 2110</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype_;</div>
-<div class="line"><a id="l02111" name="l02111"></a><span class="lineno"> 2111</span>};</div>
+<div class="line"><a id="l02105" name="l02105"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f"> 2105</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02106" name="l02106"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0"> 2106</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02107" name="l02107"></a><span class="lineno"> 2107</span> </div>
+<div class="line"><a id="l02108" name="l02108"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926"> 2108</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02109" name="l02109"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d"> 2109</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_uniform.html">Uniform</a>)</div>
+<div class="line"><a id="l02110" name="l02110"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b"> 2110</a></span>  <a class="code hl_define" href="primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a">DEFINE_DEFAULT_IS_EQUIVALENT</a>()</div>
+<div class="line"><a id="l02111" name="l02111"></a><span class="lineno"> 2111</span> </div>
+<div class="line"><a id="l02112" name="l02112"></a><span class="lineno"> 2112</span> private:</div>
+<div class="line"><a id="l02113" name="l02113"></a><span class="lineno"> 2113</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02114" name="l02114"></a><span class="lineno"> 2114</span>};</div>
 </div>
-<div class="line"><a id="l02112" name="l02112"></a><span class="lineno"> 2112</span> </div>
-<div class="foldopen" id="foldopen02113" data-start="{" data-end="};">
-<div class="line"><a id="l02113" name="l02113"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html"> 2113</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_transpose.html">Transpose</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02114" name="l02114"></a><span class="lineno"> 2114</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen02115" data-start="{" data-end="}">
-<div class="line"><a id="l02115" name="l02115"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a"> 2115</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a">Transpose</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes)</div>
-<div class="line"><a id="l02116" name="l02116"></a><span class="lineno"> 2116</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axes_(axes) {}</div>
+<div class="line"><a id="l02115" name="l02115"></a><span class="lineno"> 2115</span> </div>
+<div class="foldopen" id="foldopen02116" data-start="{" data-end="};">
+<div class="line"><a id="l02116" name="l02116"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html"> 2116</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_view.html">View</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02117" name="l02117"></a><span class="lineno"> 2117</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen02118" data-start="{" data-end="}">
+<div class="line"><a id="l02118" name="l02118"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e"> 2118</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e">View</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype)</div>
+<div class="line"><a id="l02119" name="l02119"></a><span class="lineno"> 2119</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), dtype_(dtype) {}</div>
 </div>
-<div class="line"><a id="l02117" name="l02117"></a><span class="lineno"> 2117</span> </div>
-<div class="line"><a id="l02118" name="l02118"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8"> 2118</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02119" name="l02119"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e"> 2119</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
 <div class="line"><a id="l02120" name="l02120"></a><span class="lineno"> 2120</span> </div>
-<div class="line"><a id="l02121" name="l02121"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe"> 2121</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02122" name="l02122"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1"> 2122</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
-<div class="line"><a id="l02123" name="l02123"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04"> 2123</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_transpose.html">Transpose</a>)</div>
-<div class="line"><a id="l02124" name="l02124"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab"> 2124</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
-<div class="line"><a id="l02125" name="l02125"></a><span class="lineno"> 2125</span> </div>
-<div class="line"><a id="l02126" name="l02126"></a><span class="lineno"> 2126</span> private:</div>
-<div class="line"><a id="l02127" name="l02127"></a><span class="lineno"> 2127</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; axes_;</div>
-<div class="line"><a id="l02128" name="l02128"></a><span class="lineno"> 2128</span> </div>
-<div class="line"><a id="l02129" name="l02129"></a><span class="lineno"> 2129</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02121" name="l02121"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497"> 2121</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02122" name="l02122"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075"> 2122</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02123" name="l02123"></a><span class="lineno"> 2123</span> </div>
+<div class="line"><a id="l02124" name="l02124"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121"> 2124</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02125" name="l02125"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c"> 2125</a></span>  void print(std::ostream&amp; os) override;</div>
+<div class="line"><a id="l02126" name="l02126"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64"> 2126</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l02127" name="l02127"></a><span class="lineno"> 2127</span> </div>
+<div class="line"><a id="l02128" name="l02128"></a><span class="lineno"> 2128</span> private:</div>
+<div class="line"><a id="l02129" name="l02129"></a><span class="lineno"> 2129</span>  <a class="code hl_struct" href="structmlx_1_1core_1_1_dtype.html">Dtype</a> dtype_;</div>
 <div class="line"><a id="l02130" name="l02130"></a><span class="lineno"> 2130</span>};</div>
 </div>
 <div class="line"><a id="l02131" name="l02131"></a><span class="lineno"> 2131</span> </div>
-<div class="line"><a id="l02132" name="l02132"></a><span class="lineno"> 2132</span><span class="comment">/* QR Factorization primitive. */</span></div>
-<div class="foldopen" id="foldopen02133" data-start="{" data-end="};">
-<div class="line"><a id="l02133" name="l02133"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html"> 2133</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_q_r_f.html">QRF</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l02134" name="l02134"></a><span class="lineno"> 2134</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l02135" name="l02135"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983"> 2135</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983">QRF</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
+<div class="foldopen" id="foldopen02132" data-start="{" data-end="};">
+<div class="line"><a id="l02132" name="l02132"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html"> 2132</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_transpose.html">Transpose</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02133" name="l02133"></a><span class="lineno"> 2133</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen02134" data-start="{" data-end="}">
+<div class="line"><a id="l02134" name="l02134"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a"> 2134</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a">Transpose</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keyword">const</span> std::vector&lt;int&gt;&amp; axes)</div>
+<div class="line"><a id="l02135" name="l02135"></a><span class="lineno"> 2135</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), axes_(axes) {}</div>
+</div>
 <div class="line"><a id="l02136" name="l02136"></a><span class="lineno"> 2136</span> </div>
-<div class="line"><a id="l02137" name="l02137"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2"> 2137</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l02138" name="l02138"></a><span class="lineno"> 2138</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02139" name="l02139"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9"> 2139</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l02140" name="l02140"></a><span class="lineno"> 2140</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02141" name="l02141"></a><span class="lineno"> 2141</span> </div>
-<div class="line"><a id="l02142" name="l02142"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b"> 2142</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_q_r_f.html">QRF</a>)</div>
-<div class="line"><a id="l02143" name="l02143"></a><span class="lineno"> 2143</span> </div>
-<div class="line"><a id="l02144" name="l02144"></a><span class="lineno"> 2144</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l02145" name="l02145"></a><span class="lineno"> 2145</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
-<div class="line"><a id="l02146" name="l02146"></a><span class="lineno"> 2146</span>};</div>
-</div>
+<div class="line"><a id="l02137" name="l02137"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8"> 2137</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02138" name="l02138"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e"> 2138</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02139" name="l02139"></a><span class="lineno"> 2139</span> </div>
+<div class="line"><a id="l02140" name="l02140"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe"> 2140</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02141" name="l02141"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1"> 2141</a></span>  <a class="code hl_define" href="primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6">DEFINE_GRADS</a>()</div>
+<div class="line"><a id="l02142" name="l02142"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04"> 2142</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_transpose.html">Transpose</a>)</div>
+<div class="line"><a id="l02143" name="l02143"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab"> 2143</a></span>  <span class="keywordtype">bool</span> is_equivalent(const <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other) const override;</div>
+<div class="line"><a id="l02144" name="l02144"></a><span class="lineno"> 2144</span> </div>
+<div class="line"><a id="l02145" name="l02145"></a><span class="lineno"> 2145</span> private:</div>
+<div class="line"><a id="l02146" name="l02146"></a><span class="lineno"> 2146</span>  std::vector&lt;<span class="keywordtype">int</span>&gt; axes_;</div>
 <div class="line"><a id="l02147" name="l02147"></a><span class="lineno"> 2147</span> </div>
-<div class="line"><a id="l02148" name="l02148"></a><span class="lineno"> 2148</span><span class="comment">/* SVD primitive. */</span></div>
-<div class="foldopen" id="foldopen02149" data-start="{" data-end="};">
-<div class="line"><a id="l02149" name="l02149"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html"> 2149</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_s_v_d.html">SVD</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l02150" name="l02150"></a><span class="lineno"> 2150</span> <span class="keyword">public</span>:</div>
-<div class="line"><a id="l02151" name="l02151"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1"> 2151</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1">SVD</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
-<div class="line"><a id="l02152" name="l02152"></a><span class="lineno"> 2152</span> </div>
-<div class="line"><a id="l02153" name="l02153"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6"> 2153</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l02154" name="l02154"></a><span class="lineno"> 2154</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02155" name="l02155"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83"> 2155</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l02156" name="l02156"></a><span class="lineno"> 2156</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02157" name="l02157"></a><span class="lineno"> 2157</span> </div>
-<div class="line"><a id="l02158" name="l02158"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8"> 2158</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02159" name="l02159"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53"> 2159</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_s_v_d.html">SVD</a>)</div>
+<div class="line"><a id="l02148" name="l02148"></a><span class="lineno"> 2148</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out);</div>
+<div class="line"><a id="l02149" name="l02149"></a><span class="lineno"> 2149</span>};</div>
+</div>
+<div class="line"><a id="l02150" name="l02150"></a><span class="lineno"> 2150</span> </div>
+<div class="line"><a id="l02151" name="l02151"></a><span class="lineno"> 2151</span><span class="comment">/* QR Factorization primitive. */</span></div>
+<div class="foldopen" id="foldopen02152" data-start="{" data-end="};">
+<div class="line"><a id="l02152" name="l02152"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html"> 2152</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_q_r_f.html">QRF</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l02153" name="l02153"></a><span class="lineno"> 2153</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l02154" name="l02154"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983"> 2154</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983">QRF</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
+<div class="line"><a id="l02155" name="l02155"></a><span class="lineno"> 2155</span> </div>
+<div class="line"><a id="l02156" name="l02156"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2"> 2156</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l02157" name="l02157"></a><span class="lineno"> 2157</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02158" name="l02158"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9"> 2158</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l02159" name="l02159"></a><span class="lineno"> 2159</span>      <span class="keyword">override</span>;</div>
 <div class="line"><a id="l02160" name="l02160"></a><span class="lineno"> 2160</span> </div>
-<div class="line"><a id="l02161" name="l02161"></a><span class="lineno"> 2161</span> private:</div>
-<div class="line"><a id="l02162" name="l02162"></a><span class="lineno"> 2162</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; outputs);</div>
-<div class="line"><a id="l02163" name="l02163"></a><span class="lineno"> 2163</span>};</div>
+<div class="line"><a id="l02161" name="l02161"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b"> 2161</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_q_r_f.html">QRF</a>)</div>
+<div class="line"><a id="l02162" name="l02162"></a><span class="lineno"> 2162</span> </div>
+<div class="line"><a id="l02163" name="l02163"></a><span class="lineno"> 2163</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l02164" name="l02164"></a><span class="lineno"> 2164</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
+<div class="line"><a id="l02165" name="l02165"></a><span class="lineno"> 2165</span>};</div>
 </div>
-<div class="line"><a id="l02164" name="l02164"></a><span class="lineno"> 2164</span> </div>
-<div class="line"><a id="l02165" name="l02165"></a><span class="lineno"> 2165</span><span class="comment">/* Matrix inversion primitive. */</span></div>
-<div class="foldopen" id="foldopen02166" data-start="{" data-end="};">
-<div class="line"><a id="l02166" name="l02166"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html"> 2166</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_inverse.html">Inverse</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02167" name="l02167"></a><span class="lineno"> 2167</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen02168" data-start="{" data-end="}">
-<div class="line"><a id="l02168" name="l02168"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad"> 2168</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad">Inverse</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> tri, <span class="keywordtype">bool</span> upper)</div>
-<div class="line"><a id="l02169" name="l02169"></a><span class="lineno"> 2169</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), tri_(<a class="code hl_function" href="group__ops.html#ga4f3389e5b89e70e862e7d2b40d6c7f78">tri</a>), upper_(upper) {}</div>
-</div>
-<div class="line"><a id="l02170" name="l02170"></a><span class="lineno"> 2170</span> </div>
-<div class="line"><a id="l02171" name="l02171"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81"> 2171</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02172" name="l02172"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2"> 2172</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02173" name="l02173"></a><span class="lineno"> 2173</span> </div>
-<div class="line"><a id="l02174" name="l02174"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2"> 2174</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02175" name="l02175"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9"> 2175</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_inverse.html">Inverse</a>)</div>
+<div class="line"><a id="l02166" name="l02166"></a><span class="lineno"> 2166</span> </div>
+<div class="line"><a id="l02167" name="l02167"></a><span class="lineno"> 2167</span><span class="comment">/* SVD primitive. */</span></div>
+<div class="foldopen" id="foldopen02168" data-start="{" data-end="};">
+<div class="line"><a id="l02168" name="l02168"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html"> 2168</a></span><span class="keyword">class </span><a class="code hl_class" href="classmlx_1_1core_1_1_s_v_d.html">SVD</a> : <span class="keyword">public</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l02169" name="l02169"></a><span class="lineno"> 2169</span> <span class="keyword">public</span>:</div>
+<div class="line"><a id="l02170" name="l02170"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1"> 2170</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1">SVD</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream) : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream) {}</div>
+<div class="line"><a id="l02171" name="l02171"></a><span class="lineno"> 2171</span> </div>
+<div class="line"><a id="l02172" name="l02172"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6"> 2172</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l02173" name="l02173"></a><span class="lineno"> 2173</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02174" name="l02174"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83"> 2174</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l02175" name="l02175"></a><span class="lineno"> 2175</span>      <span class="keyword">override</span>;</div>
 <div class="line"><a id="l02176" name="l02176"></a><span class="lineno"> 2176</span> </div>
-<div class="line"><a id="l02177" name="l02177"></a><span class="lineno"> 2177</span> private:</div>
-<div class="line"><a id="l02178" name="l02178"></a><span class="lineno"> 2178</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output);</div>
-<div class="line"><a id="l02179" name="l02179"></a><span class="lineno"> 2179</span>  <span class="keywordtype">bool</span> tri_;</div>
-<div class="line"><a id="l02180" name="l02180"></a><span class="lineno"> 2180</span>  <span class="keywordtype">bool</span> upper_;</div>
-<div class="line"><a id="l02181" name="l02181"></a><span class="lineno"> 2181</span>};</div>
+<div class="line"><a id="l02177" name="l02177"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8"> 2177</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02178" name="l02178"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53"> 2178</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_s_v_d.html">SVD</a>)</div>
+<div class="line"><a id="l02179" name="l02179"></a><span class="lineno"> 2179</span> </div>
+<div class="line"><a id="l02180" name="l02180"></a><span class="lineno"> 2180</span> private:</div>
+<div class="line"><a id="l02181" name="l02181"></a><span class="lineno"> 2181</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; outputs);</div>
+<div class="line"><a id="l02182" name="l02182"></a><span class="lineno"> 2182</span>};</div>
 </div>
-<div class="line"><a id="l02182" name="l02182"></a><span class="lineno"> 2182</span> </div>
-<div class="foldopen" id="foldopen02183" data-start="{" data-end="};">
-<div class="line"><a id="l02183" name="l02183"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html"> 2183</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_cholesky.html">Cholesky</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
-<div class="line"><a id="l02184" name="l02184"></a><span class="lineno"> 2184</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen02185" data-start="{" data-end="}">
-<div class="line"><a id="l02185" name="l02185"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab"> 2185</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab">Cholesky</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> upper)</div>
-<div class="line"><a id="l02186" name="l02186"></a><span class="lineno"> 2186</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), upper_(upper) {}</div>
+<div class="line"><a id="l02183" name="l02183"></a><span class="lineno"> 2183</span> </div>
+<div class="line"><a id="l02184" name="l02184"></a><span class="lineno"> 2184</span><span class="comment">/* Matrix inversion primitive. */</span></div>
+<div class="foldopen" id="foldopen02185" data-start="{" data-end="};">
+<div class="line"><a id="l02185" name="l02185"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html"> 2185</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_inverse.html">Inverse</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02186" name="l02186"></a><span class="lineno"> 2186</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen02187" data-start="{" data-end="}">
+<div class="line"><a id="l02187" name="l02187"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad"> 2187</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad">Inverse</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> tri, <span class="keywordtype">bool</span> upper)</div>
+<div class="line"><a id="l02188" name="l02188"></a><span class="lineno"> 2188</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), tri_(<a class="code hl_function" href="group__ops.html#ga4f3389e5b89e70e862e7d2b40d6c7f78">tri</a>), upper_(upper) {}</div>
 </div>
-<div class="line"><a id="l02187" name="l02187"></a><span class="lineno"> 2187</span> </div>
-<div class="line"><a id="l02188" name="l02188"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5"> 2188</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02189" name="l02189"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795"> 2189</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02190" name="l02190"></a><span class="lineno"> 2190</span> </div>
-<div class="line"><a id="l02191" name="l02191"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5"> 2191</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02192" name="l02192"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84"> 2192</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_cholesky.html">Cholesky</a>)</div>
-<div class="line"><a id="l02193" name="l02193"></a><span class="lineno"> 2193</span> </div>
-<div class="line"><a id="l02194" name="l02194"></a><span class="lineno"> 2194</span> private:</div>
-<div class="line"><a id="l02195" name="l02195"></a><span class="lineno"> 2195</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output);</div>
-<div class="line"><a id="l02196" name="l02196"></a><span class="lineno"> 2196</span>  <span class="keywordtype">bool</span> upper_;</div>
-<div class="line"><a id="l02197" name="l02197"></a><span class="lineno"> 2197</span>};</div>
+<div class="line"><a id="l02189" name="l02189"></a><span class="lineno"> 2189</span> </div>
+<div class="line"><a id="l02190" name="l02190"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81"> 2190</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02191" name="l02191"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2"> 2191</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02192" name="l02192"></a><span class="lineno"> 2192</span> </div>
+<div class="line"><a id="l02193" name="l02193"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2"> 2193</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02194" name="l02194"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9"> 2194</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_inverse.html">Inverse</a>)</div>
+<div class="line"><a id="l02195" name="l02195"></a><span class="lineno"> 2195</span> </div>
+<div class="line"><a id="l02196" name="l02196"></a><span class="lineno"> 2196</span> private:</div>
+<div class="line"><a id="l02197" name="l02197"></a><span class="lineno"> 2197</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output);</div>
+<div class="line"><a id="l02198" name="l02198"></a><span class="lineno"> 2198</span>  <span class="keywordtype">bool</span> tri_;</div>
+<div class="line"><a id="l02199" name="l02199"></a><span class="lineno"> 2199</span>  <span class="keywordtype">bool</span> upper_;</div>
+<div class="line"><a id="l02200" name="l02200"></a><span class="lineno"> 2200</span>};</div>
 </div>
-<div class="line"><a id="l02198" name="l02198"></a><span class="lineno"> 2198</span> </div>
-<div class="foldopen" id="foldopen02199" data-start="{" data-end="};">
-<div class="line"><a id="l02199" name="l02199"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html"> 2199</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_eigh.html">Eigh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
-<div class="line"><a id="l02200" name="l02200"></a><span class="lineno"> 2200</span> <span class="keyword">public</span>:</div>
-<div class="foldopen" id="foldopen02201" data-start="{" data-end="}">
-<div class="line"><a id="l02201" name="l02201"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#ad8f5d012ebd5942abeffecca77fcddda"> 2201</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#ad8f5d012ebd5942abeffecca77fcddda">Eigh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, std::string uplo, <span class="keywordtype">bool</span> compute_eigenvectors)</div>
-<div class="line"><a id="l02202" name="l02202"></a><span class="lineno"> 2202</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream),</div>
-<div class="line"><a id="l02203" name="l02203"></a><span class="lineno"> 2203</span>        uplo_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(uplo)),</div>
-<div class="line"><a id="l02204" name="l02204"></a><span class="lineno"> 2204</span>        compute_eigenvectors_(compute_eigenvectors) {}</div>
+<div class="line"><a id="l02201" name="l02201"></a><span class="lineno"> 2201</span> </div>
+<div class="foldopen" id="foldopen02202" data-start="{" data-end="};">
+<div class="line"><a id="l02202" name="l02202"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html"> 2202</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_cholesky.html">Cholesky</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a> {</div>
+<div class="line"><a id="l02203" name="l02203"></a><span class="lineno"> 2203</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen02204" data-start="{" data-end="}">
+<div class="line"><a id="l02204" name="l02204"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab"> 2204</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab">Cholesky</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, <span class="keywordtype">bool</span> upper)</div>
+<div class="line"><a id="l02205" name="l02205"></a><span class="lineno"> 2205</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_unary_primitive.html">UnaryPrimitive</a>(stream), upper_(upper) {}</div>
 </div>
-<div class="line"><a id="l02205" name="l02205"></a><span class="lineno"> 2205</span> </div>
-<div class="line"><a id="l02206" name="l02206"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be"> 2206</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l02207" name="l02207"></a><span class="lineno"> 2207</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02208" name="l02208"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2"> 2208</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
-<div class="line"><a id="l02209" name="l02209"></a><span class="lineno"> 2209</span>      <span class="keyword">override</span>;</div>
-<div class="line"><a id="l02210" name="l02210"></a><span class="lineno"> 2210</span> </div>
-<div class="line"><a id="l02211" name="l02211"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f"> 2211</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
-<div class="line"><a id="l02212" name="l02212"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84"> 2212</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_eigh.html">Eigh</a>)</div>
-<div class="line"><a id="l02213" name="l02213"></a><span class="lineno"> 2213</span> </div>
-<div class="foldopen" id="foldopen02214" data-start="{" data-end="}">
-<div class="line"><a id="l02214" name="l02214"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5"> 2214</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
-<div class="line"><a id="l02215" name="l02215"></a><span class="lineno"> 2215</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs)<span class="keyword"> override </span>{</div>
-<div class="line"><a id="l02216" name="l02216"></a><span class="lineno"> 2216</span>    <span class="keyword">auto</span> shape = inputs[0].shape();</div>
-<div class="line"><a id="l02217" name="l02217"></a><span class="lineno"> 2217</span>    shape.pop_back(); <span class="comment">// Remove last dimension for eigenvalues</span></div>
-<div class="line"><a id="l02218" name="l02218"></a><span class="lineno"> 2218</span>    <span class="keywordflow">if</span> (compute_eigenvectors_) {</div>
-<div class="line"><a id="l02219" name="l02219"></a><span class="lineno"> 2219</span>      <span class="keywordflow">return</span> {shape, inputs[0].shape()}; <span class="comment">// Eigenvalues and eigenvectors</span></div>
-<div class="line"><a id="l02220" name="l02220"></a><span class="lineno"> 2220</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l02221" name="l02221"></a><span class="lineno"> 2221</span>      <span class="keywordflow">return</span> {shape}; <span class="comment">// Only eigenvalues</span></div>
-<div class="line"><a id="l02222" name="l02222"></a><span class="lineno"> 2222</span>    }</div>
-<div class="line"><a id="l02223" name="l02223"></a><span class="lineno"> 2223</span>  }</div>
+<div class="line"><a id="l02206" name="l02206"></a><span class="lineno"> 2206</span> </div>
+<div class="line"><a id="l02207" name="l02207"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5"> 2207</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02208" name="l02208"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795"> 2208</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; out) <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02209" name="l02209"></a><span class="lineno"> 2209</span> </div>
+<div class="line"><a id="l02210" name="l02210"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5"> 2210</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02211" name="l02211"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84"> 2211</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_cholesky.html">Cholesky</a>)</div>
+<div class="line"><a id="l02212" name="l02212"></a><span class="lineno"> 2212</span> </div>
+<div class="line"><a id="l02213" name="l02213"></a><span class="lineno"> 2213</span> private:</div>
+<div class="line"><a id="l02214" name="l02214"></a><span class="lineno"> 2214</span>  <span class="keywordtype">void</span> eval(const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs, <a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&amp; output);</div>
+<div class="line"><a id="l02215" name="l02215"></a><span class="lineno"> 2215</span>  <span class="keywordtype">bool</span> upper_;</div>
+<div class="line"><a id="l02216" name="l02216"></a><span class="lineno"> 2216</span>};</div>
+</div>
+<div class="line"><a id="l02217" name="l02217"></a><span class="lineno"> 2217</span> </div>
+<div class="foldopen" id="foldopen02218" data-start="{" data-end="};">
+<div class="line"><a id="l02218" name="l02218"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html"> 2218</a></span>class <a class="code hl_class" href="classmlx_1_1core_1_1_eigh.html">Eigh</a> : public <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a> {</div>
+<div class="line"><a id="l02219" name="l02219"></a><span class="lineno"> 2219</span> <span class="keyword">public</span>:</div>
+<div class="foldopen" id="foldopen02220" data-start="{" data-end="}">
+<div class="line"><a id="l02220" name="l02220"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#ad8f5d012ebd5942abeffecca77fcddda"> 2220</a></span>  <span class="keyword">explicit</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#ad8f5d012ebd5942abeffecca77fcddda">Eigh</a>(<a class="code hl_struct" href="structmlx_1_1core_1_1_stream.html">Stream</a> stream, std::string uplo, <span class="keywordtype">bool</span> compute_eigenvectors)</div>
+<div class="line"><a id="l02221" name="l02221"></a><span class="lineno"> 2221</span>      : <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>(stream),</div>
+<div class="line"><a id="l02222" name="l02222"></a><span class="lineno"> 2222</span>        uplo_(<a class="code hl_function" href="group__ops.html#ga2a466024f8061febc0a64be557644cb0">std</a>::move(uplo)),</div>
+<div class="line"><a id="l02223" name="l02223"></a><span class="lineno"> 2223</span>        compute_eigenvectors_(compute_eigenvectors) {}</div>
 </div>
 <div class="line"><a id="l02224" name="l02224"></a><span class="lineno"> 2224</span> </div>
-<div class="foldopen" id="foldopen02225" data-start="{" data-end="}">
-<div class="line"><a id="l02225" name="l02225"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381"> 2225</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other)<span class="keyword"> const override </span>{</div>
-<div class="line"><a id="l02226" name="l02226"></a><span class="lineno"> 2226</span>    <span class="keywordflow">if</span> (<span class="keyword">auto</span>* p = <span class="keyword">dynamic_cast&lt;</span><span class="keyword">const </span><a class="code hl_class" href="classmlx_1_1core_1_1_eigh.html">Eigh</a>*<span class="keyword">&gt;</span>(&amp;other)) {</div>
-<div class="line"><a id="l02227" name="l02227"></a><span class="lineno"> 2227</span>      <span class="keywordflow">return</span> uplo_ == p-&gt;uplo_ &amp;&amp;</div>
-<div class="line"><a id="l02228" name="l02228"></a><span class="lineno"> 2228</span>          compute_eigenvectors_ == p-&gt;compute_eigenvectors_;</div>
-<div class="line"><a id="l02229" name="l02229"></a><span class="lineno"> 2229</span>    }</div>
-<div class="line"><a id="l02230" name="l02230"></a><span class="lineno"> 2230</span>    <span class="keywordflow">return</span> <span class="keyword">false</span>;</div>
-<div class="line"><a id="l02231" name="l02231"></a><span class="lineno"> 2231</span>  }</div>
-</div>
+<div class="line"><a id="l02225" name="l02225"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be"> 2225</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be">eval_cpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l02226" name="l02226"></a><span class="lineno"> 2226</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02227" name="l02227"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2"> 2227</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2">eval_gpu</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs)</div>
+<div class="line"><a id="l02228" name="l02228"></a><span class="lineno"> 2228</span>      <span class="keyword">override</span>;</div>
+<div class="line"><a id="l02229" name="l02229"></a><span class="lineno"> 2229</span> </div>
+<div class="line"><a id="l02230" name="l02230"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f"> 2230</a></span>  <a class="code hl_define" href="primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd">DEFINE_VMAP</a>()</div>
+<div class="line"><a id="l02231" name="l02231"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84"> 2231</a></span>  <a class="code hl_define" href="primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592">DEFINE_PRINT</a>(<a class="code hl_class" href="classmlx_1_1core_1_1_eigh.html">Eigh</a>)</div>
 <div class="line"><a id="l02232" name="l02232"></a><span class="lineno"> 2232</span> </div>
-<div class="line"><a id="l02233" name="l02233"></a><span class="lineno"> 2233</span> <span class="keyword">private</span>:</div>
-<div class="line"><a id="l02234" name="l02234"></a><span class="lineno"> 2234</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
-<div class="line"><a id="l02235" name="l02235"></a><span class="lineno"> 2235</span>  std::string uplo_;</div>
-<div class="line"><a id="l02236" name="l02236"></a><span class="lineno"> 2236</span>  <span class="keywordtype">bool</span> compute_eigenvectors_;</div>
-<div class="line"><a id="l02237" name="l02237"></a><span class="lineno"> 2237</span>};</div>
+<div class="foldopen" id="foldopen02233" data-start="{" data-end="}">
+<div class="line"><a id="l02233" name="l02233"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5"> 2233</a></span>  std::vector&lt;std::vector&lt;<span class="keywordtype">int</span>&gt;&gt; output_shapes(</div>
+<div class="line"><a id="l02234" name="l02234"></a><span class="lineno"> 2234</span>      const std::vector&lt;<a class="code hl_class" href="classmlx_1_1core_1_1array.html">array</a>&gt;&amp; inputs)<span class="keyword"> override </span>{</div>
+<div class="line"><a id="l02235" name="l02235"></a><span class="lineno"> 2235</span>    <span class="keyword">auto</span> shape = inputs[0].shape();</div>
+<div class="line"><a id="l02236" name="l02236"></a><span class="lineno"> 2236</span>    shape.pop_back(); <span class="comment">// Remove last dimension for eigenvalues</span></div>
+<div class="line"><a id="l02237" name="l02237"></a><span class="lineno"> 2237</span>    <span class="keywordflow">if</span> (compute_eigenvectors_) {</div>
+<div class="line"><a id="l02238" name="l02238"></a><span class="lineno"> 2238</span>      <span class="keywordflow">return</span> {shape, inputs[0].shape()}; <span class="comment">// Eigenvalues and eigenvectors</span></div>
+<div class="line"><a id="l02239" name="l02239"></a><span class="lineno"> 2239</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l02240" name="l02240"></a><span class="lineno"> 2240</span>      <span class="keywordflow">return</span> {shape}; <span class="comment">// Only eigenvalues</span></div>
+<div class="line"><a id="l02241" name="l02241"></a><span class="lineno"> 2241</span>    }</div>
+<div class="line"><a id="l02242" name="l02242"></a><span class="lineno"> 2242</span>  }</div>
 </div>
-<div class="line"><a id="l02238" name="l02238"></a><span class="lineno"> 2238</span> </div>
-<div class="line"><a id="l02239" name="l02239"></a><span class="lineno"> 2239</span>} <span class="comment">// namespace mlx::core</span></div>
+<div class="line"><a id="l02243" name="l02243"></a><span class="lineno"> 2243</span> </div>
+<div class="foldopen" id="foldopen02244" data-start="{" data-end="}">
+<div class="line"><a id="l02244" name="l02244"></a><span class="lineno"><a class="line" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381"> 2244</a></span>  <span class="keywordtype">bool</span> <a class="code hl_function" href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">is_equivalent</a>(<span class="keyword">const</span> <a class="code hl_class" href="classmlx_1_1core_1_1_primitive.html">Primitive</a>&amp; other)<span class="keyword"> const override </span>{</div>
+<div class="line"><a id="l02245" name="l02245"></a><span class="lineno"> 2245</span>    <span class="keywordflow">if</span> (<span class="keyword">auto</span>* p = <span class="keyword">dynamic_cast&lt;</span><span class="keyword">const </span><a class="code hl_class" href="classmlx_1_1core_1_1_eigh.html">Eigh</a>*<span class="keyword">&gt;</span>(&amp;other)) {</div>
+<div class="line"><a id="l02246" name="l02246"></a><span class="lineno"> 2246</span>      <span class="keywordflow">return</span> uplo_ == p-&gt;uplo_ &amp;&amp;</div>
+<div class="line"><a id="l02247" name="l02247"></a><span class="lineno"> 2247</span>          compute_eigenvectors_ == p-&gt;compute_eigenvectors_;</div>
+<div class="line"><a id="l02248" name="l02248"></a><span class="lineno"> 2248</span>    }</div>
+<div class="line"><a id="l02249" name="l02249"></a><span class="lineno"> 2249</span>    <span class="keywordflow">return</span> <span class="keyword">false</span>;</div>
+<div class="line"><a id="l02250" name="l02250"></a><span class="lineno"> 2250</span>  }</div>
+</div>
+<div class="line"><a id="l02251" name="l02251"></a><span class="lineno"> 2251</span> </div>
+<div class="line"><a id="l02252" name="l02252"></a><span class="lineno"> 2252</span> <span class="keyword">private</span>:</div>
+<div class="line"><a id="l02253" name="l02253"></a><span class="lineno"> 2253</span>  <span class="keywordtype">void</span> <a class="code hl_function" href="namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299">eval</a>(<span class="keyword">const</span> std::vector&lt;array&gt;&amp; inputs, std::vector&lt;array&gt;&amp; outputs);</div>
+<div class="line"><a id="l02254" name="l02254"></a><span class="lineno"> 2254</span>  std::string uplo_;</div>
+<div class="line"><a id="l02255" name="l02255"></a><span class="lineno"> 2255</span>  <span class="keywordtype">bool</span> compute_eigenvectors_;</div>
+<div class="line"><a id="l02256" name="l02256"></a><span class="lineno"> 2256</span>};</div>
+</div>
+<div class="line"><a id="l02257" name="l02257"></a><span class="lineno"> 2257</span> </div>
+<div class="line"><a id="l02258" name="l02258"></a><span class="lineno"> 2258</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_abs_html"><div class="ttname"><a href="classmlx_1_1core_1_1_abs.html">mlx::core::Abs</a></div><div class="ttdef"><b>Definition</b> primitives.h:155</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_abs_html_a0a976e636dd8505b473fbdddf949f514"><div class="ttname"><a href="classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514">mlx::core::Abs::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
@@ -2717,9 +2740,9 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1_ceil_html_a9791801fff3f8b79944e15ac2a45a035"><div class="ttname"><a href="classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035">mlx::core::Ceil::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_ceil_html_abe178e0058e44b6618be414215e96887"><div class="ttname"><a href="classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887">mlx::core::Ceil::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_ceil_html_aede38610ca25429f229301546bc9b682"><div class="ttname"><a href="classmlx_1_1core_1_1_ceil.html#aede38610ca25429f229301546bc9b682">mlx::core::Ceil::Ceil</a></div><div class="ttdeci">Ceil(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:549</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_cholesky_html"><div class="ttname"><a href="classmlx_1_1core_1_1_cholesky.html">mlx::core::Cholesky</a></div><div class="ttdef"><b>Definition</b> primitives.h:2183</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_cholesky_html"><div class="ttname"><a href="classmlx_1_1core_1_1_cholesky.html">mlx::core::Cholesky</a></div><div class="ttdef"><b>Definition</b> primitives.h:2202</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_cholesky_html_a4bdec36c1cc99aadf9a4a39d4c57bea5"><div class="ttname"><a href="classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5">mlx::core::Cholesky::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_cholesky_html_a6ae2e30b85f99f4f0d7f14c7949818ab"><div class="ttname"><a href="classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab">mlx::core::Cholesky::Cholesky</a></div><div class="ttdeci">Cholesky(Stream stream, bool upper)</div><div class="ttdef"><b>Definition</b> primitives.h:2185</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_cholesky_html_a6ae2e30b85f99f4f0d7f14c7949818ab"><div class="ttname"><a href="classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab">mlx::core::Cholesky::Cholesky</a></div><div class="ttdeci">Cholesky(Stream stream, bool upper)</div><div class="ttdef"><b>Definition</b> primitives.h:2204</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_cholesky_html_a8c918594bf129888044ef37fcae56795"><div class="ttname"><a href="classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795">mlx::core::Cholesky::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_compiled_html"><div class="ttname"><a href="classmlx_1_1core_1_1_compiled.html">mlx::core::Compiled</a></div><div class="ttdef"><b>Definition</b> primitives.h:564</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_compiled_html_a2d8cefff835c419a48a077d306b8e051"><div class="ttname"><a href="classmlx_1_1core_1_1_compiled.html#a2d8cefff835c419a48a077d306b8e051">mlx::core::Compiled::Compiled</a></div><div class="ttdeci">Compiled(Stream stream, std::vector&lt; array &gt; inputs, std::vector&lt; array &gt; outputs, std::vector&lt; array &gt; tape, std::unordered_set&lt; uintptr_t &gt; constant_ids)</div></div>
@@ -2733,79 +2756,83 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1_conjugate_html_a627f9e6a8729fb3ffb3ca3228d007c87"><div class="ttname"><a href="classmlx_1_1core_1_1_conjugate.html#a627f9e6a8729fb3ffb3ca3228d007c87">mlx::core::Conjugate::Conjugate</a></div><div class="ttdeci">Conjugate(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:628</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_conjugate_html_ae39643e2178f442ffba05139f8609d61"><div class="ttname"><a href="classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61">mlx::core::Conjugate::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_conjugate_html_aff0a802166e3724db88ab5d3feb2d3de"><div class="ttname"><a href="classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de">mlx::core::Conjugate::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_convolution_html"><div class="ttname"><a href="classmlx_1_1core_1_1_convolution.html">mlx::core::Convolution</a></div><div class="ttdef"><b>Definition</b> primitives.h:642</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_contiguous_html"><div class="ttname"><a href="classmlx_1_1core_1_1_contiguous.html">mlx::core::Contiguous</a></div><div class="ttdef"><b>Definition</b> primitives.h:642</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_contiguous_html_a3e83f414c02ae0b92a50b6f8e402e1c0"><div class="ttname"><a href="classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0">mlx::core::Contiguous::Contiguous</a></div><div class="ttdeci">Contiguous(Stream stream, bool allow_col_major)</div><div class="ttdef"><b>Definition</b> primitives.h:644</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_contiguous_html_a519cd16fd0c55b371ea7625fbb37c70f"><div class="ttname"><a href="classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f">mlx::core::Contiguous::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_contiguous_html_a742de24e6c0310cd85a606dec0cd8336"><div class="ttname"><a href="classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336">mlx::core::Contiguous::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_convolution_html"><div class="ttname"><a href="classmlx_1_1core_1_1_convolution.html">mlx::core::Convolution</a></div><div class="ttdef"><b>Definition</b> primitives.h:661</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_convolution_html_a30b64109eeb1778f002b99447dff9dd2"><div class="ttname"><a href="classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2">mlx::core::Convolution::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_convolution_html_a6f1de77b719bb13217b0d8c64cabb8ef"><div class="ttname"><a href="classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef">mlx::core::Convolution::Convolution</a></div><div class="ttdeci">Convolution(Stream stream, const std::vector&lt; int &gt; &amp;kernel_strides, const std::vector&lt; int &gt; &amp;padding, const std::vector&lt; int &gt; &amp;kernel_dilation, const std::vector&lt; int &gt; &amp;input_dilation, const int groups=1, const bool flip=false)</div><div class="ttdef"><b>Definition</b> primitives.h:644</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_convolution_html_a6f1de77b719bb13217b0d8c64cabb8ef"><div class="ttname"><a href="classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef">mlx::core::Convolution::Convolution</a></div><div class="ttdeci">Convolution(Stream stream, const std::vector&lt; int &gt; &amp;kernel_strides, const std::vector&lt; int &gt; &amp;padding, const std::vector&lt; int &gt; &amp;kernel_dilation, const std::vector&lt; int &gt; &amp;input_dilation, const int groups=1, const bool flip=false)</div><div class="ttdef"><b>Definition</b> primitives.h:663</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_convolution_html_ac74256068da01730629109fa4fa8432b"><div class="ttname"><a href="classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b">mlx::core::Convolution::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_convolution_html_af8eb9c0c055ad20aa74b547016917690"><div class="ttname"><a href="classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690">mlx::core::Convolution::vjp</a></div><div class="ttdeci">std::vector&lt; array &gt; vjp(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotangents, const std::vector&lt; int &gt; &amp;argnums, const std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">The vector-Jacobian product.</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_copy_html"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html">mlx::core::Copy</a></div><div class="ttdef"><b>Definition</b> primitives.h:683</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_copy_html"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html">mlx::core::Copy</a></div><div class="ttdef"><b>Definition</b> primitives.h:702</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_copy_html_a1eda7b2ea771a168f67421f0d384b3a1"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1">mlx::core::Copy::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_copy_html_a6243e044af119105ffaaed7d405cd584"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584">mlx::core::Copy::Copy</a></div><div class="ttdeci">Copy(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:685</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_copy_html_a6243e044af119105ffaaed7d405cd584"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584">mlx::core::Copy::Copy</a></div><div class="ttdeci">Copy(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:704</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_copy_html_af4a0ebec423e84ffe8083a5e9ed0d70c"><div class="ttname"><a href="classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c">mlx::core::Copy::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_cos_html"><div class="ttname"><a href="classmlx_1_1core_1_1_cos.html">mlx::core::Cos</a></div><div class="ttdef"><b>Definition</b> primitives.h:700</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_cos_html"><div class="ttname"><a href="classmlx_1_1core_1_1_cos.html">mlx::core::Cos</a></div><div class="ttdef"><b>Definition</b> primitives.h:719</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_cos_html_a061fc446268fe56237ae6b20ccf78152"><div class="ttname"><a href="classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152">mlx::core::Cos::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_cos_html_a2acb9fcf0901462189c476756fd99995"><div class="ttname"><a href="classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995">mlx::core::Cos::Cos</a></div><div class="ttdeci">Cos(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:702</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_cos_html_a2acb9fcf0901462189c476756fd99995"><div class="ttname"><a href="classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995">mlx::core::Cos::Cos</a></div><div class="ttdeci">Cos(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:721</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_cos_html_a5ef41aafad595f6cdd8c535e36e12060"><div class="ttname"><a href="classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060">mlx::core::Cos::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_cosh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_cosh.html">mlx::core::Cosh</a></div><div class="ttdef"><b>Definition</b> primitives.h:717</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_cosh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_cosh.html">mlx::core::Cosh</a></div><div class="ttdef"><b>Definition</b> primitives.h:736</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_cosh_html_a23f71b43792934c3ec0ebe9b74f32559"><div class="ttname"><a href="classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559">mlx::core::Cosh::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_cosh_html_a44e8ac2e09a55ec32e9dc6641eedc8f1"><div class="ttname"><a href="classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1">mlx::core::Cosh::Cosh</a></div><div class="ttdeci">Cosh(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:719</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_cosh_html_a44e8ac2e09a55ec32e9dc6641eedc8f1"><div class="ttname"><a href="classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1">mlx::core::Cosh::Cosh</a></div><div class="ttdeci">Cosh(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:738</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_cosh_html_ae8702df7e8f0e20cbeccb2a548961d3d"><div class="ttname"><a href="classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d">mlx::core::Cosh::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_custom_transforms_html"><div class="ttname"><a href="classmlx_1_1core_1_1_custom_transforms.html">mlx::core::CustomTransforms</a></div><div class="ttdef"><b>Definition</b> primitives.h:734</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_custom_transforms_html"><div class="ttname"><a href="classmlx_1_1core_1_1_custom_transforms.html">mlx::core::CustomTransforms</a></div><div class="ttdef"><b>Definition</b> primitives.h:753</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_custom_transforms_html_a7b3538681acbb20af3ed37b0877f6667"><div class="ttname"><a href="classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667">mlx::core::CustomTransforms::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_custom_transforms_html_ab52abadb9c6f6db83d087c7b751be488"><div class="ttname"><a href="classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488">mlx::core::CustomTransforms::CustomTransforms</a></div><div class="ttdeci">CustomTransforms(Stream stream, int num_outputs, std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; array &gt; &amp;, const std::vector&lt; array &gt; &amp;)&gt; vjp, std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; array &gt; &amp;, const std::vector&lt; int &gt; &amp;)&gt; jvp, std::function&lt; std::pair&lt; std::vector&lt; array &gt;, std::vector&lt; int &gt; &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; int &gt; &amp;)&gt; vmap)</div><div class="ttdef"><b>Definition</b> primitives.h:736</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_custom_transforms_html_ab52abadb9c6f6db83d087c7b751be488"><div class="ttname"><a href="classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488">mlx::core::CustomTransforms::CustomTransforms</a></div><div class="ttdeci">CustomTransforms(Stream stream, int num_outputs, std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; array &gt; &amp;, const std::vector&lt; array &gt; &amp;)&gt; vjp, std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; array &gt; &amp;, const std::vector&lt; int &gt; &amp;)&gt; jvp, std::function&lt; std::pair&lt; std::vector&lt; array &gt;, std::vector&lt; int &gt; &gt;(const std::vector&lt; array &gt; &amp;, const std::vector&lt; int &gt; &amp;)&gt; vmap)</div><div class="ttdef"><b>Definition</b> primitives.h:755</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_custom_transforms_html_adba1c40c77a2138df6b5f75483f62184"><div class="ttname"><a href="classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184">mlx::core::CustomTransforms::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_depends_html"><div class="ttname"><a href="classmlx_1_1core_1_1_depends.html">mlx::core::Depends</a></div><div class="ttdef"><b>Definition</b> primitives.h:786</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_depends_html"><div class="ttname"><a href="classmlx_1_1core_1_1_depends.html">mlx::core::Depends</a></div><div class="ttdef"><b>Definition</b> primitives.h:805</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_depends_html_a02996fa45f01f7cb9f37074d5f8ccab0"><div class="ttname"><a href="classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0">mlx::core::Depends::vjp</a></div><div class="ttdeci">std::vector&lt; array &gt; vjp(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotan, const std::vector&lt; int &gt; &amp;argnums, const std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">The vector-Jacobian product.</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_depends_html_a0c7ea6db97337591fa53c6e6bde41e5e"><div class="ttname"><a href="classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e">mlx::core::Depends::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_depends_html_a4ccb792c99f5d8d133d3fac29f7d3f62"><div class="ttname"><a href="classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62">mlx::core::Depends::Depends</a></div><div class="ttdeci">Depends(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:788</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_depends_html_a4ccb792c99f5d8d133d3fac29f7d3f62"><div class="ttname"><a href="classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62">mlx::core::Depends::Depends</a></div><div class="ttdeci">Depends(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:807</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_depends_html_ae5057f65e69490ad0add8eeda2b75e28"><div class="ttname"><a href="classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28">mlx::core::Depends::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_div_mod_html"><div class="ttname"><a href="classmlx_1_1core_1_1_div_mod.html">mlx::core::DivMod</a></div><div class="ttdef"><b>Definition</b> primitives.h:824</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_div_mod_html"><div class="ttname"><a href="classmlx_1_1core_1_1_div_mod.html">mlx::core::DivMod</a></div><div class="ttdef"><b>Definition</b> primitives.h:843</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_div_mod_html_a003117c9ecf3c06a27248f72a76348dc"><div class="ttname"><a href="classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc">mlx::core::DivMod::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_div_mod_html_a859e3b6149cdceab1c7ccfd2246fb826"><div class="ttname"><a href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">mlx::core::DivMod::DivMod</a></div><div class="ttdeci">DivMod(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:826</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_div_mod_html_a859e3b6149cdceab1c7ccfd2246fb826"><div class="ttname"><a href="classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826">mlx::core::DivMod::DivMod</a></div><div class="ttdeci">DivMod(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:845</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_div_mod_html_ae350b7b93ad128e3133ee14f247193b3"><div class="ttname"><a href="classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3">mlx::core::DivMod::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_divide_html"><div class="ttname"><a href="classmlx_1_1core_1_1_divide.html">mlx::core::Divide</a></div><div class="ttdef"><b>Definition</b> primitives.h:807</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_divide_html_a62fc71e8998be65ff18285dbbd21eedb"><div class="ttname"><a href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">mlx::core::Divide::Divide</a></div><div class="ttdeci">Divide(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:809</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_divide_html"><div class="ttname"><a href="classmlx_1_1core_1_1_divide.html">mlx::core::Divide</a></div><div class="ttdef"><b>Definition</b> primitives.h:826</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_divide_html_a62fc71e8998be65ff18285dbbd21eedb"><div class="ttname"><a href="classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb">mlx::core::Divide::Divide</a></div><div class="ttdeci">Divide(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:828</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_divide_html_a823443c2a8e8b81bbcaeee6ddbcdbf49"><div class="ttname"><a href="classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49">mlx::core::Divide::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_divide_html_abffda0ce37221ddc28dc9eea794f6bc7"><div class="ttname"><a href="classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7">mlx::core::Divide::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html">mlx::core::Eigh</a></div><div class="ttdef"><b>Definition</b> primitives.h:2199</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html_a09414e3fe88a952408d164d6dd0af381"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh::is_equivalent</a></div><div class="ttdeci">bool is_equivalent(const Primitive &amp;other) const override</div><div class="ttdoc">Equivalence check defaults to false unless overridden by the primitive.</div><div class="ttdef"><b>Definition</b> primitives.h:2225</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html">mlx::core::Eigh</a></div><div class="ttdef"><b>Definition</b> primitives.h:2218</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html_a09414e3fe88a952408d164d6dd0af381"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381">mlx::core::Eigh::is_equivalent</a></div><div class="ttdeci">bool is_equivalent(const Primitive &amp;other) const override</div><div class="ttdoc">Equivalence check defaults to false unless overridden by the primitive.</div><div class="ttdef"><b>Definition</b> primitives.h:2244</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html_a67775b41c0a15e356f08d51d9736baa2"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2">mlx::core::Eigh::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html_a894b32e17229394f6a43b4a0655fd8be"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be">mlx::core::Eigh::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html_ad8f5d012ebd5942abeffecca77fcddda"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html#ad8f5d012ebd5942abeffecca77fcddda">mlx::core::Eigh::Eigh</a></div><div class="ttdeci">Eigh(Stream stream, std::string uplo, bool compute_eigenvectors)</div><div class="ttdef"><b>Definition</b> primitives.h:2201</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_equal.html">mlx::core::Equal</a></div><div class="ttdef"><b>Definition</b> primitives.h:880</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_equal_html_a4af81cf2dd071db5bbf8ce1df95fdf36"><div class="ttname"><a href="classmlx_1_1core_1_1_equal.html#a4af81cf2dd071db5bbf8ce1df95fdf36">mlx::core::Equal::Equal</a></div><div class="ttdeci">Equal(Stream stream, bool equal_nan=false)</div><div class="ttdef"><b>Definition</b> primitives.h:882</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_eigh_html_ad8f5d012ebd5942abeffecca77fcddda"><div class="ttname"><a href="classmlx_1_1core_1_1_eigh.html#ad8f5d012ebd5942abeffecca77fcddda">mlx::core::Eigh::Eigh</a></div><div class="ttdeci">Eigh(Stream stream, std::string uplo, bool compute_eigenvectors)</div><div class="ttdef"><b>Definition</b> primitives.h:2220</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_equal.html">mlx::core::Equal</a></div><div class="ttdef"><b>Definition</b> primitives.h:899</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_equal_html_a4af81cf2dd071db5bbf8ce1df95fdf36"><div class="ttname"><a href="classmlx_1_1core_1_1_equal.html#a4af81cf2dd071db5bbf8ce1df95fdf36">mlx::core::Equal::Equal</a></div><div class="ttdeci">Equal(Stream stream, bool equal_nan=false)</div><div class="ttdef"><b>Definition</b> primitives.h:901</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_equal_html_aabb8aa61fa581defddcdca1274b1b454"><div class="ttname"><a href="classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454">mlx::core::Equal::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_equal_html_ac3757001fec42ceb5ece2954df42161c"><div class="ttname"><a href="classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c">mlx::core::Equal::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_erf_html"><div class="ttname"><a href="classmlx_1_1core_1_1_erf.html">mlx::core::Erf</a></div><div class="ttdef"><b>Definition</b> primitives.h:906</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_erf_html_a702f76f848928d8d7d3d0881ac6e4c82"><div class="ttname"><a href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">mlx::core::Erf::Erf</a></div><div class="ttdeci">Erf(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:908</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_erf_html"><div class="ttname"><a href="classmlx_1_1core_1_1_erf.html">mlx::core::Erf</a></div><div class="ttdef"><b>Definition</b> primitives.h:925</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_erf_html_a702f76f848928d8d7d3d0881ac6e4c82"><div class="ttname"><a href="classmlx_1_1core_1_1_erf.html#a702f76f848928d8d7d3d0881ac6e4c82">mlx::core::Erf::Erf</a></div><div class="ttdeci">Erf(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:927</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_erf_html_a84ea16e43d5b7f83bbc2d5ece78a3fb6"><div class="ttname"><a href="classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6">mlx::core::Erf::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_erf_html_ad8551be664d767dccc3c0d8cc1eca008"><div class="ttname"><a href="classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008">mlx::core::Erf::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_erf_inv_html"><div class="ttname"><a href="classmlx_1_1core_1_1_erf_inv.html">mlx::core::ErfInv</a></div><div class="ttdef"><b>Definition</b> primitives.h:923</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_erf_inv_html"><div class="ttname"><a href="classmlx_1_1core_1_1_erf_inv.html">mlx::core::ErfInv</a></div><div class="ttdef"><b>Definition</b> primitives.h:942</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_erf_inv_html_a4a2413d0634db1f3dae1806ddfa632db"><div class="ttname"><a href="classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db">mlx::core::ErfInv::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_erf_inv_html_a5d0279247b67da4592311559f04e1478"><div class="ttname"><a href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">mlx::core::ErfInv::ErfInv</a></div><div class="ttdeci">ErfInv(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:925</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_erf_inv_html_a5d0279247b67da4592311559f04e1478"><div class="ttname"><a href="classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478">mlx::core::ErfInv::ErfInv</a></div><div class="ttdeci">ErfInv(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:944</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_erf_inv_html_af579627402af3249565134884701d39e"><div class="ttname"><a href="classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e">mlx::core::ErfInv::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_exp_html"><div class="ttname"><a href="classmlx_1_1core_1_1_exp.html">mlx::core::Exp</a></div><div class="ttdef"><b>Definition</b> primitives.h:940</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_exp_html_a1d0a618cbb91ab29ef53b57ff6ed6e06"><div class="ttname"><a href="classmlx_1_1core_1_1_exp.html#a1d0a618cbb91ab29ef53b57ff6ed6e06">mlx::core::Exp::Exp</a></div><div class="ttdeci">Exp(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:942</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_exp_html"><div class="ttname"><a href="classmlx_1_1core_1_1_exp.html">mlx::core::Exp</a></div><div class="ttdef"><b>Definition</b> primitives.h:959</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_exp_html_a1d0a618cbb91ab29ef53b57ff6ed6e06"><div class="ttname"><a href="classmlx_1_1core_1_1_exp.html#a1d0a618cbb91ab29ef53b57ff6ed6e06">mlx::core::Exp::Exp</a></div><div class="ttdeci">Exp(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:961</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_exp_html_a47934c5a5023bc7ae7ae89bff45ebb2c"><div class="ttname"><a href="classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c">mlx::core::Exp::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_exp_html_a7d63695a97a14760fd33b5d4e6590822"><div class="ttname"><a href="classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822">mlx::core::Exp::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_expm1_html"><div class="ttname"><a href="classmlx_1_1core_1_1_expm1.html">mlx::core::Expm1</a></div><div class="ttdef"><b>Definition</b> primitives.h:957</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_expm1_html_a47c2a1b2a4ef6bb07ba77c55ddddaec2"><div class="ttname"><a href="classmlx_1_1core_1_1_expm1.html#a47c2a1b2a4ef6bb07ba77c55ddddaec2">mlx::core::Expm1::Expm1</a></div><div class="ttdeci">Expm1(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:959</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_expm1_html"><div class="ttname"><a href="classmlx_1_1core_1_1_expm1.html">mlx::core::Expm1</a></div><div class="ttdef"><b>Definition</b> primitives.h:976</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_expm1_html_a47c2a1b2a4ef6bb07ba77c55ddddaec2"><div class="ttname"><a href="classmlx_1_1core_1_1_expm1.html#a47c2a1b2a4ef6bb07ba77c55ddddaec2">mlx::core::Expm1::Expm1</a></div><div class="ttdeci">Expm1(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:978</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_expm1_html_a82930071f4b77d883b300f77966aff5f"><div class="ttname"><a href="classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f">mlx::core::Expm1::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_expm1_html_ab9c8b7aa50fe4592d55f8957baac647a"><div class="ttname"><a href="classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a">mlx::core::Expm1::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_f_f_t_html"><div class="ttname"><a href="classmlx_1_1core_1_1_f_f_t.html">mlx::core::FFT</a></div><div class="ttdef"><b>Definition</b> primitives.h:973</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_f_f_t_html_a0cdce626ed2c8eeeecc6949418437839"><div class="ttname"><a href="classmlx_1_1core_1_1_f_f_t.html#a0cdce626ed2c8eeeecc6949418437839">mlx::core::FFT::FFT</a></div><div class="ttdeci">FFT(Stream stream, const std::vector&lt; size_t &gt; &amp;axes, bool inverse, bool real)</div><div class="ttdef"><b>Definition</b> primitives.h:975</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_f_f_t_html"><div class="ttname"><a href="classmlx_1_1core_1_1_f_f_t.html">mlx::core::FFT</a></div><div class="ttdef"><b>Definition</b> primitives.h:992</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_f_f_t_html_a0cdce626ed2c8eeeecc6949418437839"><div class="ttname"><a href="classmlx_1_1core_1_1_f_f_t.html#a0cdce626ed2c8eeeecc6949418437839">mlx::core::FFT::FFT</a></div><div class="ttdeci">FFT(Stream stream, const std::vector&lt; size_t &gt; &amp;axes, bool inverse, bool real)</div><div class="ttdef"><b>Definition</b> primitives.h:994</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_f_f_t_html_a1c21b26d1e9ad7c4da78ae845721b2dd"><div class="ttname"><a href="classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd">mlx::core::FFT::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_f_f_t_html_a6bc262a0c2b5d4fe655e3e2e0ff28635"><div class="ttname"><a href="classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635">mlx::core::FFT::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_floor_html"><div class="ttname"><a href="classmlx_1_1core_1_1_floor.html">mlx::core::Floor</a></div><div class="ttdef"><b>Definition</b> primitives.h:999</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_floor_html"><div class="ttname"><a href="classmlx_1_1core_1_1_floor.html">mlx::core::Floor</a></div><div class="ttdef"><b>Definition</b> primitives.h:1018</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_floor_html_a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7"><div class="ttname"><a href="classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7">mlx::core::Floor::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_floor_html_aaa29c83538099eb8f951c95a41f2eb65"><div class="ttname"><a href="classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65">mlx::core::Floor::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_floor_html_ada4e979b784b732696313d7094e91340"><div class="ttname"><a href="classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340">mlx::core::Floor::Floor</a></div><div class="ttdeci">Floor(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1001</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_full_html"><div class="ttname"><a href="classmlx_1_1core_1_1_full.html">mlx::core::Full</a></div><div class="ttdef"><b>Definition</b> primitives.h:1016</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_floor_html_ada4e979b784b732696313d7094e91340"><div class="ttname"><a href="classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340">mlx::core::Floor::Floor</a></div><div class="ttdeci">Floor(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1020</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_full_html"><div class="ttname"><a href="classmlx_1_1core_1_1_full.html">mlx::core::Full</a></div><div class="ttdef"><b>Definition</b> primitives.h:1035</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_full_html_a3dccd3756599d7fd018b2af0093b082c"><div class="ttname"><a href="classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c">mlx::core::Full::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_full_html_aa54f99bb4cba12a551392dea56003872"><div class="ttname"><a href="classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872">mlx::core::Full::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_full_html_aafcb86a2e41353853ec48c717e0c54d6"><div class="ttname"><a href="classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6">mlx::core::Full::Full</a></div><div class="ttdeci">Full(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1018</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_gather_html"><div class="ttname"><a href="classmlx_1_1core_1_1_gather.html">mlx::core::Gather</a></div><div class="ttdef"><b>Definition</b> primitives.h:1032</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_gather_html_a5b5f47ceff1d43477c87be5116f261d0"><div class="ttname"><a href="classmlx_1_1core_1_1_gather.html#a5b5f47ceff1d43477c87be5116f261d0">mlx::core::Gather::Gather</a></div><div class="ttdeci">Gather(Stream stream, const std::vector&lt; int &gt; &amp;axes, const std::vector&lt; int &gt; &amp;slice_sizes)</div><div class="ttdef"><b>Definition</b> primitives.h:1034</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_full_html_aafcb86a2e41353853ec48c717e0c54d6"><div class="ttname"><a href="classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6">mlx::core::Full::Full</a></div><div class="ttdeci">Full(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1037</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_gather_html"><div class="ttname"><a href="classmlx_1_1core_1_1_gather.html">mlx::core::Gather</a></div><div class="ttdef"><b>Definition</b> primitives.h:1051</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_gather_html_a5b5f47ceff1d43477c87be5116f261d0"><div class="ttname"><a href="classmlx_1_1core_1_1_gather.html#a5b5f47ceff1d43477c87be5116f261d0">mlx::core::Gather::Gather</a></div><div class="ttdeci">Gather(Stream stream, const std::vector&lt; int &gt; &amp;axes, const std::vector&lt; int &gt; &amp;slice_sizes)</div><div class="ttdef"><b>Definition</b> primitives.h:1053</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_html_a9ed5587f0d04b59a2b9186c0aac21290"><div class="ttname"><a href="classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290">mlx::core::Gather::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_html_aec48ee529cb2449915a7b27a3c4361e8"><div class="ttname"><a href="classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8">mlx::core::Gather::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_m_m_html"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_m_m.html">mlx::core::GatherMM</a></div><div class="ttdef"><b>Definition</b> primitives.h:508</div></div>
@@ -2813,107 +2840,107 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_m_m_html_a76c9f27c57354f6230b43944882e1bda"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda">mlx::core::GatherMM::vjp</a></div><div class="ttdeci">std::vector&lt; array &gt; vjp(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotangents, const std::vector&lt; int &gt; &amp;argnums, const std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">The vector-Jacobian product.</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_m_m_html_ad754c35f460a055cc383ad93a5f72da1"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1">mlx::core::GatherMM::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_m_m_html_afd9bbc08138181b80e2fb86536ff3f2a"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_m_m.html#afd9bbc08138181b80e2fb86536ff3f2a">mlx::core::GatherMM::GatherMM</a></div><div class="ttdeci">GatherMM(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:510</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_gather_q_m_m_html"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_q_m_m.html">mlx::core::GatherQMM</a></div><div class="ttdef"><b>Definition</b> primitives.h:1535</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_gather_q_m_m_html_a60ed2ade7f10dd9c9314913a810f9360"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360">mlx::core::GatherQMM::GatherQMM</a></div><div class="ttdeci">GatherQMM(Stream stream, int group_size, int bits, bool transpose)</div><div class="ttdef"><b>Definition</b> primitives.h:1537</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_gather_q_m_m_html"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_q_m_m.html">mlx::core::GatherQMM</a></div><div class="ttdef"><b>Definition</b> primitives.h:1554</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_gather_q_m_m_html_a60ed2ade7f10dd9c9314913a810f9360"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360">mlx::core::GatherQMM::GatherQMM</a></div><div class="ttdeci">GatherQMM(Stream stream, int group_size, int bits, bool transpose)</div><div class="ttdef"><b>Definition</b> primitives.h:1556</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_q_m_m_html_a86eb048afc95646b2e96ec5493e3d887"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887">mlx::core::GatherQMM::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_gather_q_m_m_html_a89aae98bfbdd6563df44ef7d70f0bf8c"><div class="ttname"><a href="classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c">mlx::core::GatherQMM::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_greater_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_greater_equal.html">mlx::core::GreaterEqual</a></div><div class="ttdef"><b>Definition</b> primitives.h:1071</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_greater_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_greater_equal.html">mlx::core::GreaterEqual</a></div><div class="ttdef"><b>Definition</b> primitives.h:1090</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_greater_equal_html_a15469125b9bea89b64bfeac01590c075"><div class="ttname"><a href="classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075">mlx::core::GreaterEqual::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_greater_equal_html_a19a3c49d5a9b40e17da0e56ef6908527"><div class="ttname"><a href="classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527">mlx::core::GreaterEqual::GreaterEqual</a></div><div class="ttdeci">GreaterEqual(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1073</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_greater_equal_html_a19a3c49d5a9b40e17da0e56ef6908527"><div class="ttname"><a href="classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527">mlx::core::GreaterEqual::GreaterEqual</a></div><div class="ttdeci">GreaterEqual(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1092</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_greater_equal_html_ac246263b4548126c3d4ab7e392575d24"><div class="ttname"><a href="classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24">mlx::core::GreaterEqual::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_greater_html"><div class="ttname"><a href="classmlx_1_1core_1_1_greater.html">mlx::core::Greater</a></div><div class="ttdef"><b>Definition</b> primitives.h:1054</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_greater_html_a1d5992a66c020cd97a70e8e3d8cd1a1b"><div class="ttname"><a href="classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b">mlx::core::Greater::Greater</a></div><div class="ttdeci">Greater(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1056</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_greater_html"><div class="ttname"><a href="classmlx_1_1core_1_1_greater.html">mlx::core::Greater</a></div><div class="ttdef"><b>Definition</b> primitives.h:1073</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_greater_html_a1d5992a66c020cd97a70e8e3d8cd1a1b"><div class="ttname"><a href="classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b">mlx::core::Greater::Greater</a></div><div class="ttdeci">Greater(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1075</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_greater_html_abe1c03f311d0e0b610f3392a6566f2ae"><div class="ttname"><a href="classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae">mlx::core::Greater::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_greater_html_ae8957cccf4c924d941f57a1bb751c878"><div class="ttname"><a href="classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878">mlx::core::Greater::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_hadamard_html"><div class="ttname"><a href="classmlx_1_1core_1_1_hadamard.html">mlx::core::Hadamard</a></div><div class="ttdef"><b>Definition</b> primitives.h:1088</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_hadamard_html"><div class="ttname"><a href="classmlx_1_1core_1_1_hadamard.html">mlx::core::Hadamard</a></div><div class="ttdef"><b>Definition</b> primitives.h:1107</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_hadamard_html_a2470feb690f5463138490763c38b5733"><div class="ttname"><a href="classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733">mlx::core::Hadamard::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_hadamard_html_ab27d6a9df42b3aab41ace3073a4c880d"><div class="ttname"><a href="classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d">mlx::core::Hadamard::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_hadamard_html_abe4a0ed820b126940beec519d4239923"><div class="ttname"><a href="classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923">mlx::core::Hadamard::Hadamard</a></div><div class="ttdeci">Hadamard(Stream stream, float scale)</div><div class="ttdef"><b>Definition</b> primitives.h:1090</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_imag_html"><div class="ttname"><a href="classmlx_1_1core_1_1_imag.html">mlx::core::Imag</a></div><div class="ttdef"><b>Definition</b> primitives.h:1109</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_hadamard_html_abe4a0ed820b126940beec519d4239923"><div class="ttname"><a href="classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923">mlx::core::Hadamard::Hadamard</a></div><div class="ttdeci">Hadamard(Stream stream, float scale)</div><div class="ttdef"><b>Definition</b> primitives.h:1109</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_imag_html"><div class="ttname"><a href="classmlx_1_1core_1_1_imag.html">mlx::core::Imag</a></div><div class="ttdef"><b>Definition</b> primitives.h:1128</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_imag_html_a17d1f1f9f8528668fcdf39b636720829"><div class="ttname"><a href="classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829">mlx::core::Imag::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_imag_html_a247a4d059b0a99678c6be8c15e42c1e6"><div class="ttname"><a href="classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6">mlx::core::Imag::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_imag_html_a284b7de34a316110fdc98e7b753372b2"><div class="ttname"><a href="classmlx_1_1core_1_1_imag.html#a284b7de34a316110fdc98e7b753372b2">mlx::core::Imag::Imag</a></div><div class="ttdeci">Imag(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1111</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_inverse_html"><div class="ttname"><a href="classmlx_1_1core_1_1_inverse.html">mlx::core::Inverse</a></div><div class="ttdef"><b>Definition</b> primitives.h:2166</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_imag_html_a284b7de34a316110fdc98e7b753372b2"><div class="ttname"><a href="classmlx_1_1core_1_1_imag.html#a284b7de34a316110fdc98e7b753372b2">mlx::core::Imag::Imag</a></div><div class="ttdeci">Imag(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1130</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_inverse_html"><div class="ttname"><a href="classmlx_1_1core_1_1_inverse.html">mlx::core::Inverse</a></div><div class="ttdef"><b>Definition</b> primitives.h:2185</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_inverse_html_a086fbbc947ad232e01686ad063a78ed2"><div class="ttname"><a href="classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2">mlx::core::Inverse::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;output) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_inverse_html_a71467681e523abb725724490bfeb76ad"><div class="ttname"><a href="classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad">mlx::core::Inverse::Inverse</a></div><div class="ttdeci">Inverse(Stream stream, bool tri, bool upper)</div><div class="ttdef"><b>Definition</b> primitives.h:2168</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_inverse_html_a71467681e523abb725724490bfeb76ad"><div class="ttname"><a href="classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad">mlx::core::Inverse::Inverse</a></div><div class="ttdeci">Inverse(Stream stream, bool tri, bool upper)</div><div class="ttdef"><b>Definition</b> primitives.h:2187</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_inverse_html_aeb1d8dc9bc4052a616023f65b3c7bb81"><div class="ttname"><a href="classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81">mlx::core::Inverse::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;output) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_less_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_less_equal.html">mlx::core::LessEqual</a></div><div class="ttdef"><b>Definition</b> primitives.h:1140</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_less_equal_html_a52492a43224d47e7851beec646c27bbc"><div class="ttname"><a href="classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc">mlx::core::LessEqual::LessEqual</a></div><div class="ttdeci">LessEqual(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1142</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_less_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_less_equal.html">mlx::core::LessEqual</a></div><div class="ttdef"><b>Definition</b> primitives.h:1159</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_less_equal_html_a52492a43224d47e7851beec646c27bbc"><div class="ttname"><a href="classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc">mlx::core::LessEqual::LessEqual</a></div><div class="ttdeci">LessEqual(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1161</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_less_equal_html_a55d1352b0e97841a92503bc57c19ed16"><div class="ttname"><a href="classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16">mlx::core::LessEqual::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_less_equal_html_acf035a82b11e6f63742143ea540fedac"><div class="ttname"><a href="classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac">mlx::core::LessEqual::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_less_html"><div class="ttname"><a href="classmlx_1_1core_1_1_less.html">mlx::core::Less</a></div><div class="ttdef"><b>Definition</b> primitives.h:1123</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_less_html"><div class="ttname"><a href="classmlx_1_1core_1_1_less.html">mlx::core::Less</a></div><div class="ttdef"><b>Definition</b> primitives.h:1142</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_less_html_a32624124ffece066f496b3299056bcef"><div class="ttname"><a href="classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef">mlx::core::Less::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_less_html_a353335ce06ddbe8498d86d129c835917"><div class="ttname"><a href="classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917">mlx::core::Less::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_less_html_aa55c5cfbab0ac30e1b72c080fe9525d7"><div class="ttname"><a href="classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7">mlx::core::Less::Less</a></div><div class="ttdeci">Less(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1125</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_load_html"><div class="ttname"><a href="classmlx_1_1core_1_1_load.html">mlx::core::Load</a></div><div class="ttdef"><b>Definition</b> primitives.h:1157</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_less_html_aa55c5cfbab0ac30e1b72c080fe9525d7"><div class="ttname"><a href="classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7">mlx::core::Less::Less</a></div><div class="ttdeci">Less(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1144</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_load_html"><div class="ttname"><a href="classmlx_1_1core_1_1_load.html">mlx::core::Load</a></div><div class="ttdef"><b>Definition</b> primitives.h:1176</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_load_html_a06933e887ea94a4d01d81195c5e07a3d"><div class="ttname"><a href="classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d">mlx::core::Load::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_load_html_a3aa8a537cd90bab048df47dca1ed526a"><div class="ttname"><a href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a">mlx::core::Load::Load</a></div><div class="ttdeci">Load(Stream stream, std::shared_ptr&lt; io::Reader &gt; reader, size_t offset, bool swap_endianness=false)</div><div class="ttdef"><b>Definition</b> primitives.h:1159</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_load_html_a3aa8a537cd90bab048df47dca1ed526a"><div class="ttname"><a href="classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a">mlx::core::Load::Load</a></div><div class="ttdeci">Load(Stream stream, std::shared_ptr&lt; io::Reader &gt; reader, size_t offset, bool swap_endianness=false)</div><div class="ttdef"><b>Definition</b> primitives.h:1178</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_load_html_ada026ac30566f3109d8182e35d307c0a"><div class="ttname"><a href="classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a">mlx::core::Load::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_log1p_html"><div class="ttname"><a href="classmlx_1_1core_1_1_log1p.html">mlx::core::Log1p</a></div><div class="ttdef"><b>Definition</b> primitives.h:1223</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_log1p_html"><div class="ttname"><a href="classmlx_1_1core_1_1_log1p.html">mlx::core::Log1p</a></div><div class="ttdef"><b>Definition</b> primitives.h:1242</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_log1p_html_a1b97decae7338d46874e736c95fa7431"><div class="ttname"><a href="classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431">mlx::core::Log1p::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_log1p_html_a8192e5438de99c4cda056987935cba23"><div class="ttname"><a href="classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23">mlx::core::Log1p::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_log1p_html_ab0d6eb90c6f98775fce56f3446ff127a"><div class="ttname"><a href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a">mlx::core::Log1p::Log1p</a></div><div class="ttdeci">Log1p(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1225</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_log_add_exp_html"><div class="ttname"><a href="classmlx_1_1core_1_1_log_add_exp.html">mlx::core::LogAddExp</a></div><div class="ttdef"><b>Definition</b> primitives.h:1290</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_log1p_html_ab0d6eb90c6f98775fce56f3446ff127a"><div class="ttname"><a href="classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a">mlx::core::Log1p::Log1p</a></div><div class="ttdeci">Log1p(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1244</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_log_add_exp_html"><div class="ttname"><a href="classmlx_1_1core_1_1_log_add_exp.html">mlx::core::LogAddExp</a></div><div class="ttdef"><b>Definition</b> primitives.h:1309</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_log_add_exp_html_abef17fb590b1a8d356f2a580e45d41f0"><div class="ttname"><a href="classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0">mlx::core::LogAddExp::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_log_add_exp_html_acace355b62ec00df649f9f99e8f2eb7a"><div class="ttname"><a href="classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a">mlx::core::LogAddExp::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_log_add_exp_html_ad8938ca90ccf1a3259973fc68902975a"><div class="ttname"><a href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a">mlx::core::LogAddExp::LogAddExp</a></div><div class="ttdeci">LogAddExp(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1292</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_log_html"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html">mlx::core::Log</a></div><div class="ttdef"><b>Definition</b> primitives.h:1189</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_log_html_a044a23e8b1422984628e1cd5ab506421"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421">mlx::core::Log::Base</a></div><div class="ttdeci">Base</div><div class="ttdef"><b>Definition</b> primitives.h:1191</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_log_html_a663e54790c60b56eb0ff09f4f6635fb9"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9">mlx::core::Log::Log</a></div><div class="ttdeci">Log(Stream stream, Base base)</div><div class="ttdef"><b>Definition</b> primitives.h:1193</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_log_add_exp_html_ad8938ca90ccf1a3259973fc68902975a"><div class="ttname"><a href="classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a">mlx::core::LogAddExp::LogAddExp</a></div><div class="ttdeci">LogAddExp(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1311</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_log_html"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html">mlx::core::Log</a></div><div class="ttdef"><b>Definition</b> primitives.h:1208</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_log_html_a044a23e8b1422984628e1cd5ab506421"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421">mlx::core::Log::Base</a></div><div class="ttdeci">Base</div><div class="ttdef"><b>Definition</b> primitives.h:1210</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_log_html_a663e54790c60b56eb0ff09f4f6635fb9"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9">mlx::core::Log::Log</a></div><div class="ttdeci">Log(Stream stream, Base base)</div><div class="ttdef"><b>Definition</b> primitives.h:1212</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_log_html_aaaa49e9455f3a197bc319646b5ca6390"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390">mlx::core::Log::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_log_html_aadc7bb4cb24f3ecbbb9ed54a699ab74f"><div class="ttname"><a href="classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f">mlx::core::Log::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_logical_and_html"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_and.html">mlx::core::LogicalAnd</a></div><div class="ttdef"><b>Definition</b> primitives.h:1256</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_logical_and_html"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_and.html">mlx::core::LogicalAnd</a></div><div class="ttdef"><b>Definition</b> primitives.h:1275</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_logical_and_html_a132b2eedaa3978de5a5350da3c2ca40f"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f">mlx::core::LogicalAnd::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_logical_and_html_aaf2cab8ffcf6606b8babfef60fc06fb3"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3">mlx::core::LogicalAnd::LogicalAnd</a></div><div class="ttdeci">LogicalAnd(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1258</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_logical_and_html_aaf2cab8ffcf6606b8babfef60fc06fb3"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3">mlx::core::LogicalAnd::LogicalAnd</a></div><div class="ttdeci">LogicalAnd(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1277</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_logical_and_html_adbe1c1785af1a8b827289d22b0d170b3"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3">mlx::core::LogicalAnd::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_logical_not_html"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_not.html">mlx::core::LogicalNot</a></div><div class="ttdef"><b>Definition</b> primitives.h:1239</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_logical_not_html"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_not.html">mlx::core::LogicalNot</a></div><div class="ttdef"><b>Definition</b> primitives.h:1258</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_logical_not_html_a1d0d2bc93f935eca6c85ef7bf67f2d6a"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a">mlx::core::LogicalNot::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_logical_not_html_a6f5850b4c78b83d5e2c0d37437fc79b7"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7">mlx::core::LogicalNot::LogicalNot</a></div><div class="ttdeci">LogicalNot(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1241</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_logical_not_html_a6f5850b4c78b83d5e2c0d37437fc79b7"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7">mlx::core::LogicalNot::LogicalNot</a></div><div class="ttdeci">LogicalNot(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1260</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_logical_not_html_acf3f7b3b20ca69533536e0e0a05725b3"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3">mlx::core::LogicalNot::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_logical_or_html"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_or.html">mlx::core::LogicalOr</a></div><div class="ttdef"><b>Definition</b> primitives.h:1273</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_logical_or_html"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_or.html">mlx::core::LogicalOr</a></div><div class="ttdef"><b>Definition</b> primitives.h:1292</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_logical_or_html_a13cd4cbf26589287e85aeaaca42d7f62"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62">mlx::core::LogicalOr::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_logical_or_html_a269c22daca1c15ad010bb860bce93918"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918">mlx::core::LogicalOr::LogicalOr</a></div><div class="ttdeci">LogicalOr(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1275</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_logical_or_html_a269c22daca1c15ad010bb860bce93918"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918">mlx::core::LogicalOr::LogicalOr</a></div><div class="ttdeci">LogicalOr(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1294</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_logical_or_html_a3be1da328f0f8620de2e4fc1d22a077a"><div class="ttname"><a href="classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a">mlx::core::LogicalOr::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_matmul_html"><div class="ttname"><a href="classmlx_1_1core_1_1_matmul.html">mlx::core::Matmul</a></div><div class="ttdef"><b>Definition</b> primitives.h:1307</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_matmul_html"><div class="ttname"><a href="classmlx_1_1core_1_1_matmul.html">mlx::core::Matmul</a></div><div class="ttdef"><b>Definition</b> primitives.h:1326</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_matmul_html_a357a7f57a2a220a91977f810a69413fc"><div class="ttname"><a href="classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc">mlx::core::Matmul::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_matmul_html_a524136cca481598ea20894d85ca66bb0"><div class="ttname"><a href="classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0">mlx::core::Matmul::vjp</a></div><div class="ttdeci">std::vector&lt; array &gt; vjp(const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotangents, const std::vector&lt; int &gt; &amp;argnums, const std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">The vector-Jacobian product.</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_matmul_html_a8707a4e9b75c769e8f1dbca15c6a1ae7"><div class="ttname"><a href="classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7">mlx::core::Matmul::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_matmul_html_adef92f30ab35e540ccb316ea6b94e6f7"><div class="ttname"><a href="classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7">mlx::core::Matmul::Matmul</a></div><div class="ttdeci">Matmul(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1309</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_maximum_html"><div class="ttname"><a href="classmlx_1_1core_1_1_maximum.html">mlx::core::Maximum</a></div><div class="ttdef"><b>Definition</b> primitives.h:1325</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_maximum_html_a28389307e385efe1b2955b86b115e816"><div class="ttname"><a href="classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816">mlx::core::Maximum::Maximum</a></div><div class="ttdeci">Maximum(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1327</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_matmul_html_adef92f30ab35e540ccb316ea6b94e6f7"><div class="ttname"><a href="classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7">mlx::core::Matmul::Matmul</a></div><div class="ttdeci">Matmul(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1328</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_maximum_html"><div class="ttname"><a href="classmlx_1_1core_1_1_maximum.html">mlx::core::Maximum</a></div><div class="ttdef"><b>Definition</b> primitives.h:1344</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_maximum_html_a28389307e385efe1b2955b86b115e816"><div class="ttname"><a href="classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816">mlx::core::Maximum::Maximum</a></div><div class="ttdeci">Maximum(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1346</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_maximum_html_a62b38fbe5f96db58c2b60165ac4eadcf"><div class="ttname"><a href="classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf">mlx::core::Maximum::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_maximum_html_ade0f721b10a6b3a12bdadd34c48f72a7"><div class="ttname"><a href="classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7">mlx::core::Maximum::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_minimum_html"><div class="ttname"><a href="classmlx_1_1core_1_1_minimum.html">mlx::core::Minimum</a></div><div class="ttdef"><b>Definition</b> primitives.h:1342</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_minimum_html"><div class="ttname"><a href="classmlx_1_1core_1_1_minimum.html">mlx::core::Minimum</a></div><div class="ttdef"><b>Definition</b> primitives.h:1361</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_minimum_html_a6b93f493ee87089943a8085fe59dfc6e"><div class="ttname"><a href="classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e">mlx::core::Minimum::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_minimum_html_aadc68afa0afbe2103f19d161f5e0a2ba"><div class="ttname"><a href="classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba">mlx::core::Minimum::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_minimum_html_ab0f2ce17108df44b82cff68886b0f6f5"><div class="ttname"><a href="classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5">mlx::core::Minimum::Minimum</a></div><div class="ttdeci">Minimum(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1344</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_multiply_html"><div class="ttname"><a href="classmlx_1_1core_1_1_multiply.html">mlx::core::Multiply</a></div><div class="ttdef"><b>Definition</b> primitives.h:1359</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_minimum_html_ab0f2ce17108df44b82cff68886b0f6f5"><div class="ttname"><a href="classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5">mlx::core::Minimum::Minimum</a></div><div class="ttdeci">Minimum(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1363</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_multiply_html"><div class="ttname"><a href="classmlx_1_1core_1_1_multiply.html">mlx::core::Multiply</a></div><div class="ttdef"><b>Definition</b> primitives.h:1378</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_multiply_html_a624fce06c047cdc4dfdbdcaaddb25f34"><div class="ttname"><a href="classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34">mlx::core::Multiply::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_multiply_html_a634fcb4e981d8d3f4d94252caf25bee0"><div class="ttname"><a href="classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0">mlx::core::Multiply::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_multiply_html_aca5c50f900321f3eb4d6fbcbc225c00c"><div class="ttname"><a href="classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c">mlx::core::Multiply::Multiply</a></div><div class="ttdeci">Multiply(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1361</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_negative_html"><div class="ttname"><a href="classmlx_1_1core_1_1_negative.html">mlx::core::Negative</a></div><div class="ttdef"><b>Definition</b> primitives.h:1376</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_multiply_html_aca5c50f900321f3eb4d6fbcbc225c00c"><div class="ttname"><a href="classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c">mlx::core::Multiply::Multiply</a></div><div class="ttdeci">Multiply(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1380</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_negative_html"><div class="ttname"><a href="classmlx_1_1core_1_1_negative.html">mlx::core::Negative</a></div><div class="ttdef"><b>Definition</b> primitives.h:1395</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_negative_html_a97f1b316eace0c6d9e576d766940c75b"><div class="ttname"><a href="classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b">mlx::core::Negative::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_negative_html_aa3b73395d9fa5b7215dca488bc0d3c70"><div class="ttname"><a href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70">mlx::core::Negative::Negative</a></div><div class="ttdeci">Negative(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1378</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_negative_html_aa3b73395d9fa5b7215dca488bc0d3c70"><div class="ttname"><a href="classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70">mlx::core::Negative::Negative</a></div><div class="ttdeci">Negative(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1397</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_negative_html_af43553dc418c8ebe75fa9cdcba103c3b"><div class="ttname"><a href="classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b">mlx::core::Negative::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_not_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_not_equal.html">mlx::core::NotEqual</a></div><div class="ttdef"><b>Definition</b> primitives.h:1393</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_not_equal_html"><div class="ttname"><a href="classmlx_1_1core_1_1_not_equal.html">mlx::core::NotEqual</a></div><div class="ttdef"><b>Definition</b> primitives.h:1412</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_not_equal_html_a61179747e34e203150e9c660dfddb5f2"><div class="ttname"><a href="classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2">mlx::core::NotEqual::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_not_equal_html_a8f95f8b5873850b875b1641df8196047"><div class="ttname"><a href="classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047">mlx::core::NotEqual::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_not_equal_html_ac568397bd17b5d9f25ad1a0ebadedbb9"><div class="ttname"><a href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9">mlx::core::NotEqual::NotEqual</a></div><div class="ttdeci">NotEqual(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1395</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_number_of_elements_html"><div class="ttname"><a href="classmlx_1_1core_1_1_number_of_elements.html">mlx::core::NumberOfElements</a></div><div class="ttdef"><b>Definition</b> primitives.h:1410</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_not_equal_html_ac568397bd17b5d9f25ad1a0ebadedbb9"><div class="ttname"><a href="classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9">mlx::core::NotEqual::NotEqual</a></div><div class="ttdeci">NotEqual(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1414</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_number_of_elements_html"><div class="ttname"><a href="classmlx_1_1core_1_1_number_of_elements.html">mlx::core::NumberOfElements</a></div><div class="ttdef"><b>Definition</b> primitives.h:1429</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_number_of_elements_html_a2c98c42915fb2bfe12f5c99ea553eff5"><div class="ttname"><a href="classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5">mlx::core::NumberOfElements::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_number_of_elements_html_ac64d7c40ae29d687f8b7d2fa33e13b06"><div class="ttname"><a href="classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06">mlx::core::NumberOfElements::NumberOfElements</a></div><div class="ttdeci">NumberOfElements(Stream stream, std::vector&lt; int &gt; axes, bool inverted, Dtype dtype)</div><div class="ttdef"><b>Definition</b> primitives.h:1412</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_number_of_elements_html_ac64d7c40ae29d687f8b7d2fa33e13b06"><div class="ttname"><a href="classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06">mlx::core::NumberOfElements::NumberOfElements</a></div><div class="ttdeci">NumberOfElements(Stream stream, std::vector&lt; int &gt; axes, bool inverted, Dtype dtype)</div><div class="ttdef"><b>Definition</b> primitives.h:1431</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_number_of_elements_html_acc328321cf5300874ee884367cbede3f"><div class="ttname"><a href="classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f">mlx::core::NumberOfElements::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_pad_html"><div class="ttname"><a href="classmlx_1_1core_1_1_pad.html">mlx::core::Pad</a></div><div class="ttdef"><b>Definition</b> primitives.h:1441</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_pad_html"><div class="ttname"><a href="classmlx_1_1core_1_1_pad.html">mlx::core::Pad</a></div><div class="ttdef"><b>Definition</b> primitives.h:1460</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_pad_html_aaf82dd163cd536fbf97304f8b29080cb"><div class="ttname"><a href="classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb">mlx::core::Pad::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_pad_html_ad03da2c40b1e1f2fdf2649d00fa4ab43"><div class="ttname"><a href="classmlx_1_1core_1_1_pad.html#ad03da2c40b1e1f2fdf2649d00fa4ab43">mlx::core::Pad::Pad</a></div><div class="ttdeci">Pad(Stream stream, const std::vector&lt; int &gt; &amp;axes, const std::vector&lt; int &gt; &amp;low_pad_size, const std::vector&lt; int &gt; &amp;high_pad_size)</div><div class="ttdef"><b>Definition</b> primitives.h:1443</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_pad_html_ad03da2c40b1e1f2fdf2649d00fa4ab43"><div class="ttname"><a href="classmlx_1_1core_1_1_pad.html#ad03da2c40b1e1f2fdf2649d00fa4ab43">mlx::core::Pad::Pad</a></div><div class="ttdeci">Pad(Stream stream, const std::vector&lt; int &gt; &amp;axes, const std::vector&lt; int &gt; &amp;low_pad_size, const std::vector&lt; int &gt; &amp;high_pad_size)</div><div class="ttdef"><b>Definition</b> primitives.h:1462</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_pad_html_aefd4d3a5bd8b6b35b266c9e558ada153"><div class="ttname"><a href="classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153">mlx::core::Pad::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_partition_html"><div class="ttname"><a href="classmlx_1_1core_1_1_partition.html">mlx::core::Partition</a></div><div class="ttdef"><b>Definition</b> primitives.h:1469</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_partition_html"><div class="ttname"><a href="classmlx_1_1core_1_1_partition.html">mlx::core::Partition</a></div><div class="ttdef"><b>Definition</b> primitives.h:1488</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_partition_html_a784596ab567f9f3cb4fe1a69466523d8"><div class="ttname"><a href="classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8">mlx::core::Partition::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_partition_html_a7b82ca3895b6654308fac566b277ac0d"><div class="ttname"><a href="classmlx_1_1core_1_1_partition.html#a7b82ca3895b6654308fac566b277ac0d">mlx::core::Partition::Partition</a></div><div class="ttdeci">Partition(Stream stream, int kth, int axis)</div><div class="ttdef"><b>Definition</b> primitives.h:1471</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_partition_html_a7b82ca3895b6654308fac566b277ac0d"><div class="ttname"><a href="classmlx_1_1core_1_1_partition.html#a7b82ca3895b6654308fac566b277ac0d">mlx::core::Partition::Partition</a></div><div class="ttdeci">Partition(Stream stream, int kth, int axis)</div><div class="ttdef"><b>Definition</b> primitives.h:1490</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_partition_html_a8eca1be21ae9ccfda46e6f3e85f506ef"><div class="ttname"><a href="classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef">mlx::core::Partition::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_power_html"><div class="ttname"><a href="classmlx_1_1core_1_1_power.html">mlx::core::Power</a></div><div class="ttdef"><b>Definition</b> primitives.h:1490</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_power_html"><div class="ttname"><a href="classmlx_1_1core_1_1_power.html">mlx::core::Power</a></div><div class="ttdef"><b>Definition</b> primitives.h:1509</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_power_html_a6783da16fb6ff393aaa57737f1973206"><div class="ttname"><a href="classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206">mlx::core::Power::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_power_html_a7bc6c64179b7a2aef56fe1dafb6459b2"><div class="ttname"><a href="classmlx_1_1core_1_1_power.html#a7bc6c64179b7a2aef56fe1dafb6459b2">mlx::core::Power::Power</a></div><div class="ttdeci">Power(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1492</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_power_html_a7bc6c64179b7a2aef56fe1dafb6459b2"><div class="ttname"><a href="classmlx_1_1core_1_1_power.html#a7bc6c64179b7a2aef56fe1dafb6459b2">mlx::core::Power::Power</a></div><div class="ttdeci">Power(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1511</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_power_html_a80577d4c0853c24027777c90a1ec7e11"><div class="ttname"><a href="classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11">mlx::core::Power::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html">mlx::core::Primitive</a></div><div class="ttdef"><b>Definition</b> primitives.h:48</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html_a1596dc50b910538eae14878e98f07575"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575">mlx::core::Primitive::eval_cpu</a></div><div class="ttdeci">virtual void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs)=0</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
@@ -2932,126 +2959,126 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html_ad217376dcf5eff691d731566faec2ba2"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2">mlx::core::Primitive::eval_gpu</a></div><div class="ttdeci">virtual void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs)=0</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html_ae1aff91354ce036596088a3e19474ecb"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb">mlx::core::Primitive::print</a></div><div class="ttdeci">virtual void print(std::ostream &amp;os)=0</div><div class="ttdoc">Print the primitive.</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_primitive_html_afc69f22ee1f6e8a9ecc2c3a8f43b8fdb"><div class="ttname"><a href="classmlx_1_1core_1_1_primitive.html#afc69f22ee1f6e8a9ecc2c3a8f43b8fdb">mlx::core::Primitive::Primitive</a></div><div class="ttdeci">Primitive(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:50</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_q_r_f_html"><div class="ttname"><a href="classmlx_1_1core_1_1_q_r_f.html">mlx::core::QRF</a></div><div class="ttdef"><b>Definition</b> primitives.h:2133</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_q_r_f_html_a44ed2924dc574c4aeb79b1188b5c3983"><div class="ttname"><a href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983">mlx::core::QRF::QRF</a></div><div class="ttdeci">QRF(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2135</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_q_r_f_html"><div class="ttname"><a href="classmlx_1_1core_1_1_q_r_f.html">mlx::core::QRF</a></div><div class="ttdef"><b>Definition</b> primitives.h:2152</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_q_r_f_html_a44ed2924dc574c4aeb79b1188b5c3983"><div class="ttname"><a href="classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983">mlx::core::QRF::QRF</a></div><div class="ttdeci">QRF(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2154</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_q_r_f_html_a48493887395d65a27f04de1804d277d2"><div class="ttname"><a href="classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2">mlx::core::QRF::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_q_r_f_html_ae5fa3482192f4713605cd07e7fc1c6c9"><div class="ttname"><a href="classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9">mlx::core::QRF::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_quantized_matmul_html"><div class="ttname"><a href="classmlx_1_1core_1_1_quantized_matmul.html">mlx::core::QuantizedMatmul</a></div><div class="ttdef"><b>Definition</b> primitives.h:1507</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_quantized_matmul_html"><div class="ttname"><a href="classmlx_1_1core_1_1_quantized_matmul.html">mlx::core::QuantizedMatmul</a></div><div class="ttdef"><b>Definition</b> primitives.h:1526</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_quantized_matmul_html_a2812ad007d695ed1aaf9cf706fb9c4b3"><div class="ttname"><a href="classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3">mlx::core::QuantizedMatmul::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_quantized_matmul_html_a5bd164d038d9dc21919f7e0bfdeaa25c"><div class="ttname"><a href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c">mlx::core::QuantizedMatmul::QuantizedMatmul</a></div><div class="ttdeci">QuantizedMatmul(Stream stream, int group_size, int bits, bool transpose)</div><div class="ttdef"><b>Definition</b> primitives.h:1509</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_quantized_matmul_html_a5bd164d038d9dc21919f7e0bfdeaa25c"><div class="ttname"><a href="classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c">mlx::core::QuantizedMatmul::QuantizedMatmul</a></div><div class="ttdeci">QuantizedMatmul(Stream stream, int group_size, int bits, bool transpose)</div><div class="ttdef"><b>Definition</b> primitives.h:1528</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_quantized_matmul_html_ab3dfa73b74d8f4f2e9ab4f0eb016b0e3"><div class="ttname"><a href="classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3">mlx::core::QuantizedMatmul::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_random_bits_html"><div class="ttname"><a href="classmlx_1_1core_1_1_random_bits.html">mlx::core::RandomBits</a></div><div class="ttdef"><b>Definition</b> primitives.h:1559</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_random_bits_html"><div class="ttname"><a href="classmlx_1_1core_1_1_random_bits.html">mlx::core::RandomBits</a></div><div class="ttdef"><b>Definition</b> primitives.h:1578</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_random_bits_html_a5752d051cd16cf5f8d4754c0a656f0d2"><div class="ttname"><a href="classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2">mlx::core::RandomBits::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_random_bits_html_a578756866665358577418e4cdd94aa3a"><div class="ttname"><a href="classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a">mlx::core::RandomBits::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_random_bits_html_a7dd5ed03f2a4ab45d1d5e8e2b587de6b"><div class="ttname"><a href="classmlx_1_1core_1_1_random_bits.html#a7dd5ed03f2a4ab45d1d5e8e2b587de6b">mlx::core::RandomBits::RandomBits</a></div><div class="ttdeci">RandomBits(Stream stream, const std::vector&lt; int &gt; &amp;shape, int width)</div><div class="ttdef"><b>Definition</b> primitives.h:1561</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_real_html"><div class="ttname"><a href="classmlx_1_1core_1_1_real.html">mlx::core::Real</a></div><div class="ttdef"><b>Definition</b> primitives.h:1578</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_random_bits_html_a7dd5ed03f2a4ab45d1d5e8e2b587de6b"><div class="ttname"><a href="classmlx_1_1core_1_1_random_bits.html#a7dd5ed03f2a4ab45d1d5e8e2b587de6b">mlx::core::RandomBits::RandomBits</a></div><div class="ttdeci">RandomBits(Stream stream, const std::vector&lt; int &gt; &amp;shape, int width)</div><div class="ttdef"><b>Definition</b> primitives.h:1580</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_real_html"><div class="ttname"><a href="classmlx_1_1core_1_1_real.html">mlx::core::Real</a></div><div class="ttdef"><b>Definition</b> primitives.h:1597</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_real_html_a1e209e88a43bdd1eea43ad0b03f9a7f2"><div class="ttname"><a href="classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2">mlx::core::Real::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_real_html_a365d046caac91b521f0f5a5518037934"><div class="ttname"><a href="classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934">mlx::core::Real::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_real_html_acd4480e3f0834d70ff6b5f1ecef17892"><div class="ttname"><a href="classmlx_1_1core_1_1_real.html#acd4480e3f0834d70ff6b5f1ecef17892">mlx::core::Real::Real</a></div><div class="ttdeci">Real(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1580</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html">mlx::core::Reduce</a></div><div class="ttdef"><b>Definition</b> primitives.h:1619</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_a055368c1d036fb953a23ef230e33dcbf"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#a055368c1d036fb953a23ef230e33dcbf">mlx::core::Reduce::Reduce</a></div><div class="ttdeci">Reduce(Stream stream, ReduceType reduce_type, const std::vector&lt; int &gt; &amp;axes)</div><div class="ttdef"><b>Definition</b> primitives.h:1623</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_a0848518b16ae6d4043d6be247bdf31c9"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">mlx::core::Reduce::ReduceType</a></div><div class="ttdeci">ReduceType</div><div class="ttdef"><b>Definition</b> primitives.h:1621</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93">mlx::core::Reduce::And</a></div><div class="ttdeci">@ And</div><div class="ttdef"><b>Definition</b> primitives.h:1621</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_real_html_acd4480e3f0834d70ff6b5f1ecef17892"><div class="ttname"><a href="classmlx_1_1core_1_1_real.html#acd4480e3f0834d70ff6b5f1ecef17892">mlx::core::Real::Real</a></div><div class="ttdeci">Real(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1599</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html">mlx::core::Reduce</a></div><div class="ttdef"><b>Definition</b> primitives.h:1638</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_a055368c1d036fb953a23ef230e33dcbf"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#a055368c1d036fb953a23ef230e33dcbf">mlx::core::Reduce::Reduce</a></div><div class="ttdeci">Reduce(Stream stream, ReduceType reduce_type, const std::vector&lt; int &gt; &amp;axes)</div><div class="ttdef"><b>Definition</b> primitives.h:1642</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_a0848518b16ae6d4043d6be247bdf31c9"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9">mlx::core::Reduce::ReduceType</a></div><div class="ttdeci">ReduceType</div><div class="ttdef"><b>Definition</b> primitives.h:1640</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93">mlx::core::Reduce::And</a></div><div class="ttdeci">@ And</div><div class="ttdef"><b>Definition</b> primitives.h:1640</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_abe8f3327d617d0dd7438f066497ae08e"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e">mlx::core::Reduce::is_equivalent</a></div><div class="ttdeci">bool is_equivalent(const Primitive &amp;other) const override</div><div class="ttdoc">Equivalence check defaults to false unless overridden by the primitive.</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_ae9caaf42edadfe73ea208d98f526890f"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f">mlx::core::Reduce::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_reduce_html_aeb8a58b560c0a09ae3a695df7829acfa"><div class="ttname"><a href="classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa">mlx::core::Reduce::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_remainder_html"><div class="ttname"><a href="classmlx_1_1core_1_1_remainder.html">mlx::core::Remainder</a></div><div class="ttdef"><b>Definition</b> primitives.h:863</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_remainder_html_a4f3eada4a21898af4a77d1d27ce14641"><div class="ttname"><a href="classmlx_1_1core_1_1_remainder.html#a4f3eada4a21898af4a77d1d27ce14641">mlx::core::Remainder::Remainder</a></div><div class="ttdeci">Remainder(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:865</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_remainder_html"><div class="ttname"><a href="classmlx_1_1core_1_1_remainder.html">mlx::core::Remainder</a></div><div class="ttdef"><b>Definition</b> primitives.h:882</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_remainder_html_a4f3eada4a21898af4a77d1d27ce14641"><div class="ttname"><a href="classmlx_1_1core_1_1_remainder.html#a4f3eada4a21898af4a77d1d27ce14641">mlx::core::Remainder::Remainder</a></div><div class="ttdeci">Remainder(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:884</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_remainder_html_a7919ea9b84e42522d51bf0d5a396e161"><div class="ttname"><a href="classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161">mlx::core::Remainder::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_remainder_html_ac6c6c86a0bf02e6e529eb87f6e617ccc"><div class="ttname"><a href="classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc">mlx::core::Remainder::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_reshape_html"><div class="ttname"><a href="classmlx_1_1core_1_1_reshape.html">mlx::core::Reshape</a></div><div class="ttdef"><b>Definition</b> primitives.h:1592</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_reshape_html"><div class="ttname"><a href="classmlx_1_1core_1_1_reshape.html">mlx::core::Reshape</a></div><div class="ttdef"><b>Definition</b> primitives.h:1611</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_reshape_html_a658de2c5f710991b48e14b2bd19b229f"><div class="ttname"><a href="classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f">mlx::core::Reshape::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_reshape_html_aa1e85f28471875750c47351520b56059"><div class="ttname"><a href="classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059">mlx::core::Reshape::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_reshape_html_aa7c5a6e18d0615ad36102de01929eb26"><div class="ttname"><a href="classmlx_1_1core_1_1_reshape.html#aa7c5a6e18d0615ad36102de01929eb26">mlx::core::Reshape::Reshape</a></div><div class="ttdeci">Reshape(Stream stream, const std::vector&lt; int &gt; &amp;shape)</div><div class="ttdef"><b>Definition</b> primitives.h:1594</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_round_html"><div class="ttname"><a href="classmlx_1_1core_1_1_round.html">mlx::core::Round</a></div><div class="ttdef"><b>Definition</b> primitives.h:1674</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_round_html_a1327a359b2aed91f576145a0e70d1dde"><div class="ttname"><a href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde">mlx::core::Round::Round</a></div><div class="ttdeci">Round(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1676</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_reshape_html_aa7c5a6e18d0615ad36102de01929eb26"><div class="ttname"><a href="classmlx_1_1core_1_1_reshape.html#aa7c5a6e18d0615ad36102de01929eb26">mlx::core::Reshape::Reshape</a></div><div class="ttdeci">Reshape(Stream stream, const std::vector&lt; int &gt; &amp;shape)</div><div class="ttdef"><b>Definition</b> primitives.h:1613</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_round_html"><div class="ttname"><a href="classmlx_1_1core_1_1_round.html">mlx::core::Round</a></div><div class="ttdef"><b>Definition</b> primitives.h:1693</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_round_html_a1327a359b2aed91f576145a0e70d1dde"><div class="ttname"><a href="classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde">mlx::core::Round::Round</a></div><div class="ttdeci">Round(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1695</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_round_html_ad066b0944b437f64ab546025efa00007"><div class="ttname"><a href="classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007">mlx::core::Round::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_round_html_af7fe5ff8f3db166c203b4be4b07f13ec"><div class="ttname"><a href="classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec">mlx::core::Round::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_s_v_d_html"><div class="ttname"><a href="classmlx_1_1core_1_1_s_v_d.html">mlx::core::SVD</a></div><div class="ttdef"><b>Definition</b> primitives.h:2149</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_s_v_d_html"><div class="ttname"><a href="classmlx_1_1core_1_1_s_v_d.html">mlx::core::SVD</a></div><div class="ttdef"><b>Definition</b> primitives.h:2168</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_s_v_d_html_a637f5c39fa8b10722c04a066f6c1ada6"><div class="ttname"><a href="classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6">mlx::core::SVD::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_s_v_d_html_a7067b2207f826a25549d571856b94e83"><div class="ttname"><a href="classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83">mlx::core::SVD::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_s_v_d_html_ae89ff583e34fa894cccb8e7a475ee6d1"><div class="ttname"><a href="classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1">mlx::core::SVD::SVD</a></div><div class="ttdeci">SVD(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2151</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html">mlx::core::Scan</a></div><div class="ttdef"><b>Definition</b> primitives.h:1691</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_s_v_d_html_ae89ff583e34fa894cccb8e7a475ee6d1"><div class="ttname"><a href="classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1">mlx::core::SVD::SVD</a></div><div class="ttdeci">SVD(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2170</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html">mlx::core::Scan</a></div><div class="ttdef"><b>Definition</b> primitives.h:1710</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_a15676d9fd066e935782a923fba3e940b"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b">mlx::core::Scan::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_a47bf2ec54ead4b8f00f9f188518630f1"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">mlx::core::Scan::ReduceType</a></div><div class="ttdeci">ReduceType</div><div class="ttdef"><b>Definition</b> primitives.h:1693</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d">mlx::core::Scan::Max</a></div><div class="ttdeci">@ Max</div><div class="ttdef"><b>Definition</b> primitives.h:1693</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_a47bf2ec54ead4b8f00f9f188518630f1"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1">mlx::core::Scan::ReduceType</a></div><div class="ttdeci">ReduceType</div><div class="ttdef"><b>Definition</b> primitives.h:1712</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ad54b2905015a390708f79bae6cdac56d">mlx::core::Scan::Max</a></div><div class="ttdeci">@ Max</div><div class="ttdef"><b>Definition</b> primitives.h:1712</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_a54445a4d677ca4fe2a58d08eb5223ac6"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6">mlx::core::Scan::is_equivalent</a></div><div class="ttdeci">bool is_equivalent(const Primitive &amp;other) const override</div><div class="ttdoc">Equivalence check defaults to false unless overridden by the primitive.</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_ac93e8f9c6771de825d2186ef34fa7087"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087">mlx::core::Scan::Scan</a></div><div class="ttdeci">Scan(Stream stream, ReduceType reduce_type, int axis, bool reverse, bool inclusive)</div><div class="ttdef"><b>Definition</b> primitives.h:1695</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_ac93e8f9c6771de825d2186ef34fa7087"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087">mlx::core::Scan::Scan</a></div><div class="ttdeci">Scan(Stream stream, ReduceType reduce_type, int axis, bool reverse, bool inclusive)</div><div class="ttdef"><b>Definition</b> primitives.h:1714</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_scan_html_aef22c6fc2b2cb2a907cd8965c7413dde"><div class="ttname"><a href="classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde">mlx::core::Scan::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html">mlx::core::Scatter</a></div><div class="ttdef"><b>Definition</b> primitives.h:1741</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html">mlx::core::Scatter</a></div><div class="ttdef"><b>Definition</b> primitives.h:1760</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_a0208172562abdc90472e6eb5f84c987f"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f">mlx::core::Scatter::is_equivalent</a></div><div class="ttdeci">bool is_equivalent(const Primitive &amp;other) const override</div><div class="ttdoc">Equivalence check defaults to false unless overridden by the primitive.</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_a614d19af11dc30644b2b4941033b613c"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">mlx::core::Scatter::ReduceType</a></div><div class="ttdeci">ReduceType</div><div class="ttdef"><b>Definition</b> primitives.h:1743</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16">mlx::core::Scatter::Max</a></div><div class="ttdeci">@ Max</div><div class="ttdef"><b>Definition</b> primitives.h:1743</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_a614d19af11dc30644b2b4941033b613c"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613c">mlx::core::Scatter::ReduceType</a></div><div class="ttdeci">ReduceType</div><div class="ttdef"><b>Definition</b> primitives.h:1762</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca1c2da7b96d743296fe660f5fc4072f16">mlx::core::Scatter::Max</a></div><div class="ttdeci">@ Max</div><div class="ttdef"><b>Definition</b> primitives.h:1762</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_a7623f590f8b77167b5ebb4f14bc9dc97"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97">mlx::core::Scatter::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_aa9d45cbfb27b814517f6016092b30efa"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter::print</a></div><div class="ttdeci">void print(std::ostream &amp;os) override</div><div class="ttdoc">Print the primitive.</div><div class="ttdef"><b>Definition</b> primitives.h:1757</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_aa9d45cbfb27b814517f6016092b30efa"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa">mlx::core::Scatter::print</a></div><div class="ttdeci">void print(std::ostream &amp;os) override</div><div class="ttdoc">Print the primitive.</div><div class="ttdef"><b>Definition</b> primitives.h:1776</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_ab304345db3d8cfeea15e27461ae2e678"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678">mlx::core::Scatter::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_ac9b3eff67389ef9aa820753379ffeaa3"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3">mlx::core::Scatter::Scatter</a></div><div class="ttdeci">Scatter(Stream stream, ReduceType reduce_type, const std::vector&lt; int &gt; &amp;axes)</div><div class="ttdef"><b>Definition</b> primitives.h:1745</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_select_html"><div class="ttname"><a href="classmlx_1_1core_1_1_select.html">mlx::core::Select</a></div><div class="ttdef"><b>Definition</b> primitives.h:846</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_scatter_html_ac9b3eff67389ef9aa820753379ffeaa3"><div class="ttname"><a href="classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3">mlx::core::Scatter::Scatter</a></div><div class="ttdeci">Scatter(Stream stream, ReduceType reduce_type, const std::vector&lt; int &gt; &amp;axes)</div><div class="ttdef"><b>Definition</b> primitives.h:1764</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_select_html"><div class="ttname"><a href="classmlx_1_1core_1_1_select.html">mlx::core::Select</a></div><div class="ttdef"><b>Definition</b> primitives.h:865</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_select_html_a2a82b6cba4c386b2b87f225a4b08ea9b"><div class="ttname"><a href="classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b">mlx::core::Select::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_select_html_a6f833fe55dd68ad3726bbf9a8f75eec9"><div class="ttname"><a href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">mlx::core::Select::Select</a></div><div class="ttdeci">Select(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:848</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_select_html_a6f833fe55dd68ad3726bbf9a8f75eec9"><div class="ttname"><a href="classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9">mlx::core::Select::Select</a></div><div class="ttdeci">Select(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:867</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_select_html_aa51aa36e0adbd69e0d23d7c7adf88de2"><div class="ttname"><a href="classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2">mlx::core::Select::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sigmoid_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sigmoid.html">mlx::core::Sigmoid</a></div><div class="ttdef"><b>Definition</b> primitives.h:1784</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sigmoid_html_a47eca99113ec19f0eb60b6a0472c592b"><div class="ttname"><a href="classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b">mlx::core::Sigmoid::Sigmoid</a></div><div class="ttdeci">Sigmoid(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1786</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sigmoid_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sigmoid.html">mlx::core::Sigmoid</a></div><div class="ttdef"><b>Definition</b> primitives.h:1803</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sigmoid_html_a47eca99113ec19f0eb60b6a0472c592b"><div class="ttname"><a href="classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b">mlx::core::Sigmoid::Sigmoid</a></div><div class="ttdeci">Sigmoid(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1805</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sigmoid_html_a7a6bd0222d51d7f25f2719a91ccdfeca"><div class="ttname"><a href="classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca">mlx::core::Sigmoid::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sigmoid_html_aa930ce05734cca529ebcb8d0ca8e1255"><div class="ttname"><a href="classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255">mlx::core::Sigmoid::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sign_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sign.html">mlx::core::Sign</a></div><div class="ttdef"><b>Definition</b> primitives.h:1801</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sign_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sign.html">mlx::core::Sign</a></div><div class="ttdef"><b>Definition</b> primitives.h:1820</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sign_html_a7498ec993b66879be30c5d9762c45a97"><div class="ttname"><a href="classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97">mlx::core::Sign::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sign_html_afa2b48b99a194106006b44af69ffda8b"><div class="ttname"><a href="classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b">mlx::core::Sign::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sign_html_afe951e50907bc23a601ec5fa9eae5763"><div class="ttname"><a href="classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763">mlx::core::Sign::Sign</a></div><div class="ttdeci">Sign(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1803</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sin_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sin.html">mlx::core::Sin</a></div><div class="ttdef"><b>Definition</b> primitives.h:1818</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sin_html_a10d1ecc0ca96e79cdf55b57073d126ea"><div class="ttname"><a href="classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea">mlx::core::Sin::Sin</a></div><div class="ttdeci">Sin(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1820</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sign_html_afe951e50907bc23a601ec5fa9eae5763"><div class="ttname"><a href="classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763">mlx::core::Sign::Sign</a></div><div class="ttdeci">Sign(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1822</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sin_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sin.html">mlx::core::Sin</a></div><div class="ttdef"><b>Definition</b> primitives.h:1837</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sin_html_a10d1ecc0ca96e79cdf55b57073d126ea"><div class="ttname"><a href="classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea">mlx::core::Sin::Sin</a></div><div class="ttdeci">Sin(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1839</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sin_html_a6b59f1156cf8bdad8d45acd1d825cb5e"><div class="ttname"><a href="classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e">mlx::core::Sin::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sin_html_ab34f9cebc2aed55a0b6ab4c991f02eb5"><div class="ttname"><a href="classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5">mlx::core::Sin::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sinh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sinh.html">mlx::core::Sinh</a></div><div class="ttdef"><b>Definition</b> primitives.h:1835</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sinh_html_a4a4f6814d403c2ce5d6c574b0dca3c96"><div class="ttname"><a href="classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96">mlx::core::Sinh::Sinh</a></div><div class="ttdeci">Sinh(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1837</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sinh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sinh.html">mlx::core::Sinh</a></div><div class="ttdef"><b>Definition</b> primitives.h:1854</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sinh_html_a4a4f6814d403c2ce5d6c574b0dca3c96"><div class="ttname"><a href="classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96">mlx::core::Sinh::Sinh</a></div><div class="ttdeci">Sinh(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1856</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sinh_html_a5a1af2399f166d5b228b5e83a1837c75"><div class="ttname"><a href="classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75">mlx::core::Sinh::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sinh_html_ab6d5f6f40d177f6435f6a51c71b939dd"><div class="ttname"><a href="classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd">mlx::core::Sinh::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_slice_html"><div class="ttname"><a href="classmlx_1_1core_1_1_slice.html">mlx::core::Slice</a></div><div class="ttdef"><b>Definition</b> primitives.h:1852</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_slice_html"><div class="ttname"><a href="classmlx_1_1core_1_1_slice.html">mlx::core::Slice</a></div><div class="ttdef"><b>Definition</b> primitives.h:1871</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_slice_html_a4b13503f5b2f5c6a90d394b020f9b3f2"><div class="ttname"><a href="classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2">mlx::core::Slice::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_slice_html_a8a38feb7bb6b72bdeebb83f053e2fd7f"><div class="ttname"><a href="classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f">mlx::core::Slice::Slice</a></div><div class="ttdeci">Slice(Stream stream, const std::vector&lt; int &gt; &amp;start_indices, const std::vector&lt; int &gt; &amp;end_indices, const std::vector&lt; int &gt; &amp;strides)</div><div class="ttdef"><b>Definition</b> primitives.h:1854</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_slice_html_a8a38feb7bb6b72bdeebb83f053e2fd7f"><div class="ttname"><a href="classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f">mlx::core::Slice::Slice</a></div><div class="ttdeci">Slice(Stream stream, const std::vector&lt; int &gt; &amp;start_indices, const std::vector&lt; int &gt; &amp;end_indices, const std::vector&lt; int &gt; &amp;strides)</div><div class="ttdef"><b>Definition</b> primitives.h:1873</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_slice_html_aa53c21ff06a7c659e889af6b97d10a4a"><div class="ttname"><a href="classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a">mlx::core::Slice::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_slice_update_html"><div class="ttname"><a href="classmlx_1_1core_1_1_slice_update.html">mlx::core::SliceUpdate</a></div><div class="ttdef"><b>Definition</b> primitives.h:1880</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_slice_update_html_aa30a7f22f557c56e1a2b5fcf44488990"><div class="ttname"><a href="classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990">mlx::core::SliceUpdate::SliceUpdate</a></div><div class="ttdeci">SliceUpdate(Stream stream, const std::vector&lt; int &gt; &amp;start_indices, const std::vector&lt; int &gt; &amp;end_indices, const std::vector&lt; int &gt; &amp;strides)</div><div class="ttdef"><b>Definition</b> primitives.h:1882</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_slice_update_html"><div class="ttname"><a href="classmlx_1_1core_1_1_slice_update.html">mlx::core::SliceUpdate</a></div><div class="ttdef"><b>Definition</b> primitives.h:1899</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_slice_update_html_aa30a7f22f557c56e1a2b5fcf44488990"><div class="ttname"><a href="classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990">mlx::core::SliceUpdate::SliceUpdate</a></div><div class="ttdeci">SliceUpdate(Stream stream, const std::vector&lt; int &gt; &amp;start_indices, const std::vector&lt; int &gt; &amp;end_indices, const std::vector&lt; int &gt; &amp;strides)</div><div class="ttdef"><b>Definition</b> primitives.h:1901</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_slice_update_html_aac1a1d122e5697be057d63552141032b"><div class="ttname"><a href="classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b">mlx::core::SliceUpdate::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_slice_update_html_ad82ca0e3ab88a0e086431050deea831b"><div class="ttname"><a href="classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b">mlx::core::SliceUpdate::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_softmax_html"><div class="ttname"><a href="classmlx_1_1core_1_1_softmax.html">mlx::core::Softmax</a></div><div class="ttdef"><b>Definition</b> primitives.h:1910</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_softmax_html"><div class="ttname"><a href="classmlx_1_1core_1_1_softmax.html">mlx::core::Softmax</a></div><div class="ttdef"><b>Definition</b> primitives.h:1929</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_softmax_html_a35dac69ddcc7e2ec0e1a76fe93db85af"><div class="ttname"><a href="classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af">mlx::core::Softmax::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_softmax_html_a4ec686aac4e06f0dfe2cbd6801af40eb"><div class="ttname"><a href="classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb">mlx::core::Softmax::Softmax</a></div><div class="ttdeci">Softmax(Stream stream, bool precise)</div><div class="ttdef"><b>Definition</b> primitives.h:1912</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_softmax_html_a4ec686aac4e06f0dfe2cbd6801af40eb"><div class="ttname"><a href="classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb">mlx::core::Softmax::Softmax</a></div><div class="ttdeci">Softmax(Stream stream, bool precise)</div><div class="ttdef"><b>Definition</b> primitives.h:1931</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_softmax_html_ac9ebc2eab1683b682e689ed8f4622b79"><div class="ttname"><a href="classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79">mlx::core::Softmax::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sort_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sort.html">mlx::core::Sort</a></div><div class="ttdef"><b>Definition</b> primitives.h:1930</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sort_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sort.html">mlx::core::Sort</a></div><div class="ttdef"><b>Definition</b> primitives.h:1949</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sort_html_a4141c48f0e8670c728663f3722675382"><div class="ttname"><a href="classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382">mlx::core::Sort::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sort_html_a459769a0241b2620e55bedaba19827cd"><div class="ttname"><a href="classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd">mlx::core::Sort::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sort_html_a62943032dbd72e85ceb9b4b7211f4a44"><div class="ttname"><a href="classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44">mlx::core::Sort::Sort</a></div><div class="ttdeci">Sort(Stream stream, int axis)</div><div class="ttdef"><b>Definition</b> primitives.h:1932</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_split_html"><div class="ttname"><a href="classmlx_1_1core_1_1_split.html">mlx::core::Split</a></div><div class="ttdef"><b>Definition</b> primitives.h:1950</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sort_html_a62943032dbd72e85ceb9b4b7211f4a44"><div class="ttname"><a href="classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44">mlx::core::Sort::Sort</a></div><div class="ttdeci">Sort(Stream stream, int axis)</div><div class="ttdef"><b>Definition</b> primitives.h:1951</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_split_html"><div class="ttname"><a href="classmlx_1_1core_1_1_split.html">mlx::core::Split</a></div><div class="ttdef"><b>Definition</b> primitives.h:1969</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_split_html_a78ddda89c4daee73c74cfbc1e44656df"><div class="ttname"><a href="classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df">mlx::core::Split::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_split_html_a897c746ecfdff5119cc5ae3f20499385"><div class="ttname"><a href="classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385">mlx::core::Split::Split</a></div><div class="ttdeci">Split(Stream stream, const std::vector&lt; int &gt; &amp;indices, int axis)</div><div class="ttdef"><b>Definition</b> primitives.h:1952</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_split_html_a897c746ecfdff5119cc5ae3f20499385"><div class="ttname"><a href="classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385">mlx::core::Split::Split</a></div><div class="ttdeci">Split(Stream stream, const std::vector&lt; int &gt; &amp;indices, int axis)</div><div class="ttdef"><b>Definition</b> primitives.h:1971</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_split_html_aff2889cb9074f0fda53edf8fa40b1fd4"><div class="ttname"><a href="classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4">mlx::core::Split::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sqrt_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sqrt.html">mlx::core::Sqrt</a></div><div class="ttdef"><b>Definition</b> primitives.h:1989</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sqrt_html"><div class="ttname"><a href="classmlx_1_1core_1_1_sqrt.html">mlx::core::Sqrt</a></div><div class="ttdef"><b>Definition</b> primitives.h:2008</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sqrt_html_a5a64ecc4eef1e30a2963435dca7cefd5"><div class="ttname"><a href="classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5">mlx::core::Sqrt::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_sqrt_html_a6682a7c31ca427c9d2c5ddb6a479bf29"><div class="ttname"><a href="classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29">mlx::core::Sqrt::Sqrt</a></div><div class="ttdeci">Sqrt(Stream stream, bool recip=false)</div><div class="ttdef"><b>Definition</b> primitives.h:1991</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_sqrt_html_a6682a7c31ca427c9d2c5ddb6a479bf29"><div class="ttname"><a href="classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29">mlx::core::Sqrt::Sqrt</a></div><div class="ttdeci">Sqrt(Stream stream, bool recip=false)</div><div class="ttdef"><b>Definition</b> primitives.h:2010</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_sqrt_html_a6d205e679a593d1ba20206c5c47ba501"><div class="ttname"><a href="classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501">mlx::core::Sqrt::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_square_html"><div class="ttname"><a href="classmlx_1_1core_1_1_square.html">mlx::core::Square</a></div><div class="ttdef"><b>Definition</b> primitives.h:1972</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_square_html"><div class="ttname"><a href="classmlx_1_1core_1_1_square.html">mlx::core::Square</a></div><div class="ttdef"><b>Definition</b> primitives.h:1991</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_square_html_a0ea2a78a5bb52daa4103263bf2f98045"><div class="ttname"><a href="classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045">mlx::core::Square::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_square_html_a1f4d327a705950616da63b83c2829e59"><div class="ttname"><a href="classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59">mlx::core::Square::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_square_html_ab94e28d5c92e6febc1c74e525f730dc4"><div class="ttname"><a href="classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4">mlx::core::Square::Square</a></div><div class="ttdeci">Square(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1974</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html">mlx::core::StopGradient</a></div><div class="ttdef"><b>Definition</b> primitives.h:2015</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_square_html_ab94e28d5c92e6febc1c74e525f730dc4"><div class="ttname"><a href="classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4">mlx::core::Square::Square</a></div><div class="ttdeci">Square(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:1993</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html">mlx::core::StopGradient</a></div><div class="ttdef"><b>Definition</b> primitives.h:2034</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html_a56207714d374b08f60e4d9cdbc7340b2"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2">mlx::core::StopGradient::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html_a907b96f0a1ce608e211d87ccf2b9ca89"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89">mlx::core::StopGradient::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html_ac70d1ab819d04e00f76bc25aeebaf84f"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f">mlx::core::StopGradient::StopGradient</a></div><div class="ttdeci">StopGradient(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2017</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_subtract_html"><div class="ttname"><a href="classmlx_1_1core_1_1_subtract.html">mlx::core::Subtract</a></div><div class="ttdef"><b>Definition</b> primitives.h:2031</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_stop_gradient_html_ac70d1ab819d04e00f76bc25aeebaf84f"><div class="ttname"><a href="classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f">mlx::core::StopGradient::StopGradient</a></div><div class="ttdeci">StopGradient(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2036</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_subtract_html"><div class="ttname"><a href="classmlx_1_1core_1_1_subtract.html">mlx::core::Subtract</a></div><div class="ttdef"><b>Definition</b> primitives.h:2050</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_subtract_html_a47574258b6c95f8ad260c114d6d36a12"><div class="ttname"><a href="classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12">mlx::core::Subtract::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_subtract_html_a69021b23daf061764d97fabbc0f4f06c"><div class="ttname"><a href="classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c">mlx::core::Subtract::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_subtract_html_a834854757394f8de7082af65bf86ed9c"><div class="ttname"><a href="classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c">mlx::core::Subtract::Subtract</a></div><div class="ttdeci">Subtract(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2033</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_tan_html"><div class="ttname"><a href="classmlx_1_1core_1_1_tan.html">mlx::core::Tan</a></div><div class="ttdef"><b>Definition</b> primitives.h:2048</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_tan_html_a8dcc9ff660210ccf05134dd95f47de08"><div class="ttname"><a href="classmlx_1_1core_1_1_tan.html#a8dcc9ff660210ccf05134dd95f47de08">mlx::core::Tan::Tan</a></div><div class="ttdeci">Tan(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2050</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_subtract_html_a834854757394f8de7082af65bf86ed9c"><div class="ttname"><a href="classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c">mlx::core::Subtract::Subtract</a></div><div class="ttdeci">Subtract(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2052</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_tan_html"><div class="ttname"><a href="classmlx_1_1core_1_1_tan.html">mlx::core::Tan</a></div><div class="ttdef"><b>Definition</b> primitives.h:2067</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_tan_html_a8dcc9ff660210ccf05134dd95f47de08"><div class="ttname"><a href="classmlx_1_1core_1_1_tan.html#a8dcc9ff660210ccf05134dd95f47de08">mlx::core::Tan::Tan</a></div><div class="ttdeci">Tan(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2069</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_tan_html_a9c9a731158fa60eef30067fe0da9f3e9"><div class="ttname"><a href="classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9">mlx::core::Tan::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_tan_html_aca7dbb4836507005a2032ac957a04d3f"><div class="ttname"><a href="classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f">mlx::core::Tan::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_tanh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_tanh.html">mlx::core::Tanh</a></div><div class="ttdef"><b>Definition</b> primitives.h:2065</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_tanh_html"><div class="ttname"><a href="classmlx_1_1core_1_1_tanh.html">mlx::core::Tanh</a></div><div class="ttdef"><b>Definition</b> primitives.h:2084</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_tanh_html_a48df896599ae93dbce84a5c0f50cf761"><div class="ttname"><a href="classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761">mlx::core::Tanh::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_tanh_html_ae551297bf573e1802fb831440276dee4"><div class="ttname"><a href="classmlx_1_1core_1_1_tanh.html#ae551297bf573e1802fb831440276dee4">mlx::core::Tanh::Tanh</a></div><div class="ttdeci">Tanh(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2067</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_tanh_html_ae551297bf573e1802fb831440276dee4"><div class="ttname"><a href="classmlx_1_1core_1_1_tanh.html#ae551297bf573e1802fb831440276dee4">mlx::core::Tanh::Tanh</a></div><div class="ttdeci">Tanh(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2086</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_tanh_html_af7ed4345f622da069e5b0284067923f5"><div class="ttname"><a href="classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5">mlx::core::Tanh::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_transpose_html"><div class="ttname"><a href="classmlx_1_1core_1_1_transpose.html">mlx::core::Transpose</a></div><div class="ttdef"><b>Definition</b> primitives.h:2113</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_transpose_html_a1a9ba023584c61c7ac93d6dce536760a"><div class="ttname"><a href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a">mlx::core::Transpose::Transpose</a></div><div class="ttdeci">Transpose(Stream stream, const std::vector&lt; int &gt; &amp;axes)</div><div class="ttdef"><b>Definition</b> primitives.h:2115</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_transpose_html"><div class="ttname"><a href="classmlx_1_1core_1_1_transpose.html">mlx::core::Transpose</a></div><div class="ttdef"><b>Definition</b> primitives.h:2132</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_transpose_html_a1a9ba023584c61c7ac93d6dce536760a"><div class="ttname"><a href="classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a">mlx::core::Transpose::Transpose</a></div><div class="ttdeci">Transpose(Stream stream, const std::vector&lt; int &gt; &amp;axes)</div><div class="ttdef"><b>Definition</b> primitives.h:2134</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_transpose_html_a1fbcfcca43f9ec06c63a3c14708c30f8"><div class="ttname"><a href="classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8">mlx::core::Transpose::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_transpose_html_a38d25739c08aa594a6775015a1d7d92e"><div class="ttname"><a href="classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e">mlx::core::Transpose::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_unary_primitive_html"><div class="ttname"><a href="classmlx_1_1core_1_1_unary_primitive.html">mlx::core::UnaryPrimitive</a></div><div class="ttdef"><b>Definition</b> primitives.h:127</div></div>
@@ -3065,13 +3092,13 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="aclassmlx_1_1core_1_1_unary_primitive_html_aa0ed6e32c36200a3ff9bc592c9b300db"><div class="ttname"><a href="classmlx_1_1core_1_1_unary_primitive.html#aa0ed6e32c36200a3ff9bc592c9b300db">mlx::core::UnaryPrimitive::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override</div><div class="ttdoc">A primitive must know how to evaluate itself on the CPU/GPU for the given inputs and populate the out...</div><div class="ttdef"><b>Definition</b> primitives.h:137</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_unary_primitive_html_ab90b2ea80f1d914be03cf44def5db5a5"><div class="ttname"><a href="classmlx_1_1core_1_1_unary_primitive.html#ab90b2ea80f1d914be03cf44def5db5a5">mlx::core::UnaryPrimitive::operator=</a></div><div class="ttdeci">UnaryPrimitive &amp; operator=(UnaryPrimitive &amp;&amp;other)=delete</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_unary_primitive_html_ac0677ab99a5ca660ed6ab7902ea364de"><div class="ttname"><a href="classmlx_1_1core_1_1_unary_primitive.html#ac0677ab99a5ca660ed6ab7902ea364de">mlx::core::UnaryPrimitive::~UnaryPrimitive</a></div><div class="ttdeci">virtual ~UnaryPrimitive()=default</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_uniform_html"><div class="ttname"><a href="classmlx_1_1core_1_1_uniform.html">mlx::core::Uniform</a></div><div class="ttdef"><b>Definition</b> primitives.h:2082</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_uniform_html"><div class="ttname"><a href="classmlx_1_1core_1_1_uniform.html">mlx::core::Uniform</a></div><div class="ttdef"><b>Definition</b> primitives.h:2101</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_uniform_html_a037a2c96b79b70a64f2b637c9f1a432f"><div class="ttname"><a href="classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f">mlx::core::Uniform::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_uniform_html_a5f88cbf2495f24f87cefd99aaaebe4d0"><div class="ttname"><a href="classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0">mlx::core::Uniform::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_uniform_html_a626aa1091aa77b4a32c02290106b85e1"><div class="ttname"><a href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1">mlx::core::Uniform::Uniform</a></div><div class="ttdeci">Uniform(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2084</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_view_html"><div class="ttname"><a href="classmlx_1_1core_1_1_view.html">mlx::core::View</a></div><div class="ttdef"><b>Definition</b> primitives.h:2097</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_uniform_html_a626aa1091aa77b4a32c02290106b85e1"><div class="ttname"><a href="classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1">mlx::core::Uniform::Uniform</a></div><div class="ttdeci">Uniform(Stream stream)</div><div class="ttdef"><b>Definition</b> primitives.h:2103</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_view_html"><div class="ttname"><a href="classmlx_1_1core_1_1_view.html">mlx::core::View</a></div><div class="ttdef"><b>Definition</b> primitives.h:2116</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_view_html_a0ad6deb11914a242f10e8039fcb02497"><div class="ttname"><a href="classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497">mlx::core::View::eval_cpu</a></div><div class="ttdeci">void eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
-<div class="ttc" id="aclassmlx_1_1core_1_1_view_html_ad7eed156c308e9a29a8b41f965ec941e"><div class="ttname"><a href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e">mlx::core::View::View</a></div><div class="ttdeci">View(Stream stream, Dtype dtype)</div><div class="ttdef"><b>Definition</b> primitives.h:2099</div></div>
+<div class="ttc" id="aclassmlx_1_1core_1_1_view_html_ad7eed156c308e9a29a8b41f965ec941e"><div class="ttname"><a href="classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e">mlx::core::View::View</a></div><div class="ttdeci">View(Stream stream, Dtype dtype)</div><div class="ttdef"><b>Definition</b> primitives.h:2118</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1_view_html_add6e12ff1e476fe1db7718b14f21b075"><div class="ttname"><a href="classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075">mlx::core::View::eval_gpu</a></div><div class="ttdeci">void eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out) override</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
diff --git a/docs/build/html/python/_autosummary/mlx.core.Device.html b/docs/build/html/python/_autosummary/mlx.core.Device.html
index 595b8ae57..ff880fcf0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.Device.html
+++ b/docs/build/html/python/_autosummary/mlx.core.Device.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.Device &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.Device &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Devices and Streams" href="../devices_and_streams.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.Dtype.html b/docs/build/html/python/_autosummary/mlx.core.Dtype.html
index 444b893e8..84df170e2 100644
--- a/docs/build/html/python/_autosummary/mlx.core.Dtype.html
+++ b/docs/build/html/python/_autosummary/mlx.core.Dtype.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.Dtype &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.Dtype &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Data Types" href="../data_types.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.DtypeCategory.html b/docs/build/html/python/_autosummary/mlx.core.DtypeCategory.html
index bb4562c99..0f28cb1ff 100644
--- a/docs/build/html/python/_autosummary/mlx.core.DtypeCategory.html
+++ b/docs/build/html/python/_autosummary/mlx.core.DtypeCategory.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.DtypeCategory &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.DtypeCategory &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.Dtype" href="mlx.core.Dtype.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.abs.html b/docs/build/html/python/_autosummary/mlx.core.abs.html
index 845ff50bc..0edbba635 100644
--- a/docs/build/html/python/_autosummary/mlx.core.abs.html
+++ b/docs/build/html/python/_autosummary/mlx.core.abs.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.abs &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.abs &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Operations" href="../ops.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.add.html b/docs/build/html/python/_autosummary/mlx.core.add.html
index c17e7295a..46acf931b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.add.html
+++ b/docs/build/html/python/_autosummary/mlx.core.add.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.add &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.add &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.abs" href="mlx.core.abs.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.addmm.html b/docs/build/html/python/_autosummary/mlx.core.addmm.html
index 0870a9ec2..e1bac246f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.addmm.html
+++ b/docs/build/html/python/_autosummary/mlx.core.addmm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.addmm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.addmm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.add" href="mlx.core.add.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.all.html b/docs/build/html/python/_autosummary/mlx.core.all.html
index e315832d6..b7dd55e18 100644
--- a/docs/build/html/python/_autosummary/mlx.core.all.html
+++ b/docs/build/html/python/_autosummary/mlx.core.all.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.all &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.all &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.addmm" href="mlx.core.addmm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.allclose.html b/docs/build/html/python/_autosummary/mlx.core.allclose.html
index 933a87992..c69d590f1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.allclose.html
+++ b/docs/build/html/python/_autosummary/mlx.core.allclose.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.allclose &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.allclose &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.all" href="mlx.core.all.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.any.html b/docs/build/html/python/_autosummary/mlx.core.any.html
index b4ee9c590..f1472dbf3 100644
--- a/docs/build/html/python/_autosummary/mlx.core.any.html
+++ b/docs/build/html/python/_autosummary/mlx.core.any.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.any &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.any &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.allclose" href="mlx.core.allclose.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arange.html b/docs/build/html/python/_autosummary/mlx.core.arange.html
index 7734e0414..3766992ec 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arange.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arange.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arange &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arange &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.any" href="mlx.core.any.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arccos.html b/docs/build/html/python/_autosummary/mlx.core.arccos.html
index 4a0c9f620..59ae6cdb8 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arccos.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arccos.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arccos &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arccos &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arange" href="mlx.core.arange.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arccosh.html b/docs/build/html/python/_autosummary/mlx.core.arccosh.html
index 01ee928ee..aa0ab59d6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arccosh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arccosh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arccosh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arccosh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arccos" href="mlx.core.arccos.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arcsin.html b/docs/build/html/python/_autosummary/mlx.core.arcsin.html
index 51d90ad51..4b5d7b57b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arcsin.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arcsin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arcsin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arcsin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arccosh" href="mlx.core.arccosh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arcsinh.html b/docs/build/html/python/_autosummary/mlx.core.arcsinh.html
index 55429324e..0d0ef608f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arcsinh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arcsinh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arcsinh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arcsinh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arcsin" href="mlx.core.arcsin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arctan.html b/docs/build/html/python/_autosummary/mlx.core.arctan.html
index 672285f5f..eac1b8fd4 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arctan.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arctan.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arctan &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arctan &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arcsinh" href="mlx.core.arcsinh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arctan2.html b/docs/build/html/python/_autosummary/mlx.core.arctan2.html
index b41fdcaa9..c93c32892 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arctan2.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arctan2.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arctan2 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arctan2 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arctan" href="mlx.core.arctan.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.arctanh.html b/docs/build/html/python/_autosummary/mlx.core.arctanh.html
index 3c23cfcdb..22b8db91c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.arctanh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.arctanh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.arctanh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.arctanh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arctan2" href="mlx.core.arctan2.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.argmax.html b/docs/build/html/python/_autosummary/mlx.core.argmax.html
index 1f181bac6..290bb4161 100644
--- a/docs/build/html/python/_autosummary/mlx.core.argmax.html
+++ b/docs/build/html/python/_autosummary/mlx.core.argmax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.argmax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.argmax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.arctanh" href="mlx.core.arctanh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.argmin.html b/docs/build/html/python/_autosummary/mlx.core.argmin.html
index 6937d2090..1df0424e4 100644
--- a/docs/build/html/python/_autosummary/mlx.core.argmin.html
+++ b/docs/build/html/python/_autosummary/mlx.core.argmin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.argmin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.argmin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.argmax" href="mlx.core.argmax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.argpartition.html b/docs/build/html/python/_autosummary/mlx.core.argpartition.html
index 8760ff11f..d10427fab 100644
--- a/docs/build/html/python/_autosummary/mlx.core.argpartition.html
+++ b/docs/build/html/python/_autosummary/mlx.core.argpartition.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.argpartition &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.argpartition &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.argmin" href="mlx.core.argmin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.argsort.html b/docs/build/html/python/_autosummary/mlx.core.argsort.html
index a094833bf..b98398486 100644
--- a/docs/build/html/python/_autosummary/mlx.core.argsort.html
+++ b/docs/build/html/python/_autosummary/mlx.core.argsort.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.argsort &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.argsort &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.argpartition" href="mlx.core.argpartition.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.T.html b/docs/build/html/python/_autosummary/mlx.core.array.T.html
index d6fb39f7c..9fd367af6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.T.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.T.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.T &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.T &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.transpose" href="mlx.core.array.transpose.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.abs.html b/docs/build/html/python/_autosummary/mlx.core.array.abs.html
index b0724fa97..6075ece26 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.abs.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.abs.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.abs &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.abs &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.size" href="mlx.core.array.size.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.all.html b/docs/build/html/python/_autosummary/mlx.core.array.all.html
index 215e72d16..1dbde8ff2 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.all.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.all.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.all &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.all &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.abs" href="mlx.core.array.abs.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.any.html b/docs/build/html/python/_autosummary/mlx.core.array.any.html
index ab538dc58..f2f8bb881 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.any.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.any.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.any &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.any &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.all" href="mlx.core.array.all.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.argmax.html b/docs/build/html/python/_autosummary/mlx.core.array.argmax.html
index 641c46e0f..8bfc2aa74 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.argmax.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.argmax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.argmax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.argmax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.any" href="mlx.core.array.any.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.argmin.html b/docs/build/html/python/_autosummary/mlx.core.array.argmin.html
index 7198a13e7..e95fd586a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.argmin.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.argmin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.argmin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.argmin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.argmax" href="mlx.core.array.argmax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.astype.html b/docs/build/html/python/_autosummary/mlx.core.array.astype.html
index ecfdcf812..da06c777c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.astype.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.astype.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.astype &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.astype &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array" href="mlx.core.array.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.at.html b/docs/build/html/python/_autosummary/mlx.core.array.at.html
index 1d7069ea7..fbd2a504b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.at.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.at.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.at &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.at &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.astype" href="mlx.core.array.astype.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.conj.html b/docs/build/html/python/_autosummary/mlx.core.array.conj.html
index 43ee33bb8..27d0ced20 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.conj.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.conj.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.conj &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.conj &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.argmin" href="mlx.core.array.argmin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.cos.html b/docs/build/html/python/_autosummary/mlx.core.array.cos.html
index cf916121d..452c646a5 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.cos.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.cos.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.cos &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.cos &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.conj" href="mlx.core.array.conj.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.cummax.html b/docs/build/html/python/_autosummary/mlx.core.array.cummax.html
index 53e45ee46..e08ca16fa 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.cummax.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.cummax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.cummax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.cummax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.cos" href="mlx.core.array.cos.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.cummin.html b/docs/build/html/python/_autosummary/mlx.core.array.cummin.html
index 2073d1b14..38cf2a26a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.cummin.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.cummin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.cummin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.cummin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.cummax" href="mlx.core.array.cummax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.cumprod.html b/docs/build/html/python/_autosummary/mlx.core.array.cumprod.html
index 2f5348a48..4cdb34853 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.cumprod.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.cumprod.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.cumprod &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.cumprod &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.cummin" href="mlx.core.array.cummin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.cumsum.html b/docs/build/html/python/_autosummary/mlx.core.array.cumsum.html
index f146ab4f1..31a318bca 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.cumsum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.cumsum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.cumsum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.cumsum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.cumprod" href="mlx.core.array.cumprod.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.diag.html b/docs/build/html/python/_autosummary/mlx.core.array.diag.html
index b19d84637..b0af6c341 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.diag.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.diag.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.diag &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.diag &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.cumsum" href="mlx.core.array.cumsum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.diagonal.html b/docs/build/html/python/_autosummary/mlx.core.array.diagonal.html
index 777211684..23fb14a7a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.diagonal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.diagonal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.diagonal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.diagonal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.diag" href="mlx.core.array.diag.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.dtype.html b/docs/build/html/python/_autosummary/mlx.core.array.dtype.html
index e3231b4e9..ecab8d722 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.dtype.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.dtype.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.dtype &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.dtype &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.tolist" href="mlx.core.array.tolist.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.exp.html b/docs/build/html/python/_autosummary/mlx.core.array.exp.html
index 586bb60e4..2a29a331c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.exp.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.exp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.exp &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.exp &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.diagonal" href="mlx.core.array.diagonal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.flatten.html b/docs/build/html/python/_autosummary/mlx.core.array.flatten.html
index ba45ef7a3..7320fa855 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.flatten.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.flatten.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.flatten &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.flatten &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.exp" href="mlx.core.array.exp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.html b/docs/build/html/python/_autosummary/mlx.core.array.html
index c548e2fc3..887b4f5b6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Array" href="../array.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.item.html b/docs/build/html/python/_autosummary/mlx.core.array.item.html
index d617b8cd9..74d3f557e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.item.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.item.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.item &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.item &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.at" href="mlx.core.array.at.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.itemsize.html b/docs/build/html/python/_autosummary/mlx.core.array.itemsize.html
index a1040c73d..109aae10b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.itemsize.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.itemsize.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.itemsize &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.itemsize &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.dtype" href="mlx.core.array.dtype.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.log.html b/docs/build/html/python/_autosummary/mlx.core.array.log.html
index 2a9e88cd1..8db6bcec0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.log.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.log.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.log &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.log &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.flatten" href="mlx.core.array.flatten.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.log10.html b/docs/build/html/python/_autosummary/mlx.core.array.log10.html
index addc17473..f4998d263 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.log10.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.log10.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.log10 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.log10 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.log" href="mlx.core.array.log.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.log1p.html b/docs/build/html/python/_autosummary/mlx.core.array.log1p.html
index 10bf96610..5e3febb6d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.log1p.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.log1p.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.log1p &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.log1p &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.log10" href="mlx.core.array.log10.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.log2.html b/docs/build/html/python/_autosummary/mlx.core.array.log2.html
index de817a37e..4827e3d4c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.log2.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.log2.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.log2 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.log2 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.log1p" href="mlx.core.array.log1p.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.logsumexp.html b/docs/build/html/python/_autosummary/mlx.core.array.logsumexp.html
index e67f1483d..70d0cab55 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.logsumexp.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.logsumexp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.logsumexp &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.logsumexp &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.log2" href="mlx.core.array.log2.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.max.html b/docs/build/html/python/_autosummary/mlx.core.array.max.html
index 3adb74a57..dd533e845 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.max.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.max.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.max &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.max &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.logsumexp" href="mlx.core.array.logsumexp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.mean.html b/docs/build/html/python/_autosummary/mlx.core.array.mean.html
index 9a1652734..43748a896 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.mean.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.mean.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.mean &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.mean &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.max" href="mlx.core.array.max.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.min.html b/docs/build/html/python/_autosummary/mlx.core.array.min.html
index c3196d28a..981e63d70 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.min.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.min.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.min &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.min &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.mean" href="mlx.core.array.mean.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.moveaxis.html b/docs/build/html/python/_autosummary/mlx.core.array.moveaxis.html
index 90094c782..60b4d88bb 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.moveaxis.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.moveaxis.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.moveaxis &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.moveaxis &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.min" href="mlx.core.array.min.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.nbytes.html b/docs/build/html/python/_autosummary/mlx.core.array.nbytes.html
index 2129d0950..890e8ab91 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.nbytes.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.nbytes.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.nbytes &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.nbytes &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.itemsize" href="mlx.core.array.itemsize.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.ndim.html b/docs/build/html/python/_autosummary/mlx.core.array.ndim.html
index a4d93838f..88495c6f1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.ndim.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.ndim.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.ndim &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.ndim &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.nbytes" href="mlx.core.array.nbytes.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.prod.html b/docs/build/html/python/_autosummary/mlx.core.array.prod.html
index a30eb23c0..aa2138436 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.prod.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.prod.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.prod &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.prod &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.moveaxis" href="mlx.core.array.moveaxis.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.reciprocal.html b/docs/build/html/python/_autosummary/mlx.core.array.reciprocal.html
index 350e05485..8a60615ba 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.reciprocal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.reciprocal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.reciprocal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.reciprocal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.prod" href="mlx.core.array.prod.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.reshape.html b/docs/build/html/python/_autosummary/mlx.core.array.reshape.html
index d605a18f3..268c43c62 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.reshape.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.reshape.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.reshape &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.reshape &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.reciprocal" href="mlx.core.array.reciprocal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.round.html b/docs/build/html/python/_autosummary/mlx.core.array.round.html
index 1e649bb5e..b525be193 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.round.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.round.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.round &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.round &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.reshape" href="mlx.core.array.reshape.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.rsqrt.html b/docs/build/html/python/_autosummary/mlx.core.array.rsqrt.html
index 02b1710ab..56b0ddc03 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.rsqrt.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.rsqrt.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.rsqrt &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.rsqrt &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.round" href="mlx.core.array.round.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.shape.html b/docs/build/html/python/_autosummary/mlx.core.array.shape.html
index 1383dcdf6..b19b0029e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.shape.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.shape.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.shape &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.shape &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.ndim" href="mlx.core.array.ndim.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.sin.html b/docs/build/html/python/_autosummary/mlx.core.array.sin.html
index 6ce5e835f..0a2332212 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.sin.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.sin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.sin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.sin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.rsqrt" href="mlx.core.array.rsqrt.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.size.html b/docs/build/html/python/_autosummary/mlx.core.array.size.html
index 4645ee22d..ba1427708 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.size.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.size.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.size &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.size &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.shape" href="mlx.core.array.shape.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.split.html b/docs/build/html/python/_autosummary/mlx.core.array.split.html
index 9a6db9d70..52bc08cbf 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.split.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.split.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.split &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.split &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.sin" href="mlx.core.array.sin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.sqrt.html b/docs/build/html/python/_autosummary/mlx.core.array.sqrt.html
index b257add25..081facb95 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.sqrt.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.sqrt.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.sqrt &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.sqrt &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.split" href="mlx.core.array.split.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.square.html b/docs/build/html/python/_autosummary/mlx.core.array.square.html
index 89d8ec513..10c1f54f0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.square.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.square.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.square &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.square &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.sqrt" href="mlx.core.array.sqrt.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.squeeze.html b/docs/build/html/python/_autosummary/mlx.core.array.squeeze.html
index ee859738e..6d1e91b7e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.squeeze.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.squeeze.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.squeeze &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.squeeze &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.square" href="mlx.core.array.square.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.std.html b/docs/build/html/python/_autosummary/mlx.core.array.std.html
index 49dedf680..424da05f0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.std.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.std.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.std &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.std &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.squeeze" href="mlx.core.array.squeeze.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.sum.html b/docs/build/html/python/_autosummary/mlx.core.array.sum.html
index 8bf408c1d..e1b12a6c6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.sum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.sum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.sum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.sum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.std" href="mlx.core.array.std.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.swapaxes.html b/docs/build/html/python/_autosummary/mlx.core.array.swapaxes.html
index 889374130..bac030d8e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.swapaxes.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.swapaxes.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.swapaxes &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.swapaxes &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.sum" href="mlx.core.array.sum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.tolist.html b/docs/build/html/python/_autosummary/mlx.core.array.tolist.html
index 02f2c841e..0dfcf51a0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.tolist.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.tolist.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.tolist &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.tolist &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.item" href="mlx.core.array.item.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.transpose.html b/docs/build/html/python/_autosummary/mlx.core.array.transpose.html
index e76508566..498049319 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.transpose.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.transpose.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.transpose &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.transpose &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.swapaxes" href="mlx.core.array.swapaxes.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.var.html b/docs/build/html/python/_autosummary/mlx.core.array.var.html
index 7cd25caa2..dc7854cb8 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.var.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.var.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.var &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.var &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.T" href="mlx.core.array.T.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array.view.html b/docs/build/html/python/_autosummary/mlx.core.array.view.html
index 93582c4c3..da0efcf15 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array.view.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array.view.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array.view &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array.view &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.var" href="mlx.core.array.var.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.array_equal.html b/docs/build/html/python/_autosummary/mlx.core.array_equal.html
index bb346b089..a7d1e2271 100644
--- a/docs/build/html/python/_autosummary/mlx.core.array_equal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.array_equal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.array_equal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.array_equal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.argsort" href="mlx.core.argsort.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.as_strided.html b/docs/build/html/python/_autosummary/mlx.core.as_strided.html
index 1f1ab296e..b230f5957 100644
--- a/docs/build/html/python/_autosummary/mlx.core.as_strided.html
+++ b/docs/build/html/python/_autosummary/mlx.core.as_strided.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.as_strided &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.as_strided &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array_equal" href="mlx.core.array_equal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.atleast_1d.html b/docs/build/html/python/_autosummary/mlx.core.atleast_1d.html
index 7b11d52e8..54dbaaa59 100644
--- a/docs/build/html/python/_autosummary/mlx.core.atleast_1d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.atleast_1d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.atleast_1d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.atleast_1d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.as_strided" href="mlx.core.as_strided.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.atleast_2d.html b/docs/build/html/python/_autosummary/mlx.core.atleast_2d.html
index a80fc090a..d4c71f158 100644
--- a/docs/build/html/python/_autosummary/mlx.core.atleast_2d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.atleast_2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.atleast_2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.atleast_2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.atleast_1d" href="mlx.core.atleast_1d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.atleast_3d.html b/docs/build/html/python/_autosummary/mlx.core.atleast_3d.html
index 76dcb8689..866f2fe87 100644
--- a/docs/build/html/python/_autosummary/mlx.core.atleast_3d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.atleast_3d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.atleast_3d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.atleast_3d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.atleast_2d" href="mlx.core.atleast_2d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.bitwise_and.html b/docs/build/html/python/_autosummary/mlx.core.bitwise_and.html
index 0248d1efd..774d4f60b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.bitwise_and.html
+++ b/docs/build/html/python/_autosummary/mlx.core.bitwise_and.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.bitwise_and &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.bitwise_and &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.atleast_3d" href="mlx.core.atleast_3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.bitwise_or.html b/docs/build/html/python/_autosummary/mlx.core.bitwise_or.html
index 937a3601a..a32708932 100644
--- a/docs/build/html/python/_autosummary/mlx.core.bitwise_or.html
+++ b/docs/build/html/python/_autosummary/mlx.core.bitwise_or.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.bitwise_or &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.bitwise_or &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.bitwise_and" href="mlx.core.bitwise_and.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.bitwise_xor.html b/docs/build/html/python/_autosummary/mlx.core.bitwise_xor.html
index 4aebefda8..064502332 100644
--- a/docs/build/html/python/_autosummary/mlx.core.bitwise_xor.html
+++ b/docs/build/html/python/_autosummary/mlx.core.bitwise_xor.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.bitwise_xor &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.bitwise_xor &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.bitwise_or" href="mlx.core.bitwise_or.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.block_masked_mm.html b/docs/build/html/python/_autosummary/mlx.core.block_masked_mm.html
index 768402874..edf5ad7c7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.block_masked_mm.html
+++ b/docs/build/html/python/_autosummary/mlx.core.block_masked_mm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.block_masked_mm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.block_masked_mm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.bitwise_xor" href="mlx.core.bitwise_xor.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.broadcast_to.html b/docs/build/html/python/_autosummary/mlx.core.broadcast_to.html
index aa0dfbd17..e47056925 100644
--- a/docs/build/html/python/_autosummary/mlx.core.broadcast_to.html
+++ b/docs/build/html/python/_autosummary/mlx.core.broadcast_to.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.broadcast_to &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.broadcast_to &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.block_masked_mm" href="mlx.core.block_masked_mm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.ceil.html b/docs/build/html/python/_autosummary/mlx.core.ceil.html
index f70ed7381..5f679d624 100644
--- a/docs/build/html/python/_autosummary/mlx.core.ceil.html
+++ b/docs/build/html/python/_autosummary/mlx.core.ceil.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.ceil &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.ceil &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.broadcast_to" href="mlx.core.broadcast_to.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.clip.html b/docs/build/html/python/_autosummary/mlx.core.clip.html
index 2ed7ba24f..8bca4c6af 100644
--- a/docs/build/html/python/_autosummary/mlx.core.clip.html
+++ b/docs/build/html/python/_autosummary/mlx.core.clip.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.clip &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.clip &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.ceil" href="mlx.core.ceil.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.compile.html b/docs/build/html/python/_autosummary/mlx.core.compile.html
index 44f7641bb..a081abcdb 100644
--- a/docs/build/html/python/_autosummary/mlx.core.compile.html
+++ b/docs/build/html/python/_autosummary/mlx.core.compile.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.compile &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.compile &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.eval" href="mlx.core.eval.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.concatenate.html b/docs/build/html/python/_autosummary/mlx.core.concatenate.html
index cc094e30e..68619f039 100644
--- a/docs/build/html/python/_autosummary/mlx.core.concatenate.html
+++ b/docs/build/html/python/_autosummary/mlx.core.concatenate.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.concatenate &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.concatenate &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.clip" href="mlx.core.clip.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conj.html b/docs/build/html/python/_autosummary/mlx.core.conj.html
index 33b59fa07..2f3c5c3e1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conj.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conj.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conj &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conj &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.concatenate" href="mlx.core.concatenate.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conjugate.html b/docs/build/html/python/_autosummary/mlx.core.conjugate.html
index c45c175ba..96a641c47 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conjugate.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conjugate.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conjugate &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conjugate &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conj" href="mlx.core.conj.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conv1d.html b/docs/build/html/python/_autosummary/mlx.core.conv1d.html
index 00fdf34f6..7756c0853 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conv1d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conv1d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conv1d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conv1d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.convolve" href="mlx.core.convolve.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conv2d.html b/docs/build/html/python/_autosummary/mlx.core.conv2d.html
index 3a62ce7e6..9309ab73e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conv2d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conv2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conv2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conv2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conv1d" href="mlx.core.conv1d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conv3d.html b/docs/build/html/python/_autosummary/mlx.core.conv3d.html
index b9f54b978..20d142702 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conv3d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conv3d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conv3d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conv3d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conv2d" href="mlx.core.conv2d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conv_general.html b/docs/build/html/python/_autosummary/mlx.core.conv_general.html
index a8a6ad987..5013c6f7d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conv_general.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conv_general.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conv_general &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conv_general &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conv_transpose3d" href="mlx.core.conv_transpose3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conv_transpose1d.html b/docs/build/html/python/_autosummary/mlx.core.conv_transpose1d.html
index cc2e2e95d..5fd2b8e07 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conv_transpose1d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conv_transpose1d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conv_transpose1d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conv_transpose1d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conv3d" href="mlx.core.conv3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conv_transpose2d.html b/docs/build/html/python/_autosummary/mlx.core.conv_transpose2d.html
index 7964c6336..58e7d920b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conv_transpose2d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conv_transpose2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conv_transpose2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conv_transpose2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conv_transpose1d" href="mlx.core.conv_transpose1d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.conv_transpose3d.html b/docs/build/html/python/_autosummary/mlx.core.conv_transpose3d.html
index 6072d889b..1a429936f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.conv_transpose3d.html
+++ b/docs/build/html/python/_autosummary/mlx.core.conv_transpose3d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.conv_transpose3d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.conv_transpose3d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conv_transpose2d" href="mlx.core.conv_transpose2d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.convolve.html b/docs/build/html/python/_autosummary/mlx.core.convolve.html
index d0c717202..24d4ae240 100644
--- a/docs/build/html/python/_autosummary/mlx.core.convolve.html
+++ b/docs/build/html/python/_autosummary/mlx.core.convolve.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.convolve &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.convolve &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conjugate" href="mlx.core.conjugate.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.cos.html b/docs/build/html/python/_autosummary/mlx.core.cos.html
index f038e1d45..b1c041860 100644
--- a/docs/build/html/python/_autosummary/mlx.core.cos.html
+++ b/docs/build/html/python/_autosummary/mlx.core.cos.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.cos &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.cos &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.conv_general" href="mlx.core.conv_general.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.cosh.html b/docs/build/html/python/_autosummary/mlx.core.cosh.html
index 17f091146..7c3f2c7b1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.cosh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.cosh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.cosh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.cosh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.cos" href="mlx.core.cos.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.cummax.html b/docs/build/html/python/_autosummary/mlx.core.cummax.html
index bcbf313e4..bf8765ca7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.cummax.html
+++ b/docs/build/html/python/_autosummary/mlx.core.cummax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.cummax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.cummax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.cosh" href="mlx.core.cosh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.cummin.html b/docs/build/html/python/_autosummary/mlx.core.cummin.html
index d96b5b68b..23025f942 100644
--- a/docs/build/html/python/_autosummary/mlx.core.cummin.html
+++ b/docs/build/html/python/_autosummary/mlx.core.cummin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.cummin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.cummin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.cummax" href="mlx.core.cummax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.cumprod.html b/docs/build/html/python/_autosummary/mlx.core.cumprod.html
index 56370f9b8..87d53ef6f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.cumprod.html
+++ b/docs/build/html/python/_autosummary/mlx.core.cumprod.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.cumprod &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.cumprod &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.cummin" href="mlx.core.cummin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.cumsum.html b/docs/build/html/python/_autosummary/mlx.core.cumsum.html
index 24d7cd5f9..d7d9f0f5d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.cumsum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.cumsum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.cumsum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.cumsum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.cumprod" href="mlx.core.cumprod.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.custom_function.html b/docs/build/html/python/_autosummary/mlx.core.custom_function.html
index 4c79155ce..ff20c514c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.custom_function.html
+++ b/docs/build/html/python/_autosummary/mlx.core.custom_function.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.custom_function &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.custom_function &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.compile" href="mlx.core.compile.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.default_device.html b/docs/build/html/python/_autosummary/mlx.core.default_device.html
index d1afe6dca..04e99c394 100644
--- a/docs/build/html/python/_autosummary/mlx.core.default_device.html
+++ b/docs/build/html/python/_autosummary/mlx.core.default_device.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.default_device &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.default_device &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.Stream" href="stream_class.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.default_stream.html b/docs/build/html/python/_autosummary/mlx.core.default_stream.html
index 13838d8e5..768562fa2 100644
--- a/docs/build/html/python/_autosummary/mlx.core.default_stream.html
+++ b/docs/build/html/python/_autosummary/mlx.core.default_stream.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.default_stream &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.default_stream &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.set_default_device" href="mlx.core.set_default_device.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.degrees.html b/docs/build/html/python/_autosummary/mlx.core.degrees.html
index a31cecb7f..e948c82d7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.degrees.html
+++ b/docs/build/html/python/_autosummary/mlx.core.degrees.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.degrees &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.degrees &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.cumsum" href="mlx.core.cumsum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.dequantize.html b/docs/build/html/python/_autosummary/mlx.core.dequantize.html
index e8595f66e..55b834591 100644
--- a/docs/build/html/python/_autosummary/mlx.core.dequantize.html
+++ b/docs/build/html/python/_autosummary/mlx.core.dequantize.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.dequantize &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.dequantize &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.degrees" href="mlx.core.degrees.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.diag.html b/docs/build/html/python/_autosummary/mlx.core.diag.html
index d032011e8..33d1a8047 100644
--- a/docs/build/html/python/_autosummary/mlx.core.diag.html
+++ b/docs/build/html/python/_autosummary/mlx.core.diag.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.diag &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.diag &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.dequantize" href="mlx.core.dequantize.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.diagonal.html b/docs/build/html/python/_autosummary/mlx.core.diagonal.html
index 9745beba2..c7473c799 100644
--- a/docs/build/html/python/_autosummary/mlx.core.diagonal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.diagonal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.diagonal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.diagonal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.diag" href="mlx.core.diag.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.disable_compile.html b/docs/build/html/python/_autosummary/mlx.core.disable_compile.html
index 1fdf82fa5..b6de85f32 100644
--- a/docs/build/html/python/_autosummary/mlx.core.disable_compile.html
+++ b/docs/build/html/python/_autosummary/mlx.core.disable_compile.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.disable_compile &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.disable_compile &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.custom_function" href="mlx.core.custom_function.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.Group.html b/docs/build/html/python/_autosummary/mlx.core.distributed.Group.html
index e2da256e4..94d2fb680 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.Group.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.Group.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.Group &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.Group &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Distributed Communication" href="../distributed.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.all_gather.html b/docs/build/html/python/_autosummary/mlx.core.distributed.all_gather.html
index aedf3d963..7ab351c4a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.all_gather.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.all_gather.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.all_gather &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.all_gather &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.all_sum" href="mlx.core.distributed.all_sum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.all_sum.html b/docs/build/html/python/_autosummary/mlx.core.distributed.all_sum.html
index e3ba93c5b..cd3bc6982 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.all_sum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.all_sum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.all_sum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.all_sum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.init" href="mlx.core.distributed.init.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.init.html b/docs/build/html/python/_autosummary/mlx.core.distributed.init.html
index 1eb50bf1b..b4a83ce5f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.init.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.init.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.init &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.init &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.is_available" href="mlx.core.distributed.is_available.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.is_available.html b/docs/build/html/python/_autosummary/mlx.core.distributed.is_available.html
index 9056493d3..ef1eac2b4 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.is_available.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.is_available.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.is_available &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.is_available &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.Group" href="mlx.core.distributed.Group.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.recv.html b/docs/build/html/python/_autosummary/mlx.core.distributed.recv.html
index c77dfef3e..604cdddf9 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.recv.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.recv.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.recv &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.recv &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.send" href="mlx.core.distributed.send.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.recv_like.html b/docs/build/html/python/_autosummary/mlx.core.distributed.recv_like.html
index c6e8045ac..e21db6291 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.recv_like.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.recv_like.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.recv_like &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.recv_like &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.recv" href="mlx.core.distributed.recv.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.distributed.send.html b/docs/build/html/python/_autosummary/mlx.core.distributed.send.html
index 2f62ec5a0..b5a1d9234 100644
--- a/docs/build/html/python/_autosummary/mlx.core.distributed.send.html
+++ b/docs/build/html/python/_autosummary/mlx.core.distributed.send.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.distributed.send &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.distributed.send &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.all_gather" href="mlx.core.distributed.all_gather.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.divide.html b/docs/build/html/python/_autosummary/mlx.core.divide.html
index 5ed322a6c..d84684d3f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.divide.html
+++ b/docs/build/html/python/_autosummary/mlx.core.divide.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.divide &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.divide &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.diagonal" href="mlx.core.diagonal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.divmod.html b/docs/build/html/python/_autosummary/mlx.core.divmod.html
index 46201a50b..222182000 100644
--- a/docs/build/html/python/_autosummary/mlx.core.divmod.html
+++ b/docs/build/html/python/_autosummary/mlx.core.divmod.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.divmod &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.divmod &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.divide" href="mlx.core.divide.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.einsum.html b/docs/build/html/python/_autosummary/mlx.core.einsum.html
index 56baf4694..de9ac682e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.einsum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.einsum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.einsum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.einsum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.divmod" href="mlx.core.divmod.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.einsum_path.html b/docs/build/html/python/_autosummary/mlx.core.einsum_path.html
index 38661609d..b8b89f8fb 100644
--- a/docs/build/html/python/_autosummary/mlx.core.einsum_path.html
+++ b/docs/build/html/python/_autosummary/mlx.core.einsum_path.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.einsum_path &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.einsum_path &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.einsum" href="mlx.core.einsum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.enable_compile.html b/docs/build/html/python/_autosummary/mlx.core.enable_compile.html
index 581356a5f..0e13be2d3 100644
--- a/docs/build/html/python/_autosummary/mlx.core.enable_compile.html
+++ b/docs/build/html/python/_autosummary/mlx.core.enable_compile.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.enable_compile &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.enable_compile &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.disable_compile" href="mlx.core.disable_compile.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.equal.html b/docs/build/html/python/_autosummary/mlx.core.equal.html
index 05ac1a8e0..0e746d96a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.equal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.equal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.equal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.equal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.einsum_path" href="mlx.core.einsum_path.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.erf.html b/docs/build/html/python/_autosummary/mlx.core.erf.html
index cbeab2c47..46f818d5f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.erf.html
+++ b/docs/build/html/python/_autosummary/mlx.core.erf.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.erf &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.erf &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.equal" href="mlx.core.equal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.erfinv.html b/docs/build/html/python/_autosummary/mlx.core.erfinv.html
index b90a6e18b..b7e8aa973 100644
--- a/docs/build/html/python/_autosummary/mlx.core.erfinv.html
+++ b/docs/build/html/python/_autosummary/mlx.core.erfinv.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.erfinv &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.erfinv &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.erf" href="mlx.core.erf.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.eval.html b/docs/build/html/python/_autosummary/mlx.core.eval.html
index bf5b124d6..457160b9d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.eval.html
+++ b/docs/build/html/python/_autosummary/mlx.core.eval.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.eval &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.eval &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Transforms" href="../transforms.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.exp.html b/docs/build/html/python/_autosummary/mlx.core.exp.html
index 7202e7e82..354d179d7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.exp.html
+++ b/docs/build/html/python/_autosummary/mlx.core.exp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.exp &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.exp &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.erfinv" href="mlx.core.erfinv.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.expand_dims.html b/docs/build/html/python/_autosummary/mlx.core.expand_dims.html
index fe8fbd46a..8575dd099 100644
--- a/docs/build/html/python/_autosummary/mlx.core.expand_dims.html
+++ b/docs/build/html/python/_autosummary/mlx.core.expand_dims.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.expand_dims &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.expand_dims &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.expm1" href="mlx.core.expm1.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.expm1.html b/docs/build/html/python/_autosummary/mlx.core.expm1.html
index 4e0a97c51..b5975ef0f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.expm1.html
+++ b/docs/build/html/python/_autosummary/mlx.core.expm1.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.expm1 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.expm1 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.exp" href="mlx.core.exp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.eye.html b/docs/build/html/python/_autosummary/mlx.core.eye.html
index 73f41fe93..2ab1d703e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.eye.html
+++ b/docs/build/html/python/_autosummary/mlx.core.eye.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.eye &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.eye &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.expand_dims" href="mlx.core.expand_dims.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fast.affine_quantize.html b/docs/build/html/python/_autosummary/mlx.core.fast.affine_quantize.html
deleted file mode 100644
index 57b56fb68..000000000
--- a/docs/build/html/python/_autosummary/mlx.core.fast.affine_quantize.html
+++ /dev/null
@@ -1,1001 +0,0 @@
-
-<!DOCTYPE html>
-
-
-<html lang="en" data-content_root="../../" >
-
-  <head>
-    <meta charset="utf-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
-
-    <title>mlx.core.fast.affine_quantize &#8212; MLX 0.20.0 documentation</title>
-  
-  
-  
-  <script data-cfasync="false">
-    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
-    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
-  </script>
-  <!-- 
-    this give us a css class that will be invisible only if js is disabled 
-  -->
-  <noscript>
-    <style>
-      .pst-js-only { display: none !important; }
-
-    </style>
-  </noscript>
-  
-  <!-- Loaded before other Sphinx assets -->
-  <link href="../../_static/styles/theme.css?digest=26a4bc78f4c0ddb94549" rel="stylesheet" />
-<link href="../../_static/styles/pydata-sphinx-theme.css?digest=26a4bc78f4c0ddb94549" rel="stylesheet" />
-
-    <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=fa44fd50" />
-    <link rel="stylesheet" type="text/css" href="../../_static/styles/sphinx-book-theme.css?v=a3416100" />
-  
-  <!-- So that users can add custom icons -->
-  <script src="../../_static/scripts/fontawesome.js?digest=26a4bc78f4c0ddb94549"></script>
-  <!-- Pre-loaded scripts that we'll load fully later -->
-  <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
-<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
-
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
-    <script src="../../_static/doctools.js?v=9a2dae69"></script>
-    <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
-    <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
-    <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
-    <script>DOCUMENTATION_OPTIONS.pagename = 'python/_autosummary/mlx.core.fast.affine_quantize';</script>
-    <link rel="icon" href="../../_static/mlx_logo.png"/>
-    <link rel="index" title="Index" href="../../genindex.html" />
-    <link rel="search" title="Search" href="../../search.html" />
-    <link rel="next" title="mlx.core.fast.metal_kernel" href="mlx.core.fast.metal_kernel.html" />
-    <link rel="prev" title="mlx.core.fast.scaled_dot_product_attention" href="mlx.core.fast.scaled_dot_product_attention.html" />
-  <meta name="viewport" content="width=device-width, initial-scale=1"/>
-  <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
-  </head>
-  
-  
-  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
-
-  
-  
-  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
-  
-  <div id="pst-scroll-pixel-helper"></div>
-  
-  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
-    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
-
-  
-  <dialog id="pst-search-dialog">
-    
-<form class="bd-search d-flex align-items-center"
-      action="../../search.html"
-      method="get">
-  <i class="fa-solid fa-magnifying-glass"></i>
-  <input type="search"
-         class="form-control"
-         name="q"
-         placeholder="Search..."
-         aria-label="Search..."
-         autocomplete="off"
-         autocorrect="off"
-         autocapitalize="off"
-         spellcheck="false"/>
-  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
-</form>
-  </dialog>
-
-  <div class="pst-async-banner-revealer d-none">
-  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
-</div>
-
-  
-    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
-    </header>
-  
-
-  <div class="bd-container">
-    <div class="bd-container__inner bd-page-width">
-      
-      
-      
-      <dialog id="pst-primary-sidebar-modal"></dialog>
-      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
-        
-
-  
-  <div class="sidebar-header-items sidebar-primary__section">
-    
-    
-    
-    
-  </div>
-  
-    <div class="sidebar-primary-items__start sidebar-primary__section">
-        <div class="sidebar-primary-item">
-
-  
-    
-  
-
-<a class="navbar-brand logo" href="../../index.html">
-  
-  
-  
-  
-  
-    
-    
-      
-    
-    
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
-  
-  
-</a></div>
-        <div class="sidebar-primary-item">
-
-<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
- <i class="fa-solid fa-magnifying-glass"></i>
- <span class="search-button__default-text">Search</span>
- <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
-</button></div>
-        <div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
-    <div class="bd-toc-item navbar-nav active">
-        <p aria-level="2" class="caption" role="heading"><span class="caption-text">Install</span></p>
-<ul class="nav bd-sidenav">
-<li class="toctree-l1"><a class="reference internal" href="../../install.html">Build and Install</a></li>
-</ul>
-<p aria-level="2" class="caption" role="heading"><span class="caption-text">Usage</span></p>
-<ul class="nav bd-sidenav">
-<li class="toctree-l1"><a class="reference internal" href="../../usage/quick_start.html">Quick Start Guide</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/lazy_evaluation.html">Lazy Evaluation</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/unified_memory.html">Unified Memory</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/indexing.html">Indexing Arrays</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/saving_and_loading.html">Saving and Loading Arrays</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/function_transforms.html">Function Transforms</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/compile.html">Compilation</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/numpy.html">Conversion to NumPy and Other Frameworks</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/distributed.html">Distributed Communication</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../usage/using_streams.html">Using Streams</a></li>
-</ul>
-<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
-<ul class="nav bd-sidenav">
-<li class="toctree-l1"><a class="reference internal" href="../../examples/linear_regression.html">Linear Regression</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../examples/mlp.html">Multi-Layer Perceptron</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../examples/llama-inference.html">LLM inference</a></li>
-</ul>
-<p aria-level="2" class="caption" role="heading"><span class="caption-text">Python API Reference</span></p>
-<ul class="current nav bd-sidenav">
-<li class="toctree-l1 has-children"><a class="reference internal" href="../array.html">Array</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.html">mlx.core.array</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.astype.html">mlx.core.array.astype</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.at.html">mlx.core.array.at</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.item.html">mlx.core.array.item</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.tolist.html">mlx.core.array.tolist</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.dtype.html">mlx.core.array.dtype</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.itemsize.html">mlx.core.array.itemsize</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.nbytes.html">mlx.core.array.nbytes</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.ndim.html">mlx.core.array.ndim</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.shape.html">mlx.core.array.shape</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.size.html">mlx.core.array.size</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.abs.html">mlx.core.array.abs</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.all.html">mlx.core.array.all</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.any.html">mlx.core.array.any</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.argmax.html">mlx.core.array.argmax</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.argmin.html">mlx.core.array.argmin</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.conj.html">mlx.core.array.conj</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.cos.html">mlx.core.array.cos</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.cummax.html">mlx.core.array.cummax</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.cummin.html">mlx.core.array.cummin</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.cumprod.html">mlx.core.array.cumprod</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.cumsum.html">mlx.core.array.cumsum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.diag.html">mlx.core.array.diag</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.diagonal.html">mlx.core.array.diagonal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.exp.html">mlx.core.array.exp</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.flatten.html">mlx.core.array.flatten</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.log.html">mlx.core.array.log</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.log10.html">mlx.core.array.log10</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.log1p.html">mlx.core.array.log1p</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.log2.html">mlx.core.array.log2</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.logsumexp.html">mlx.core.array.logsumexp</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.max.html">mlx.core.array.max</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.mean.html">mlx.core.array.mean</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.min.html">mlx.core.array.min</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.moveaxis.html">mlx.core.array.moveaxis</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.prod.html">mlx.core.array.prod</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.reciprocal.html">mlx.core.array.reciprocal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.reshape.html">mlx.core.array.reshape</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.round.html">mlx.core.array.round</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.rsqrt.html">mlx.core.array.rsqrt</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.sin.html">mlx.core.array.sin</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.split.html">mlx.core.array.split</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.sqrt.html">mlx.core.array.sqrt</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.square.html">mlx.core.array.square</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.squeeze.html">mlx.core.array.squeeze</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.std.html">mlx.core.array.std</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.sum.html">mlx.core.array.sum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.swapaxes.html">mlx.core.array.swapaxes</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.transpose.html">mlx.core.array.transpose</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.T.html">mlx.core.array.T</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.var.html">mlx.core.array.var</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array.view.html">mlx.core.array.view</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../data_types.html">Data Types</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.Dtype.html">mlx.core.Dtype</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.DtypeCategory.html">mlx.core.DtypeCategory</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.issubdtype.html">mlx.core.issubdtype</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../devices_and_streams.html">Devices and Streams</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.Device.html">mlx.core.Device</a></li>
-<li class="toctree-l2"><a class="reference internal" href="stream_class.html">mlx.core.Stream</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.default_device.html">mlx.core.default_device</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.set_default_device.html">mlx.core.set_default_device</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.default_stream.html">mlx.core.default_stream</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.new_stream.html">mlx.core.new_stream</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.set_default_stream.html">mlx.core.set_default_stream</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.stream.html">mlx.core.stream</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.synchronize.html">mlx.core.synchronize</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../ops.html">Operations</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.abs.html">mlx.core.abs</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.add.html">mlx.core.add</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.addmm.html">mlx.core.addmm</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.all.html">mlx.core.all</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.allclose.html">mlx.core.allclose</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.any.html">mlx.core.any</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arange.html">mlx.core.arange</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arccos.html">mlx.core.arccos</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arccosh.html">mlx.core.arccosh</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arcsin.html">mlx.core.arcsin</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arcsinh.html">mlx.core.arcsinh</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arctan.html">mlx.core.arctan</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arctan2.html">mlx.core.arctan2</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.arctanh.html">mlx.core.arctanh</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.argmax.html">mlx.core.argmax</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.argmin.html">mlx.core.argmin</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.argpartition.html">mlx.core.argpartition</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.argsort.html">mlx.core.argsort</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.array_equal.html">mlx.core.array_equal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.as_strided.html">mlx.core.as_strided</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.atleast_1d.html">mlx.core.atleast_1d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.atleast_2d.html">mlx.core.atleast_2d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.atleast_3d.html">mlx.core.atleast_3d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.bitwise_and.html">mlx.core.bitwise_and</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.bitwise_or.html">mlx.core.bitwise_or</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.bitwise_xor.html">mlx.core.bitwise_xor</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.block_masked_mm.html">mlx.core.block_masked_mm</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.broadcast_to.html">mlx.core.broadcast_to</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.ceil.html">mlx.core.ceil</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.clip.html">mlx.core.clip</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.concatenate.html">mlx.core.concatenate</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conj.html">mlx.core.conj</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conjugate.html">mlx.core.conjugate</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.convolve.html">mlx.core.convolve</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conv1d.html">mlx.core.conv1d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conv2d.html">mlx.core.conv2d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conv3d.html">mlx.core.conv3d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conv_transpose1d.html">mlx.core.conv_transpose1d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conv_transpose2d.html">mlx.core.conv_transpose2d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conv_transpose3d.html">mlx.core.conv_transpose3d</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.conv_general.html">mlx.core.conv_general</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.cos.html">mlx.core.cos</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.cosh.html">mlx.core.cosh</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.cummax.html">mlx.core.cummax</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.cummin.html">mlx.core.cummin</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.cumprod.html">mlx.core.cumprod</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.cumsum.html">mlx.core.cumsum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.degrees.html">mlx.core.degrees</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.dequantize.html">mlx.core.dequantize</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.diag.html">mlx.core.diag</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.diagonal.html">mlx.core.diagonal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.divide.html">mlx.core.divide</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.divmod.html">mlx.core.divmod</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.einsum.html">mlx.core.einsum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.einsum_path.html">mlx.core.einsum_path</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.equal.html">mlx.core.equal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.erf.html">mlx.core.erf</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.erfinv.html">mlx.core.erfinv</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.exp.html">mlx.core.exp</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.expm1.html">mlx.core.expm1</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.expand_dims.html">mlx.core.expand_dims</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.eye.html">mlx.core.eye</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.flatten.html">mlx.core.flatten</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.floor.html">mlx.core.floor</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.floor_divide.html">mlx.core.floor_divide</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.full.html">mlx.core.full</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.gather_mm.html">mlx.core.gather_mm</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.gather_qmm.html">mlx.core.gather_qmm</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.greater.html">mlx.core.greater</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.greater_equal.html">mlx.core.greater_equal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.hadamard_transform.html">mlx.core.hadamard_transform</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.identity.html">mlx.core.identity</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.imag.html">mlx.core.imag</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.inner.html">mlx.core.inner</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.isfinite.html">mlx.core.isfinite</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.isclose.html">mlx.core.isclose</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.isinf.html">mlx.core.isinf</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.isnan.html">mlx.core.isnan</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.isneginf.html">mlx.core.isneginf</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.isposinf.html">mlx.core.isposinf</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.issubdtype.html">mlx.core.issubdtype</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.left_shift.html">mlx.core.left_shift</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.less.html">mlx.core.less</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.less_equal.html">mlx.core.less_equal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linspace.html">mlx.core.linspace</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.load.html">mlx.core.load</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.log.html">mlx.core.log</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.log2.html">mlx.core.log2</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.log10.html">mlx.core.log10</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.log1p.html">mlx.core.log1p</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.logaddexp.html">mlx.core.logaddexp</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.logical_not.html">mlx.core.logical_not</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.logical_and.html">mlx.core.logical_and</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.logical_or.html">mlx.core.logical_or</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.logsumexp.html">mlx.core.logsumexp</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.matmul.html">mlx.core.matmul</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.max.html">mlx.core.max</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.maximum.html">mlx.core.maximum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.mean.html">mlx.core.mean</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.meshgrid.html">mlx.core.meshgrid</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.min.html">mlx.core.min</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.minimum.html">mlx.core.minimum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.moveaxis.html">mlx.core.moveaxis</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.multiply.html">mlx.core.multiply</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.nan_to_num.html">mlx.core.nan_to_num</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.negative.html">mlx.core.negative</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.not_equal.html">mlx.core.not_equal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.ones.html">mlx.core.ones</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.ones_like.html">mlx.core.ones_like</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.outer.html">mlx.core.outer</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.partition.html">mlx.core.partition</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.pad.html">mlx.core.pad</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.power.html">mlx.core.power</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.prod.html">mlx.core.prod</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.put_along_axis.html">mlx.core.put_along_axis</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.quantize.html">mlx.core.quantize</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.quantized_matmul.html">mlx.core.quantized_matmul</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.radians.html">mlx.core.radians</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.real.html">mlx.core.real</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.reciprocal.html">mlx.core.reciprocal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.remainder.html">mlx.core.remainder</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.repeat.html">mlx.core.repeat</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.reshape.html">mlx.core.reshape</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.right_shift.html">mlx.core.right_shift</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.roll.html">mlx.core.roll</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.round.html">mlx.core.round</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.rsqrt.html">mlx.core.rsqrt</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.save.html">mlx.core.save</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.savez.html">mlx.core.savez</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.savez_compressed.html">mlx.core.savez_compressed</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.save_gguf.html">mlx.core.save_gguf</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.save_safetensors.html">mlx.core.save_safetensors</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.sigmoid.html">mlx.core.sigmoid</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.sign.html">mlx.core.sign</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.sin.html">mlx.core.sin</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.sinh.html">mlx.core.sinh</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.softmax.html">mlx.core.softmax</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.sort.html">mlx.core.sort</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.split.html">mlx.core.split</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.sqrt.html">mlx.core.sqrt</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.square.html">mlx.core.square</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.squeeze.html">mlx.core.squeeze</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.stack.html">mlx.core.stack</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.std.html">mlx.core.std</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.stop_gradient.html">mlx.core.stop_gradient</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.subtract.html">mlx.core.subtract</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.sum.html">mlx.core.sum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.swapaxes.html">mlx.core.swapaxes</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.take.html">mlx.core.take</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.take_along_axis.html">mlx.core.take_along_axis</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.tan.html">mlx.core.tan</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.tanh.html">mlx.core.tanh</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.tensordot.html">mlx.core.tensordot</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.tile.html">mlx.core.tile</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.topk.html">mlx.core.topk</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.trace.html">mlx.core.trace</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.transpose.html">mlx.core.transpose</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.tri.html">mlx.core.tri</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.tril.html">mlx.core.tril</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.triu.html">mlx.core.triu</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.var.html">mlx.core.var</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.view.html">mlx.core.view</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.where.html">mlx.core.where</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.zeros.html">mlx.core.zeros</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.zeros_like.html">mlx.core.zeros_like</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../random.html">Random</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.bernoulli.html">mlx.core.random.bernoulli</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.categorical.html">mlx.core.random.categorical</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.gumbel.html">mlx.core.random.gumbel</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.key.html">mlx.core.random.key</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.normal.html">mlx.core.random.normal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.multivariate_normal.html">mlx.core.random.multivariate_normal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.randint.html">mlx.core.random.randint</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.seed.html">mlx.core.random.seed</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.split.html">mlx.core.random.split</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.truncated_normal.html">mlx.core.random.truncated_normal</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.uniform.html">mlx.core.random.uniform</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.laplace.html">mlx.core.random.laplace</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.random.permutation.html">mlx.core.random.permutation</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../transforms.html">Transforms</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.eval.html">mlx.core.eval</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.compile.html">mlx.core.compile</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.custom_function.html">mlx.core.custom_function</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.disable_compile.html">mlx.core.disable_compile</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.enable_compile.html">mlx.core.enable_compile</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.grad.html">mlx.core.grad</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.value_and_grad.html">mlx.core.value_and_grad</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.jvp.html">mlx.core.jvp</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.vjp.html">mlx.core.vjp</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.vmap.html">mlx.core.vmap</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 current active has-children"><a class="reference internal" href="../fast.html">Fast</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rms_norm.html">mlx.core.fast.rms_norm</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2 current active"><a class="current reference internal" href="#">mlx.core.fast.affine_quantize</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../fft.html">FFT</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.fft.html">mlx.core.fft.fft</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.ifft.html">mlx.core.fft.ifft</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.fft2.html">mlx.core.fft.fft2</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.ifft2.html">mlx.core.fft.ifft2</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.fftn.html">mlx.core.fft.fftn</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.ifftn.html">mlx.core.fft.ifftn</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.rfft.html">mlx.core.fft.rfft</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.irfft.html">mlx.core.fft.irfft</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.rfft2.html">mlx.core.fft.rfft2</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.irfft2.html">mlx.core.fft.irfft2</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.rfftn.html">mlx.core.fft.rfftn</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fft.irfftn.html">mlx.core.fft.irfftn</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../linalg.html">Linear Algebra</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.inv.html">mlx.core.linalg.inv</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.tri_inv.html">mlx.core.linalg.tri_inv</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.norm.html">mlx.core.linalg.norm</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.cholesky.html">mlx.core.linalg.cholesky</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.cholesky_inv.html">mlx.core.linalg.cholesky_inv</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.cross.html">mlx.core.linalg.cross</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.qr.html">mlx.core.linalg.qr</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.svd.html">mlx.core.linalg.svd</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.eigvalsh.html">mlx.core.linalg.eigvalsh</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.linalg.eigh.html">mlx.core.linalg.eigh</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../metal.html">Metal</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.is_available.html">mlx.core.metal.is_available</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.device_info.html">mlx.core.metal.device_info</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.get_active_memory.html">mlx.core.metal.get_active_memory</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.get_peak_memory.html">mlx.core.metal.get_peak_memory</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.reset_peak_memory.html">mlx.core.metal.reset_peak_memory</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.get_cache_memory.html">mlx.core.metal.get_cache_memory</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.set_memory_limit.html">mlx.core.metal.set_memory_limit</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.set_cache_limit.html">mlx.core.metal.set_cache_limit</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.set_wired_limit.html">mlx.core.metal.set_wired_limit</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.clear_cache.html">mlx.core.metal.clear_cache</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.start_capture.html">mlx.core.metal.start_capture</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.metal.stop_capture.html">mlx.core.metal.stop_capture</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../nn.html">Neural Networks</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.nn.value_and_grad.html">mlx.nn.value_and_grad</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.nn.quantize.html">mlx.nn.quantize</a></li>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../nn/module.html">Module</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.training.html">mlx.nn.Module.training</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.state.html">mlx.nn.Module.state</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.apply.html">mlx.nn.Module.apply</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.apply_to_modules.html">mlx.nn.Module.apply_to_modules</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.children.html">mlx.nn.Module.children</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.eval.html">mlx.nn.Module.eval</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.filter_and_map.html">mlx.nn.Module.filter_and_map</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.freeze.html">mlx.nn.Module.freeze</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.leaf_modules.html">mlx.nn.Module.leaf_modules</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.load_weights.html">mlx.nn.Module.load_weights</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.modules.html">mlx.nn.Module.modules</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.named_modules.html">mlx.nn.Module.named_modules</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.parameters.html">mlx.nn.Module.parameters</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.save_weights.html">mlx.nn.Module.save_weights</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.set_dtype.html">mlx.nn.Module.set_dtype</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.train.html">mlx.nn.Module.train</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.trainable_parameters.html">mlx.nn.Module.trainable_parameters</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.unfreeze.html">mlx.nn.Module.unfreeze</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.update.html">mlx.nn.Module.update</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Module.update_modules.html">mlx.nn.Module.update_modules</a></li>
-</ul>
-</details></li>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../nn/layers.html">Layers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv2d.html">mlx.nn.Conv2d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv3d.html">mlx.nn.Conv3d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ConvTranspose1d.html">mlx.nn.ConvTranspose1d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ConvTranspose2d.html">mlx.nn.ConvTranspose2d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ConvTranspose3d.html">mlx.nn.ConvTranspose3d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Dropout.html">mlx.nn.Dropout</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Dropout2d.html">mlx.nn.Dropout2d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Dropout3d.html">mlx.nn.Dropout3d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Embedding.html">mlx.nn.Embedding</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ELU.html">mlx.nn.ELU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.GELU.html">mlx.nn.GELU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.GLU.html">mlx.nn.GLU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.GroupNorm.html">mlx.nn.GroupNorm</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.GRU.html">mlx.nn.GRU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.HardShrink.html">mlx.nn.HardShrink</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.HardTanh.html">mlx.nn.HardTanh</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Hardswish.html">mlx.nn.Hardswish</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.InstanceNorm.html">mlx.nn.InstanceNorm</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LayerNorm.html">mlx.nn.LayerNorm</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LeakyReLU.html">mlx.nn.LeakyReLU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Linear.html">mlx.nn.Linear</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LogSigmoid.html">mlx.nn.LogSigmoid</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LogSoftmax.html">mlx.nn.LogSoftmax</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.QuantizedEmbedding.html">mlx.nn.QuantizedEmbedding</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.QuantizedLinear.html">mlx.nn.QuantizedLinear</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.RMSNorm.html">mlx.nn.RMSNorm</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ReLU.html">mlx.nn.ReLU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ReLU6.html">mlx.nn.ReLU6</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.RNN.html">mlx.nn.RNN</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.RoPE.html">mlx.nn.RoPE</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.SELU.html">mlx.nn.SELU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Sequential.html">mlx.nn.Sequential</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Sigmoid.html">mlx.nn.Sigmoid</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.SiLU.html">mlx.nn.SiLU</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding.html">mlx.nn.SinusoidalPositionalEncoding</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Softmin.html">mlx.nn.Softmin</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Softshrink.html">mlx.nn.Softshrink</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Softsign.html">mlx.nn.Softsign</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Softmax.html">mlx.nn.Softmax</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Softplus.html">mlx.nn.Softplus</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Step.html">mlx.nn.Step</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Tanh.html">mlx.nn.Tanh</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Transformer.html">mlx.nn.Transformer</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Upsample.html">mlx.nn.Upsample</a></li>
-</ul>
-</details></li>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../nn/functions.html">Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.elu.html">mlx.nn.elu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.celu.html">mlx.nn.celu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.gelu.html">mlx.nn.gelu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.gelu_approx.html">mlx.nn.gelu_approx</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.gelu_fast_approx.html">mlx.nn.gelu_fast_approx</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.glu.html">mlx.nn.glu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.hard_shrink.html">mlx.nn.hard_shrink</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.hard_tanh.html">mlx.nn.hard_tanh</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.hardswish.html">mlx.nn.hardswish</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.leaky_relu.html">mlx.nn.leaky_relu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.log_sigmoid.html">mlx.nn.log_sigmoid</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.log_softmax.html">mlx.nn.log_softmax</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.mish.html">mlx.nn.mish</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.prelu.html">mlx.nn.prelu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.relu.html">mlx.nn.relu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.relu6.html">mlx.nn.relu6</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.selu.html">mlx.nn.selu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.sigmoid.html">mlx.nn.sigmoid</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.silu.html">mlx.nn.silu</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.softmax.html">mlx.nn.softmax</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.softmin.html">mlx.nn.softmin</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.softplus.html">mlx.nn.softplus</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.softshrink.html">mlx.nn.softshrink</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.step.html">mlx.nn.step</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.tanh.html">mlx.nn.tanh</a></li>
-</ul>
-</details></li>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../nn/losses.html">Loss Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy.html">mlx.nn.losses.binary_cross_entropy</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html">mlx.nn.losses.cosine_similarity_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.cross_entropy.html">mlx.nn.losses.cross_entropy</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.html">mlx.nn.losses.gaussian_nll_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.hinge_loss.html">mlx.nn.losses.hinge_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.huber_loss.html">mlx.nn.losses.huber_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.kl_div_loss.html">mlx.nn.losses.kl_div_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.l1_loss.html">mlx.nn.losses.l1_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss.html">mlx.nn.losses.log_cosh_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss.html">mlx.nn.losses.margin_ranking_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.mse_loss.html">mlx.nn.losses.mse_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.nll_loss.html">mlx.nn.losses.nll_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss.html">mlx.nn.losses.smooth_l1_loss</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary_functions/mlx.nn.losses.triplet_loss.html">mlx.nn.losses.triplet_loss</a></li>
-</ul>
-</details></li>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../nn/init.html">Initializers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.constant.html">mlx.nn.init.constant</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.normal.html">mlx.nn.init.normal</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.uniform.html">mlx.nn.init.uniform</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.identity.html">mlx.nn.init.identity</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.glorot_normal.html">mlx.nn.init.glorot_normal</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.glorot_uniform.html">mlx.nn.init.glorot_uniform</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.he_normal.html">mlx.nn.init.he_normal</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.init.he_uniform.html">mlx.nn.init.he_uniform</a></li>
-</ul>
-</details></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../optimizers.html">Optimizers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../optimizers/optimizer.html">Optimizer</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Optimizer.state.html">mlx.optimizers.Optimizer.state</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.html">mlx.optimizers.Optimizer.apply_gradients</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Optimizer.init.html">mlx.optimizers.Optimizer.init</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Optimizer.update.html">mlx.optimizers.Optimizer.update</a></li>
-</ul>
-</details></li>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../optimizers/common_optimizers.html">Common Optimizers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.SGD.html">mlx.optimizers.SGD</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.RMSprop.html">mlx.optimizers.RMSprop</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Adagrad.html">mlx.optimizers.Adagrad</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Adafactor.html">mlx.optimizers.Adafactor</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.AdaDelta.html">mlx.optimizers.AdaDelta</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Adam.html">mlx.optimizers.Adam</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.AdamW.html">mlx.optimizers.AdamW</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Adamax.html">mlx.optimizers.Adamax</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.Lion.html">mlx.optimizers.Lion</a></li>
-</ul>
-</details></li>
-<li class="toctree-l2 has-children"><a class="reference internal" href="../optimizers/schedulers.html">Schedulers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.cosine_decay.html">mlx.optimizers.cosine_decay</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.exponential_decay.html">mlx.optimizers.exponential_decay</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.join_schedules.html">mlx.optimizers.join_schedules</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.linear_schedule.html">mlx.optimizers.linear_schedule</a></li>
-<li class="toctree-l3"><a class="reference internal" href="../optimizers/_autosummary/mlx.optimizers.step_decay.html">mlx.optimizers.step_decay</a></li>
-</ul>
-</details></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.optimizers.clip_grad_norm.html">mlx.optimizers.clip_grad_norm</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../distributed.html">Distributed Communication</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.Group.html">mlx.core.distributed.Group</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.is_available.html">mlx.core.distributed.is_available</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.init.html">mlx.core.distributed.init</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.all_sum.html">mlx.core.distributed.all_sum</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.all_gather.html">mlx.core.distributed.all_gather</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.send.html">mlx.core.distributed.send</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.recv.html">mlx.core.distributed.recv</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.distributed.recv_like.html">mlx.core.distributed.recv_like</a></li>
-</ul>
-</details></li>
-<li class="toctree-l1 has-children"><a class="reference internal" href="../tree_utils.html">Tree Utils</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
-<li class="toctree-l2"><a class="reference internal" href="mlx.utils.tree_flatten.html">mlx.utils.tree_flatten</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.utils.tree_unflatten.html">mlx.utils.tree_unflatten</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.utils.tree_map.html">mlx.utils.tree_map</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.utils.tree_map_with_path.html">mlx.utils.tree_map_with_path</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.utils.tree_reduce.html">mlx.utils.tree_reduce</a></li>
-</ul>
-</details></li>
-</ul>
-<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API Reference</span></p>
-<ul class="nav bd-sidenav">
-<li class="toctree-l1"><a class="reference internal" href="../../cpp/ops.html">Operations</a></li>
-</ul>
-<p aria-level="2" class="caption" role="heading"><span class="caption-text">Further Reading</span></p>
-<ul class="nav bd-sidenav">
-<li class="toctree-l1"><a class="reference internal" href="../../dev/extensions.html">Custom Extensions in MLX</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../dev/metal_debugger.html">Metal Debugger</a></li>
-<li class="toctree-l1"><a class="reference internal" href="../../dev/custom_metal_kernels.html">Custom Metal Kernels</a></li>
-</ul>
-
-    </div>
-</nav></div>
-    </div>
-  
-  
-  <div class="sidebar-primary-items__end sidebar-primary__section">
-  </div>
-  
-  <div id="rtd-footer-container"></div>
-
-
-      </div>
-      
-      <main id="main-content" class="bd-main" role="main">
-        
-        
-
-<div class="sbt-scroll-pixel-helper"></div>
-
-          <div class="bd-content">
-            <div class="bd-article-container">
-              
-              <div class="bd-header-article d-print-none">
-<div class="header-article-items header-article__inner">
-  
-    <div class="header-article-items__start">
-      
-        <div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
-  <span class="fa-solid fa-bars"></span>
-</button></div>
-      
-    </div>
-  
-  
-    <div class="header-article-items__end">
-      
-        <div class="header-article-item">
-
-<div class="article-header-buttons">
-
-
-<a href="https://github.com/ml-explore/mlx" target="_blank"
-   class="btn btn-sm btn-source-repository-button"
-   title="Source repository"
-   data-bs-placement="bottom" data-bs-toggle="tooltip"
->
-  
-
-<span class="btn__icon-container">
-  <i class="fab fa-github"></i>
-  </span>
-
-</a>
-
-
-
-
-
-
-<div class="dropdown dropdown-download-buttons">
-  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
-    <i class="fas fa-download"></i>
-  </button>
-  <ul class="dropdown-menu">
-      
-      
-      
-      <li><a href="../../_sources/python/_autosummary/mlx.core.fast.affine_quantize.rst" target="_blank"
-   class="btn btn-sm btn-download-source-button dropdown-item"
-   title="Download source file"
-   data-bs-placement="left" data-bs-toggle="tooltip"
->
-  
-
-<span class="btn__icon-container">
-  <i class="fas fa-file"></i>
-  </span>
-<span class="btn__text-container">.rst</span>
-</a>
-</li>
-      
-      
-      
-      
-      <li>
-<button onclick="window.print()"
-  class="btn btn-sm btn-download-pdf-button dropdown-item"
-  title="Print to PDF"
-  data-bs-placement="left" data-bs-toggle="tooltip"
->
-  
-
-<span class="btn__icon-container">
-  <i class="fas fa-file-pdf"></i>
-  </span>
-<span class="btn__text-container">.pdf</span>
-</button>
-</li>
-      
-  </ul>
-</div>
-
-
-
-
-<button onclick="toggleFullScreen()"
-  class="btn btn-sm btn-fullscreen-button"
-  title="Fullscreen mode"
-  data-bs-placement="bottom" data-bs-toggle="tooltip"
->
-  
-
-<span class="btn__icon-container">
-  <i class="fas fa-expand"></i>
-  </span>
-
-</button>
-
-
-
-<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
-  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
-  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
-  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
-</button>
-
-
-<button class="btn btn-sm pst-navbar-icon search-button search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
-    <i class="fa-solid fa-magnifying-glass fa-lg"></i>
-</button>
-<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
-    <span class="fa-solid fa-list"></span>
-</button>
-</div></div>
-      
-    </div>
-  
-</div>
-</div>
-              
-              
-
-<div id="jb-print-docs-body" class="onlyprint">
-    <h1>mlx.core.fast.affine_quantize</h1>
-    <!-- Table of contents -->
-    <div id="print-main-content">
-        <div id="jb-print-toc">
-            
-            <div>
-                <h2> Contents </h2>
-            </div>
-            <nav aria-label="Page">
-                <ul class="visible nav section-nav flex-column">
-<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mlx.core.fast.affine_quantize"><code class="docutils literal notranslate"><span class="pre">affine_quantize()</span></code></a></li>
-</ul>
-            </nav>
-        </div>
-    </div>
-</div>
-
-              
-                
-<div id="searchbox"></div>
-                <article class="bd-article">
-                  
-  <section id="mlx-core-fast-affine-quantize">
-<h1>mlx.core.fast.affine_quantize<a class="headerlink" href="#mlx-core-fast-affine-quantize" title="Link to this heading">#</a></h1>
-<dl class="py function">
-<dt class="sig sig-object py" id="mlx.core.fast.affine_quantize">
-<span class="sig-name descname"><span class="pre">affine_quantize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">w</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><span class="pre">array</span></a></span></em>, <em class="sig-param"><span class="o"><span class="pre">/</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">scales</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><span class="pre">array</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">biases</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><span class="pre">array</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">64</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span 
class="w"> </span><span class="default_value"><span class="pre">4</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stream</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference internal" href="stream_class.html#mlx.core.Stream" title="mlx.core.Stream"><span class="pre">Stream</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference internal" href="mlx.core.Device.html#mlx.core.Device" title="mlx.core.Device"><span class="pre">Device</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><span class="pre">array</span></a></span></span><a class="headerlink" href="#mlx.core.fast.affine_quantize" title="Link to this definition">#</a></dt>
-<dd><p>Quantize the matrix <code class="docutils literal notranslate"><span class="pre">w</span></code> using the provided <code class="docutils literal notranslate"><span class="pre">scales</span></code> and
-<code class="docutils literal notranslate"><span class="pre">biases</span></code> and the <code class="docutils literal notranslate"><span class="pre">group_size</span></code> and <code class="docutils literal notranslate"><span class="pre">bits</span></code> configuration.</p>
-<p>Formally, given the notation in <code class="xref py py-func docutils literal notranslate"><span class="pre">quantize()</span></code>, we compute
-<span class="math notranslate nohighlight">\(w_i\)</span> from <span class="math notranslate nohighlight">\(\hat{w_i}\)</span> and corresponding <span class="math notranslate nohighlight">\(s\)</span> and
-<span class="math notranslate nohighlight">\(\beta\)</span> as follows</p>
-<div class="math notranslate nohighlight">
-\[w_i = s (\hat{w_i} + \beta)\]</div>
-<dl class="field-list simple">
-<dt class="field-odd">Parameters<span class="colon">:</span></dt>
-<dd class="field-odd"><ul class="simple">
-<li><p><strong>w</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – Matrix to be quantize</p></li>
-<li><p><strong>scales</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – The scales to use per <code class="docutils literal notranslate"><span class="pre">group_size</span></code> elements of <code class="docutils literal notranslate"><span class="pre">w</span></code></p></li>
-<li><p><strong>biases</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – The biases to use per <code class="docutils literal notranslate"><span class="pre">group_size</span></code> elements of <code class="docutils literal notranslate"><span class="pre">w</span></code></p></li>
-<li><p><strong>group_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – The size of the group in <code class="docutils literal notranslate"><span class="pre">w</span></code> that shares a
-scale and bias. (default: <code class="docutils literal notranslate"><span class="pre">64</span></code>)</p></li>
-<li><p><strong>bits</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – The number of bits occupied by each element in
-<code class="docutils literal notranslate"><span class="pre">w</span></code>. (default: <code class="docutils literal notranslate"><span class="pre">4</span></code>)</p></li>
-</ul>
-</dd>
-<dt class="field-even">Returns<span class="colon">:</span></dt>
-<dd class="field-even"><p>The quantized version of <code class="docutils literal notranslate"><span class="pre">w</span></code></p>
-</dd>
-<dt class="field-odd">Return type<span class="colon">:</span></dt>
-<dd class="field-odd"><p><a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a></p>
-</dd>
-</dl>
-</dd></dl>
-
-</section>
-
-
-                </article>
-              
-
-              
-              
-              
-              
-                <footer class="prev-next-footer d-print-none">
-                  
-<div class="prev-next-area">
-    <a class="left-prev"
-       href="mlx.core.fast.scaled_dot_product_attention.html"
-       title="previous page">
-      <i class="fa-solid fa-angle-left"></i>
-      <div class="prev-next-info">
-        <p class="prev-next-subtitle">previous</p>
-        <p class="prev-next-title">mlx.core.fast.scaled_dot_product_attention</p>
-      </div>
-    </a>
-    <a class="right-next"
-       href="mlx.core.fast.metal_kernel.html"
-       title="next page">
-      <div class="prev-next-info">
-        <p class="prev-next-subtitle">next</p>
-        <p class="prev-next-title">mlx.core.fast.metal_kernel</p>
-      </div>
-      <i class="fa-solid fa-angle-right"></i>
-    </a>
-</div>
-                </footer>
-              
-            </div>
-            
-            
-              
-                <dialog id="pst-secondary-sidebar-modal"></dialog>
-                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
-
-
-  <div class="sidebar-secondary-item">
-  <div class="page-toc tocsection onthispage">
-    <i class="fa-solid fa-list"></i> Contents
-  </div>
-  <nav class="bd-toc-nav page-toc">
-    <ul class="visible nav section-nav flex-column">
-<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mlx.core.fast.affine_quantize"><code class="docutils literal notranslate"><span class="pre">affine_quantize()</span></code></a></li>
-</ul>
-  </nav></div>
-
-</div></div>
-              
-            
-          </div>
-          <footer class="bd-footer-content">
-            
-<div class="bd-footer-content__inner container">
-  
-  <div class="footer-item">
-    
-<p class="component-author">
-By MLX Contributors
-</p>
-
-  </div>
-  
-  <div class="footer-item">
-    
-
-  <p class="copyright">
-    
-      © Copyright 2023, MLX Contributors.
-      <br/>
-    
-  </p>
-
-  </div>
-  
-  <div class="footer-item">
-    
-  </div>
-  
-  <div class="footer-item">
-    
-  </div>
-  
-</div>
-          </footer>
-        
-
-      </main>
-    </div>
-  </div>
-  
-  <!-- Scripts loaded after <body> so the DOM is not blocked -->
-  <script defer src="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549"></script>
-<script defer src="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549"></script>
-
-  <footer class="bd-footer">
-  </footer>
-  </body>
-</html>
\ No newline at end of file
diff --git a/docs/build/html/python/_autosummary/mlx.core.fast.layer_norm.html b/docs/build/html/python/_autosummary/mlx.core.fast.layer_norm.html
index 972875960..3f04d8c85 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fast.layer_norm.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fast.layer_norm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fast.layer_norm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fast.layer_norm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fast.rms_norm" href="mlx.core.fast.rms_norm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html b/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html
index 0a1c7d6cc..727c2f48e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fast.metal_kernel &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fast.metal_kernel &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -48,10 +48,10 @@
     <link rel="index" title="Index" href="../../genindex.html" />
     <link rel="search" title="Search" href="../../search.html" />
     <link rel="next" title="FFT" href="../fft.html" />
-    <link rel="prev" title="mlx.core.fast.affine_quantize" href="mlx.core.fast.affine_quantize.html" />
+    <link rel="prev" title="mlx.core.fast.scaled_dot_product_attention" href="mlx.core.fast.scaled_dot_product_attention.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -937,12 +938,12 @@ e.g. <code class="docutils literal notranslate"><span class="pre">device</span>
                   
 <div class="prev-next-area">
     <a class="left-prev"
-       href="mlx.core.fast.affine_quantize.html"
+       href="mlx.core.fast.scaled_dot_product_attention.html"
        title="previous page">
       <i class="fa-solid fa-angle-left"></i>
       <div class="prev-next-info">
         <p class="prev-next-subtitle">previous</p>
-        <p class="prev-next-title">mlx.core.fast.affine_quantize</p>
+        <p class="prev-next-title">mlx.core.fast.scaled_dot_product_attention</p>
       </div>
     </a>
     <a class="right-next"
diff --git a/docs/build/html/python/_autosummary/mlx.core.fast.rms_norm.html b/docs/build/html/python/_autosummary/mlx.core.fast.rms_norm.html
index 38c4f86b6..c1392fe47 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fast.rms_norm.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fast.rms_norm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fast.rms_norm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fast.rms_norm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Fast" href="../fast.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fast.rope.html b/docs/build/html/python/_autosummary/mlx.core.fast.rope.html
index 2c6c22d88..67698d35e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fast.rope.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fast.rope.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fast.rope &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fast.rope &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fast.layer_norm" href="mlx.core.fast.layer_norm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html b/docs/build/html/python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html
index 1a8ec936b..17c65c442 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fast.scaled_dot_product_attention &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fast.scaled_dot_product_attention &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -47,11 +47,11 @@
     <link rel="icon" href="../../_static/mlx_logo.png"/>
     <link rel="index" title="Index" href="../../genindex.html" />
     <link rel="search" title="Search" href="../../search.html" />
-    <link rel="next" title="mlx.core.fast.affine_quantize" href="mlx.core.fast.affine_quantize.html" />
+    <link rel="next" title="mlx.core.fast.metal_kernel" href="mlx.core.fast.metal_kernel.html" />
     <link rel="prev" title="mlx.core.fast.rope" href="mlx.core.fast.rope.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -877,14 +878,25 @@
 the input precision.</p>
 <p>Note: For Grouped Query Attention and Multi-Query Attention, the <code class="docutils literal notranslate"><span class="pre">k</span></code>
 and <code class="docutils literal notranslate"><span class="pre">v</span></code> inputs should not be pre-tiled to match <code class="docutils literal notranslate"><span class="pre">q</span></code>.</p>
+<p>In the following the dimensions are given by:</p>
+<ul class="simple">
+<li><p><code class="docutils literal notranslate"><span class="pre">B</span></code>: The batch size.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">N_q</span></code>: The number of query heads.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">N_kv</span></code>: The number of key and value heads.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">T_q</span></code>: The number of queries per example.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">T_kv</span></code>: The number of keys and values per example.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">D</span></code>: The per-head dimension.</p></li>
+</ul>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
-<li><p><strong>q</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – Input query array.</p></li>
-<li><p><strong>k</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – Input keys array.</p></li>
-<li><p><strong>v</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – Input values array.</p></li>
+<li><p><strong>q</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – Queries with shape <code class="docutils literal notranslate"><span class="pre">[B,</span> <span class="pre">N_q,</span> <span class="pre">T_q,</span> <span class="pre">D]</span></code>.</p></li>
+<li><p><strong>k</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – Keys with shape <code class="docutils literal notranslate"><span class="pre">[B,</span> <span class="pre">N_kv,</span> <span class="pre">T_kv,</span> <span class="pre">D]</span></code>.</p></li>
+<li><p><strong>v</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a>) – Values with shape <code class="docutils literal notranslate"><span class="pre">[B,</span> <span class="pre">N_kv,</span> <span class="pre">T_kv,</span> <span class="pre">D]</span></code>.</p></li>
 <li><p><strong>scale</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.13)"><em>float</em></a>) – Scale for queries (typically <code class="docutils literal notranslate"><span class="pre">1.0</span> <span class="pre">/</span> <span class="pre">sqrt(q.shape(-1)</span></code>)</p></li>
-<li><p><strong>mask</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a><em>, </em><em>optional</em>) – An additive mask to apply to the query-key scores.</p></li>
+<li><p><strong>mask</strong> (<a class="reference internal" href="mlx.core.array.html#mlx.core.array" title="mlx.core.array"><em>array</em></a><em>, </em><em>optional</em>) – An additive mask to apply to the query-key
+scores. The mask can have at most 4 dimensions and must be
+broadcast-compatible with the shape <code class="docutils literal notranslate"><span class="pre">[B,</span> <span class="pre">N,</span> <span class="pre">T_q,</span> <span class="pre">T_kv]</span></code>.</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns<span class="colon">:</span></dt>
@@ -919,11 +931,11 @@ and <code class="docutils literal notranslate"><span class="pre">v</span></code>
       </div>
     </a>
     <a class="right-next"
-       href="mlx.core.fast.affine_quantize.html"
+       href="mlx.core.fast.metal_kernel.html"
        title="next page">
       <div class="prev-next-info">
         <p class="prev-next-subtitle">next</p>
-        <p class="prev-next-title">mlx.core.fast.affine_quantize</p>
+        <p class="prev-next-title">mlx.core.fast.metal_kernel</p>
       </div>
       <i class="fa-solid fa-angle-right"></i>
     </a>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.fft.html b/docs/build/html/python/_autosummary/mlx.core.fft.fft.html
index 6fa043492..b040467ed 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.fft.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.fft.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.fft &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.fft &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="FFT" href="../fft.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.fft2.html b/docs/build/html/python/_autosummary/mlx.core.fft.fft2.html
index ab9b7a00c..3f91a21ef 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.fft2.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.fft2.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.fft2 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.fft2 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.ifft" href="mlx.core.fft.ifft.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.fftn.html b/docs/build/html/python/_autosummary/mlx.core.fft.fftn.html
index 41210f046..d165e2323 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.fftn.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.fftn.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.fftn &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.fftn &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.ifft2" href="mlx.core.fft.ifft2.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.ifft.html b/docs/build/html/python/_autosummary/mlx.core.fft.ifft.html
index 64718c95a..7e633dcca 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.ifft.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.ifft.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.ifft &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.ifft &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.fft" href="mlx.core.fft.fft.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.ifft2.html b/docs/build/html/python/_autosummary/mlx.core.fft.ifft2.html
index 77277593c..750a8a98e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.ifft2.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.ifft2.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.ifft2 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.ifft2 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.fft2" href="mlx.core.fft.fft2.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.ifftn.html b/docs/build/html/python/_autosummary/mlx.core.fft.ifftn.html
index 40868024a..f5e5b2f25 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.ifftn.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.ifftn.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.ifftn &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.ifftn &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.fftn" href="mlx.core.fft.fftn.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.irfft.html b/docs/build/html/python/_autosummary/mlx.core.fft.irfft.html
index d60e55610..dec9d88ae 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.irfft.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.irfft.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.irfft &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.irfft &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.rfft" href="mlx.core.fft.rfft.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.irfft2.html b/docs/build/html/python/_autosummary/mlx.core.fft.irfft2.html
index acba21bce..5e61afe7d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.irfft2.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.irfft2.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.irfft2 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.irfft2 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.rfft2" href="mlx.core.fft.rfft2.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.irfftn.html b/docs/build/html/python/_autosummary/mlx.core.fft.irfftn.html
index d1b14d382..c67bc7bc7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.irfftn.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.irfftn.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.irfftn &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.irfftn &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.rfftn" href="mlx.core.fft.rfftn.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.rfft.html b/docs/build/html/python/_autosummary/mlx.core.fft.rfft.html
index 9bd83216c..bbe18751d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.rfft.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.rfft.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.rfft &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.rfft &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.ifftn" href="mlx.core.fft.ifftn.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.rfft2.html b/docs/build/html/python/_autosummary/mlx.core.fft.rfft2.html
index 31781b966..83bb49c07 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.rfft2.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.rfft2.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.rfft2 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.rfft2 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.irfft" href="mlx.core.fft.irfft.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.fft.rfftn.html b/docs/build/html/python/_autosummary/mlx.core.fft.rfftn.html
index ea80c133d..fc752df0d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.fft.rfftn.html
+++ b/docs/build/html/python/_autosummary/mlx.core.fft.rfftn.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.fft.rfftn &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.fft.rfftn &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fft.irfft2" href="mlx.core.fft.irfft2.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.flatten.html b/docs/build/html/python/_autosummary/mlx.core.flatten.html
index ba05386b6..6a02a304d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.flatten.html
+++ b/docs/build/html/python/_autosummary/mlx.core.flatten.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.flatten &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.flatten &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.eye" href="mlx.core.eye.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.floor.html b/docs/build/html/python/_autosummary/mlx.core.floor.html
index f302dd5df..3133de92f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.floor.html
+++ b/docs/build/html/python/_autosummary/mlx.core.floor.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.floor &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.floor &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.flatten" href="mlx.core.flatten.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.floor_divide.html b/docs/build/html/python/_autosummary/mlx.core.floor_divide.html
index 4b9f0d16f..46f020f64 100644
--- a/docs/build/html/python/_autosummary/mlx.core.floor_divide.html
+++ b/docs/build/html/python/_autosummary/mlx.core.floor_divide.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.floor_divide &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.floor_divide &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.floor" href="mlx.core.floor.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.full.html b/docs/build/html/python/_autosummary/mlx.core.full.html
index a78f8bb7f..8c8b67d24 100644
--- a/docs/build/html/python/_autosummary/mlx.core.full.html
+++ b/docs/build/html/python/_autosummary/mlx.core.full.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.full &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.full &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.floor_divide" href="mlx.core.floor_divide.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.gather_mm.html b/docs/build/html/python/_autosummary/mlx.core.gather_mm.html
index 4ff9702c1..dbf6052b6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.gather_mm.html
+++ b/docs/build/html/python/_autosummary/mlx.core.gather_mm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.gather_mm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.gather_mm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.full" href="mlx.core.full.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.gather_qmm.html b/docs/build/html/python/_autosummary/mlx.core.gather_qmm.html
index 5f3e3cb3c..fe599fe23 100644
--- a/docs/build/html/python/_autosummary/mlx.core.gather_qmm.html
+++ b/docs/build/html/python/_autosummary/mlx.core.gather_qmm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.gather_qmm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.gather_qmm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.gather_mm" href="mlx.core.gather_mm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.grad.html b/docs/build/html/python/_autosummary/mlx.core.grad.html
index bff71576e..05b0eb694 100644
--- a/docs/build/html/python/_autosummary/mlx.core.grad.html
+++ b/docs/build/html/python/_autosummary/mlx.core.grad.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.grad &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.grad &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.enable_compile" href="mlx.core.enable_compile.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.greater.html b/docs/build/html/python/_autosummary/mlx.core.greater.html
index c132fd28a..0c9db7f9c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.greater.html
+++ b/docs/build/html/python/_autosummary/mlx.core.greater.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.greater &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.greater &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.gather_qmm" href="mlx.core.gather_qmm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.greater_equal.html b/docs/build/html/python/_autosummary/mlx.core.greater_equal.html
index 27afd96b1..c50e31dfb 100644
--- a/docs/build/html/python/_autosummary/mlx.core.greater_equal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.greater_equal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.greater_equal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.greater_equal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.greater" href="mlx.core.greater.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.hadamard_transform.html b/docs/build/html/python/_autosummary/mlx.core.hadamard_transform.html
index 8f9ef1fbe..a1190f54d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.hadamard_transform.html
+++ b/docs/build/html/python/_autosummary/mlx.core.hadamard_transform.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.hadamard_transform &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.hadamard_transform &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.greater_equal" href="mlx.core.greater_equal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.identity.html b/docs/build/html/python/_autosummary/mlx.core.identity.html
index c65c927d6..127234cfa 100644
--- a/docs/build/html/python/_autosummary/mlx.core.identity.html
+++ b/docs/build/html/python/_autosummary/mlx.core.identity.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.identity &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.identity &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.hadamard_transform" href="mlx.core.hadamard_transform.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.imag.html b/docs/build/html/python/_autosummary/mlx.core.imag.html
index b6556c078..56c8e67b1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.imag.html
+++ b/docs/build/html/python/_autosummary/mlx.core.imag.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.imag &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.imag &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.identity" href="mlx.core.identity.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.inner.html b/docs/build/html/python/_autosummary/mlx.core.inner.html
index 6bda4af5e..2371e2211 100644
--- a/docs/build/html/python/_autosummary/mlx.core.inner.html
+++ b/docs/build/html/python/_autosummary/mlx.core.inner.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.inner &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.inner &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.imag" href="mlx.core.imag.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.isclose.html b/docs/build/html/python/_autosummary/mlx.core.isclose.html
index 32a33bbad..50c41fb6c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.isclose.html
+++ b/docs/build/html/python/_autosummary/mlx.core.isclose.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.isclose &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.isclose &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.isfinite" href="mlx.core.isfinite.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.isfinite.html b/docs/build/html/python/_autosummary/mlx.core.isfinite.html
index c2351b15a..23ef27ef3 100644
--- a/docs/build/html/python/_autosummary/mlx.core.isfinite.html
+++ b/docs/build/html/python/_autosummary/mlx.core.isfinite.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.isfinite &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.isfinite &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.inner" href="mlx.core.inner.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.isinf.html b/docs/build/html/python/_autosummary/mlx.core.isinf.html
index 36d1bb9c8..db2d1cb8b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.isinf.html
+++ b/docs/build/html/python/_autosummary/mlx.core.isinf.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.isinf &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.isinf &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.isclose" href="mlx.core.isclose.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.isnan.html b/docs/build/html/python/_autosummary/mlx.core.isnan.html
index d0e4e03a5..78d6fec31 100644
--- a/docs/build/html/python/_autosummary/mlx.core.isnan.html
+++ b/docs/build/html/python/_autosummary/mlx.core.isnan.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.isnan &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.isnan &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.isinf" href="mlx.core.isinf.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.isneginf.html b/docs/build/html/python/_autosummary/mlx.core.isneginf.html
index 564482811..85528dca1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.isneginf.html
+++ b/docs/build/html/python/_autosummary/mlx.core.isneginf.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.isneginf &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.isneginf &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.isnan" href="mlx.core.isnan.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.isposinf.html b/docs/build/html/python/_autosummary/mlx.core.isposinf.html
index 34ab27294..6d366a24f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.isposinf.html
+++ b/docs/build/html/python/_autosummary/mlx.core.isposinf.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.isposinf &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.isposinf &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.isneginf" href="mlx.core.isneginf.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.issubdtype.html b/docs/build/html/python/_autosummary/mlx.core.issubdtype.html
index b922fcb81..29746498d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.issubdtype.html
+++ b/docs/build/html/python/_autosummary/mlx.core.issubdtype.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.issubdtype &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.issubdtype &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.DtypeCategory" href="mlx.core.DtypeCategory.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.jvp.html b/docs/build/html/python/_autosummary/mlx.core.jvp.html
index a8e672a49..ac833fa0d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.jvp.html
+++ b/docs/build/html/python/_autosummary/mlx.core.jvp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.jvp &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.jvp &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.value_and_grad" href="mlx.core.value_and_grad.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.left_shift.html b/docs/build/html/python/_autosummary/mlx.core.left_shift.html
index f08d839a5..4d44d35bc 100644
--- a/docs/build/html/python/_autosummary/mlx.core.left_shift.html
+++ b/docs/build/html/python/_autosummary/mlx.core.left_shift.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.left_shift &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.left_shift &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.isposinf" href="mlx.core.isposinf.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.less.html b/docs/build/html/python/_autosummary/mlx.core.less.html
index f9aac6dd6..9b0ab9018 100644
--- a/docs/build/html/python/_autosummary/mlx.core.less.html
+++ b/docs/build/html/python/_autosummary/mlx.core.less.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.less &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.less &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.left_shift" href="mlx.core.left_shift.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.less_equal.html b/docs/build/html/python/_autosummary/mlx.core.less_equal.html
index 8431edbe4..ea7e87236 100644
--- a/docs/build/html/python/_autosummary/mlx.core.less_equal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.less_equal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.less_equal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.less_equal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.less" href="mlx.core.less.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky.html b/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky.html
index 37e6c97e1..39d7bcba2 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.cholesky &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.cholesky &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.norm" href="mlx.core.linalg.norm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky_inv.html b/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky_inv.html
index 537472e28..6db52db0f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky_inv.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.cholesky_inv.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.cholesky_inv &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.cholesky_inv &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.linalg.cholesky" href="mlx.core.linalg.cholesky.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.cross.html b/docs/build/html/python/_autosummary/mlx.core.linalg.cross.html
index e60f70ffc..fbf8dd71e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.cross.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.cross.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.cross &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.cross &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.cholesky_inv" href="mlx.core.linalg.cholesky_inv.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.eigh.html b/docs/build/html/python/_autosummary/mlx.core.linalg.eigh.html
index 2f3d2c1dc..f3c935135 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.eigh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.eigh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.eigh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.eigh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.eigvalsh" href="mlx.core.linalg.eigvalsh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.eigvalsh.html b/docs/build/html/python/_autosummary/mlx.core.linalg.eigvalsh.html
index df988dc53..ff88174c8 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.eigvalsh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.eigvalsh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.eigvalsh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.eigvalsh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.svd" href="mlx.core.linalg.svd.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.inv.html b/docs/build/html/python/_autosummary/mlx.core.linalg.inv.html
index bad9dc47b..21107cab4 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.inv.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.inv.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.inv &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.inv &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Linear Algebra" href="../linalg.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.norm.html b/docs/build/html/python/_autosummary/mlx.core.linalg.norm.html
index d8b9beb43..bcf97ca91 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.norm.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.norm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.norm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.norm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.linalg.tri_inv" href="mlx.core.linalg.tri_inv.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.qr.html b/docs/build/html/python/_autosummary/mlx.core.linalg.qr.html
index 7c28021fa..bc9715c41 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.qr.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.qr.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.qr &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.qr &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.cross" href="mlx.core.linalg.cross.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.svd.html b/docs/build/html/python/_autosummary/mlx.core.linalg.svd.html
index 05d6ca0b8..d0b08c368 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.svd.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.svd.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.svd &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.svd &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.qr" href="mlx.core.linalg.qr.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linalg.tri_inv.html b/docs/build/html/python/_autosummary/mlx.core.linalg.tri_inv.html
index e0953762c..d570221b7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linalg.tri_inv.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linalg.tri_inv.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linalg.tri_inv &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linalg.tri_inv &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.inv" href="mlx.core.linalg.inv.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.linspace.html b/docs/build/html/python/_autosummary/mlx.core.linspace.html
index bcbc81f66..e777279c1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.linspace.html
+++ b/docs/build/html/python/_autosummary/mlx.core.linspace.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.linspace &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.linspace &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.less_equal" href="mlx.core.less_equal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.load.html b/docs/build/html/python/_autosummary/mlx.core.load.html
index 67dc7aed9..b8e6fe8a2 100644
--- a/docs/build/html/python/_autosummary/mlx.core.load.html
+++ b/docs/build/html/python/_autosummary/mlx.core.load.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.load &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.load &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linspace" href="mlx.core.linspace.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.log.html b/docs/build/html/python/_autosummary/mlx.core.log.html
index 218e28544..7da9fd9e3 100644
--- a/docs/build/html/python/_autosummary/mlx.core.log.html
+++ b/docs/build/html/python/_autosummary/mlx.core.log.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.log &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.log &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.load" href="mlx.core.load.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.log10.html b/docs/build/html/python/_autosummary/mlx.core.log10.html
index cccd93b4c..6b89cbd2b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.log10.html
+++ b/docs/build/html/python/_autosummary/mlx.core.log10.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.log10 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.log10 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.log2" href="mlx.core.log2.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.log1p.html b/docs/build/html/python/_autosummary/mlx.core.log1p.html
index 64124d759..7c4eaab8f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.log1p.html
+++ b/docs/build/html/python/_autosummary/mlx.core.log1p.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.log1p &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.log1p &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.log10" href="mlx.core.log10.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.log2.html b/docs/build/html/python/_autosummary/mlx.core.log2.html
index a0327394e..402aee173 100644
--- a/docs/build/html/python/_autosummary/mlx.core.log2.html
+++ b/docs/build/html/python/_autosummary/mlx.core.log2.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.log2 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.log2 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.log" href="mlx.core.log.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.logaddexp.html b/docs/build/html/python/_autosummary/mlx.core.logaddexp.html
index a12075db6..fa8d3693b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.logaddexp.html
+++ b/docs/build/html/python/_autosummary/mlx.core.logaddexp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.logaddexp &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.logaddexp &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.log1p" href="mlx.core.log1p.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.logical_and.html b/docs/build/html/python/_autosummary/mlx.core.logical_and.html
index a45bf29ab..b42d4c20f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.logical_and.html
+++ b/docs/build/html/python/_autosummary/mlx.core.logical_and.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.logical_and &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.logical_and &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.logical_not" href="mlx.core.logical_not.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.logical_not.html b/docs/build/html/python/_autosummary/mlx.core.logical_not.html
index e28a858d7..d7b68e15a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.logical_not.html
+++ b/docs/build/html/python/_autosummary/mlx.core.logical_not.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.logical_not &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.logical_not &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.logaddexp" href="mlx.core.logaddexp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.logical_or.html b/docs/build/html/python/_autosummary/mlx.core.logical_or.html
index 6dd3ead1d..7e3d12077 100644
--- a/docs/build/html/python/_autosummary/mlx.core.logical_or.html
+++ b/docs/build/html/python/_autosummary/mlx.core.logical_or.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.logical_or &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.logical_or &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.logical_and" href="mlx.core.logical_and.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.logsumexp.html b/docs/build/html/python/_autosummary/mlx.core.logsumexp.html
index 340ec9990..e366a1406 100644
--- a/docs/build/html/python/_autosummary/mlx.core.logsumexp.html
+++ b/docs/build/html/python/_autosummary/mlx.core.logsumexp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.logsumexp &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.logsumexp &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.logical_or" href="mlx.core.logical_or.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.matmul.html b/docs/build/html/python/_autosummary/mlx.core.matmul.html
index 30a2e8db9..a4d6cdcf5 100644
--- a/docs/build/html/python/_autosummary/mlx.core.matmul.html
+++ b/docs/build/html/python/_autosummary/mlx.core.matmul.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.matmul &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.matmul &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.logsumexp" href="mlx.core.logsumexp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.max.html b/docs/build/html/python/_autosummary/mlx.core.max.html
index e8e9ed95d..2d753fd73 100644
--- a/docs/build/html/python/_autosummary/mlx.core.max.html
+++ b/docs/build/html/python/_autosummary/mlx.core.max.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.max &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.max &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.matmul" href="mlx.core.matmul.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.maximum.html b/docs/build/html/python/_autosummary/mlx.core.maximum.html
index 94540aec4..905a56796 100644
--- a/docs/build/html/python/_autosummary/mlx.core.maximum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.maximum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.maximum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.maximum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.max" href="mlx.core.max.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.mean.html b/docs/build/html/python/_autosummary/mlx.core.mean.html
index d913aa888..407e1355f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.mean.html
+++ b/docs/build/html/python/_autosummary/mlx.core.mean.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.mean &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.mean &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.maximum" href="mlx.core.maximum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.meshgrid.html b/docs/build/html/python/_autosummary/mlx.core.meshgrid.html
index 8076eef3e..4f1b4bd51 100644
--- a/docs/build/html/python/_autosummary/mlx.core.meshgrid.html
+++ b/docs/build/html/python/_autosummary/mlx.core.meshgrid.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.meshgrid &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.meshgrid &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.mean" href="mlx.core.mean.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.clear_cache.html b/docs/build/html/python/_autosummary/mlx.core.metal.clear_cache.html
index 2bfe05a3f..61af65008 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.clear_cache.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.clear_cache.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.clear_cache &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.clear_cache &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.set_wired_limit" href="mlx.core.metal.set_wired_limit.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.device_info.html b/docs/build/html/python/_autosummary/mlx.core.metal.device_info.html
index 1cf6d14be..f902a2f72 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.device_info.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.device_info.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.device_info &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.device_info &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.is_available" href="mlx.core.metal.is_available.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.get_active_memory.html b/docs/build/html/python/_autosummary/mlx.core.metal.get_active_memory.html
index 6c0738148..d7df40722 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.get_active_memory.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.get_active_memory.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.get_active_memory &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.get_active_memory &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.device_info" href="mlx.core.metal.device_info.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.get_cache_memory.html b/docs/build/html/python/_autosummary/mlx.core.metal.get_cache_memory.html
index dba50670a..e862a4369 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.get_cache_memory.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.get_cache_memory.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.get_cache_memory &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.get_cache_memory &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.reset_peak_memory" href="mlx.core.metal.reset_peak_memory.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.get_peak_memory.html b/docs/build/html/python/_autosummary/mlx.core.metal.get_peak_memory.html
index c82a07515..405fc9ee8 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.get_peak_memory.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.get_peak_memory.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.get_peak_memory &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.get_peak_memory &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.get_active_memory" href="mlx.core.metal.get_active_memory.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.is_available.html b/docs/build/html/python/_autosummary/mlx.core.metal.is_available.html
index c3dd502ba..9f569225f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.is_available.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.is_available.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.is_available &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.is_available &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Metal" href="../metal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.reset_peak_memory.html b/docs/build/html/python/_autosummary/mlx.core.metal.reset_peak_memory.html
index c77c24104..08b8899e1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.reset_peak_memory.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.reset_peak_memory.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.reset_peak_memory &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.reset_peak_memory &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.get_peak_memory" href="mlx.core.metal.get_peak_memory.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.set_cache_limit.html b/docs/build/html/python/_autosummary/mlx.core.metal.set_cache_limit.html
index d74293c46..eceee3b82 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.set_cache_limit.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.set_cache_limit.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.set_cache_limit &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.set_cache_limit &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.set_memory_limit" href="mlx.core.metal.set_memory_limit.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.set_memory_limit.html b/docs/build/html/python/_autosummary/mlx.core.metal.set_memory_limit.html
index 0782b4043..b9274bd50 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.set_memory_limit.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.set_memory_limit.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.set_memory_limit &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.set_memory_limit &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.get_cache_memory" href="mlx.core.metal.get_cache_memory.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.set_wired_limit.html b/docs/build/html/python/_autosummary/mlx.core.metal.set_wired_limit.html
index 3caa647a2..a128c02c4 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.set_wired_limit.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.set_wired_limit.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.set_wired_limit &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.set_wired_limit &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.set_cache_limit" href="mlx.core.metal.set_cache_limit.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.start_capture.html b/docs/build/html/python/_autosummary/mlx.core.metal.start_capture.html
index bd9b77b87..92164abe9 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.start_capture.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.start_capture.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.start_capture &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.start_capture &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.clear_cache" href="mlx.core.metal.clear_cache.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.metal.stop_capture.html b/docs/build/html/python/_autosummary/mlx.core.metal.stop_capture.html
index b1d547efc..c6235717f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.metal.stop_capture.html
+++ b/docs/build/html/python/_autosummary/mlx.core.metal.stop_capture.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.metal.stop_capture &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.metal.stop_capture &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.metal.start_capture" href="mlx.core.metal.start_capture.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.min.html b/docs/build/html/python/_autosummary/mlx.core.min.html
index 299e2e57e..79ab3f5eb 100644
--- a/docs/build/html/python/_autosummary/mlx.core.min.html
+++ b/docs/build/html/python/_autosummary/mlx.core.min.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.min &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.min &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.meshgrid" href="mlx.core.meshgrid.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.minimum.html b/docs/build/html/python/_autosummary/mlx.core.minimum.html
index fac9d2969..d5c62cd7a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.minimum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.minimum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.minimum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.minimum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.min" href="mlx.core.min.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.moveaxis.html b/docs/build/html/python/_autosummary/mlx.core.moveaxis.html
index 27a98b8e8..752e47586 100644
--- a/docs/build/html/python/_autosummary/mlx.core.moveaxis.html
+++ b/docs/build/html/python/_autosummary/mlx.core.moveaxis.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.moveaxis &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.moveaxis &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.minimum" href="mlx.core.minimum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.multiply.html b/docs/build/html/python/_autosummary/mlx.core.multiply.html
index fd9d7c784..e5e047015 100644
--- a/docs/build/html/python/_autosummary/mlx.core.multiply.html
+++ b/docs/build/html/python/_autosummary/mlx.core.multiply.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.multiply &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.multiply &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.moveaxis" href="mlx.core.moveaxis.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.nan_to_num.html b/docs/build/html/python/_autosummary/mlx.core.nan_to_num.html
index 71fc64d2e..6aa383ae5 100644
--- a/docs/build/html/python/_autosummary/mlx.core.nan_to_num.html
+++ b/docs/build/html/python/_autosummary/mlx.core.nan_to_num.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.nan_to_num &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.nan_to_num &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.multiply" href="mlx.core.multiply.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.negative.html b/docs/build/html/python/_autosummary/mlx.core.negative.html
index bc47c1e90..604582f8a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.negative.html
+++ b/docs/build/html/python/_autosummary/mlx.core.negative.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.negative &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.negative &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.nan_to_num" href="mlx.core.nan_to_num.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.new_stream.html b/docs/build/html/python/_autosummary/mlx.core.new_stream.html
index 93c7b6ddb..6966b0650 100644
--- a/docs/build/html/python/_autosummary/mlx.core.new_stream.html
+++ b/docs/build/html/python/_autosummary/mlx.core.new_stream.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.new_stream &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.new_stream &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.default_stream" href="mlx.core.default_stream.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.not_equal.html b/docs/build/html/python/_autosummary/mlx.core.not_equal.html
index 5dc9f9548..34088ed97 100644
--- a/docs/build/html/python/_autosummary/mlx.core.not_equal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.not_equal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.not_equal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.not_equal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.negative" href="mlx.core.negative.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.ones.html b/docs/build/html/python/_autosummary/mlx.core.ones.html
index 130ac326f..6a636b60d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.ones.html
+++ b/docs/build/html/python/_autosummary/mlx.core.ones.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.ones &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.ones &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.not_equal" href="mlx.core.not_equal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.ones_like.html b/docs/build/html/python/_autosummary/mlx.core.ones_like.html
index e8fcda564..2d70f7ea8 100644
--- a/docs/build/html/python/_autosummary/mlx.core.ones_like.html
+++ b/docs/build/html/python/_autosummary/mlx.core.ones_like.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.ones_like &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.ones_like &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.ones" href="mlx.core.ones.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.outer.html b/docs/build/html/python/_autosummary/mlx.core.outer.html
index f64988f90..3b42da8af 100644
--- a/docs/build/html/python/_autosummary/mlx.core.outer.html
+++ b/docs/build/html/python/_autosummary/mlx.core.outer.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.outer &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.outer &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.ones_like" href="mlx.core.ones_like.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.pad.html b/docs/build/html/python/_autosummary/mlx.core.pad.html
index 6e6379f0f..257261da7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.pad.html
+++ b/docs/build/html/python/_autosummary/mlx.core.pad.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.pad &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.pad &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.partition" href="mlx.core.partition.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.partition.html b/docs/build/html/python/_autosummary/mlx.core.partition.html
index 0eb470c3a..f0d0bf52a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.partition.html
+++ b/docs/build/html/python/_autosummary/mlx.core.partition.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.partition &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.partition &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.outer" href="mlx.core.outer.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.power.html b/docs/build/html/python/_autosummary/mlx.core.power.html
index 1ab38085f..f1156b4f7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.power.html
+++ b/docs/build/html/python/_autosummary/mlx.core.power.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.power &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.power &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.pad" href="mlx.core.pad.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.prod.html b/docs/build/html/python/_autosummary/mlx.core.prod.html
index fde44364c..c58ee3680 100644
--- a/docs/build/html/python/_autosummary/mlx.core.prod.html
+++ b/docs/build/html/python/_autosummary/mlx.core.prod.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.prod &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.prod &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.power" href="mlx.core.power.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.put_along_axis.html b/docs/build/html/python/_autosummary/mlx.core.put_along_axis.html
index 8d9b5da93..1ea8c032f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.put_along_axis.html
+++ b/docs/build/html/python/_autosummary/mlx.core.put_along_axis.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.put_along_axis &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.put_along_axis &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.prod" href="mlx.core.prod.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.quantize.html b/docs/build/html/python/_autosummary/mlx.core.quantize.html
index 355df6315..062ad8279 100644
--- a/docs/build/html/python/_autosummary/mlx.core.quantize.html
+++ b/docs/build/html/python/_autosummary/mlx.core.quantize.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.quantize &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.quantize &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.put_along_axis" href="mlx.core.put_along_axis.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.quantized_matmul.html b/docs/build/html/python/_autosummary/mlx.core.quantized_matmul.html
index 084a47864..199dab1b6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.quantized_matmul.html
+++ b/docs/build/html/python/_autosummary/mlx.core.quantized_matmul.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.quantized_matmul &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.quantized_matmul &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.quantize" href="mlx.core.quantize.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.radians.html b/docs/build/html/python/_autosummary/mlx.core.radians.html
index 66f81ca88..e40c69c83 100644
--- a/docs/build/html/python/_autosummary/mlx.core.radians.html
+++ b/docs/build/html/python/_autosummary/mlx.core.radians.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.radians &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.radians &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.quantized_matmul" href="mlx.core.quantized_matmul.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.bernoulli.html b/docs/build/html/python/_autosummary/mlx.core.random.bernoulli.html
index bae4cd133..a778d2b41 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.bernoulli.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.bernoulli.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.bernoulli &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.bernoulli &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Random" href="../random.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.categorical.html b/docs/build/html/python/_autosummary/mlx.core.random.categorical.html
index 270b1d5a3..090a397ba 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.categorical.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.categorical.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.categorical &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.categorical &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.bernoulli" href="mlx.core.random.bernoulli.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.gumbel.html b/docs/build/html/python/_autosummary/mlx.core.random.gumbel.html
index 780f5fc6d..3b5641897 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.gumbel.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.gumbel.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.gumbel &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.gumbel &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.categorical" href="mlx.core.random.categorical.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.key.html b/docs/build/html/python/_autosummary/mlx.core.random.key.html
index c7fe74d47..d8ed1563e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.key.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.key.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.key &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.key &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.gumbel" href="mlx.core.random.gumbel.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.laplace.html b/docs/build/html/python/_autosummary/mlx.core.random.laplace.html
index 49c74b436..debc0b904 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.laplace.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.laplace.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.laplace &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.laplace &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.uniform" href="mlx.core.random.uniform.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.multivariate_normal.html b/docs/build/html/python/_autosummary/mlx.core.random.multivariate_normal.html
index 392d7ef26..aa1e1130d 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.multivariate_normal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.multivariate_normal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.multivariate_normal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.multivariate_normal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.normal" href="mlx.core.random.normal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.normal.html b/docs/build/html/python/_autosummary/mlx.core.random.normal.html
index 8b53a37f9..18f71685b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.normal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.normal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.normal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.normal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.key" href="mlx.core.random.key.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.permutation.html b/docs/build/html/python/_autosummary/mlx.core.random.permutation.html
index ed8fe1631..06e5a06e6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.permutation.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.permutation.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.permutation &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.permutation &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.laplace" href="mlx.core.random.laplace.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.randint.html b/docs/build/html/python/_autosummary/mlx.core.random.randint.html
index 585610331..6cf230065 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.randint.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.randint.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.randint &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.randint &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.multivariate_normal" href="mlx.core.random.multivariate_normal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.seed.html b/docs/build/html/python/_autosummary/mlx.core.random.seed.html
index 1685e59e9..3439b8a01 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.seed.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.seed.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.seed &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.seed &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.randint" href="mlx.core.random.randint.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.split.html b/docs/build/html/python/_autosummary/mlx.core.random.split.html
index 12432f389..98639b5d0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.split.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.split.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.split &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.split &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.seed" href="mlx.core.random.seed.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.truncated_normal.html b/docs/build/html/python/_autosummary/mlx.core.random.truncated_normal.html
index 6a3e43bee..71ddd9e17 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.truncated_normal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.truncated_normal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.truncated_normal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.truncated_normal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.split" href="mlx.core.random.split.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.random.uniform.html b/docs/build/html/python/_autosummary/mlx.core.random.uniform.html
index 2a28834ed..f24e47032 100644
--- a/docs/build/html/python/_autosummary/mlx.core.random.uniform.html
+++ b/docs/build/html/python/_autosummary/mlx.core.random.uniform.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.random.uniform &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.random.uniform &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.truncated_normal" href="mlx.core.random.truncated_normal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.real.html b/docs/build/html/python/_autosummary/mlx.core.real.html
index 35c90fd0f..97f14bbe1 100644
--- a/docs/build/html/python/_autosummary/mlx.core.real.html
+++ b/docs/build/html/python/_autosummary/mlx.core.real.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.real &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.real &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.radians" href="mlx.core.radians.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.reciprocal.html b/docs/build/html/python/_autosummary/mlx.core.reciprocal.html
index 98684163f..916942806 100644
--- a/docs/build/html/python/_autosummary/mlx.core.reciprocal.html
+++ b/docs/build/html/python/_autosummary/mlx.core.reciprocal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.reciprocal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.reciprocal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.real" href="mlx.core.real.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.remainder.html b/docs/build/html/python/_autosummary/mlx.core.remainder.html
index d808f2b74..623e2c84a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.remainder.html
+++ b/docs/build/html/python/_autosummary/mlx.core.remainder.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.remainder &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.remainder &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.reciprocal" href="mlx.core.reciprocal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.repeat.html b/docs/build/html/python/_autosummary/mlx.core.repeat.html
index fa40bdc1e..2e35cc60a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.repeat.html
+++ b/docs/build/html/python/_autosummary/mlx.core.repeat.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.repeat &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.repeat &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.remainder" href="mlx.core.remainder.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.reshape.html b/docs/build/html/python/_autosummary/mlx.core.reshape.html
index b0805d3c5..1529bd888 100644
--- a/docs/build/html/python/_autosummary/mlx.core.reshape.html
+++ b/docs/build/html/python/_autosummary/mlx.core.reshape.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.reshape &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.reshape &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.repeat" href="mlx.core.repeat.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.right_shift.html b/docs/build/html/python/_autosummary/mlx.core.right_shift.html
index f1a4da0f3..d4602616a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.right_shift.html
+++ b/docs/build/html/python/_autosummary/mlx.core.right_shift.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.right_shift &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.right_shift &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.reshape" href="mlx.core.reshape.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.roll.html b/docs/build/html/python/_autosummary/mlx.core.roll.html
index d7fc4582c..0f022bcb5 100644
--- a/docs/build/html/python/_autosummary/mlx.core.roll.html
+++ b/docs/build/html/python/_autosummary/mlx.core.roll.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.roll &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.roll &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.right_shift" href="mlx.core.right_shift.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.round.html b/docs/build/html/python/_autosummary/mlx.core.round.html
index 7b2238dee..4eddc4a49 100644
--- a/docs/build/html/python/_autosummary/mlx.core.round.html
+++ b/docs/build/html/python/_autosummary/mlx.core.round.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.round &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.round &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.roll" href="mlx.core.roll.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.rsqrt.html b/docs/build/html/python/_autosummary/mlx.core.rsqrt.html
index f7bbc9fae..faf229dbc 100644
--- a/docs/build/html/python/_autosummary/mlx.core.rsqrt.html
+++ b/docs/build/html/python/_autosummary/mlx.core.rsqrt.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.rsqrt &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.rsqrt &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.round" href="mlx.core.round.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.save.html b/docs/build/html/python/_autosummary/mlx.core.save.html
index 9aa30abb2..a64cbd5d0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.save.html
+++ b/docs/build/html/python/_autosummary/mlx.core.save.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.save &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.save &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.rsqrt" href="mlx.core.rsqrt.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.save_gguf.html b/docs/build/html/python/_autosummary/mlx.core.save_gguf.html
index 563d0f6bc..eb47b6049 100644
--- a/docs/build/html/python/_autosummary/mlx.core.save_gguf.html
+++ b/docs/build/html/python/_autosummary/mlx.core.save_gguf.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.save_gguf &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.save_gguf &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.savez_compressed" href="mlx.core.savez_compressed.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.save_safetensors.html b/docs/build/html/python/_autosummary/mlx.core.save_safetensors.html
index ee029ec24..1c1abf5bc 100644
--- a/docs/build/html/python/_autosummary/mlx.core.save_safetensors.html
+++ b/docs/build/html/python/_autosummary/mlx.core.save_safetensors.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.save_safetensors &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.save_safetensors &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.save_gguf" href="mlx.core.save_gguf.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.savez.html b/docs/build/html/python/_autosummary/mlx.core.savez.html
index 7640aabcf..9180168bf 100644
--- a/docs/build/html/python/_autosummary/mlx.core.savez.html
+++ b/docs/build/html/python/_autosummary/mlx.core.savez.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.savez &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.savez &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.save" href="mlx.core.save.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.savez_compressed.html b/docs/build/html/python/_autosummary/mlx.core.savez_compressed.html
index bd8493592..5d3b133cc 100644
--- a/docs/build/html/python/_autosummary/mlx.core.savez_compressed.html
+++ b/docs/build/html/python/_autosummary/mlx.core.savez_compressed.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.savez_compressed &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.savez_compressed &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.savez" href="mlx.core.savez.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.set_default_device.html b/docs/build/html/python/_autosummary/mlx.core.set_default_device.html
index c97ec5a0d..3d9e5dda3 100644
--- a/docs/build/html/python/_autosummary/mlx.core.set_default_device.html
+++ b/docs/build/html/python/_autosummary/mlx.core.set_default_device.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.set_default_device &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.set_default_device &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.default_device" href="mlx.core.default_device.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.set_default_stream.html b/docs/build/html/python/_autosummary/mlx.core.set_default_stream.html
index dfedfe67a..3837bce68 100644
--- a/docs/build/html/python/_autosummary/mlx.core.set_default_stream.html
+++ b/docs/build/html/python/_autosummary/mlx.core.set_default_stream.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.set_default_stream &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.set_default_stream &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.new_stream" href="mlx.core.new_stream.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.sigmoid.html b/docs/build/html/python/_autosummary/mlx.core.sigmoid.html
index 33ef71904..e440fb688 100644
--- a/docs/build/html/python/_autosummary/mlx.core.sigmoid.html
+++ b/docs/build/html/python/_autosummary/mlx.core.sigmoid.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.sigmoid &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.sigmoid &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.save_safetensors" href="mlx.core.save_safetensors.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.sign.html b/docs/build/html/python/_autosummary/mlx.core.sign.html
index c7c7fceea..326d2b7aa 100644
--- a/docs/build/html/python/_autosummary/mlx.core.sign.html
+++ b/docs/build/html/python/_autosummary/mlx.core.sign.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.sign &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.sign &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.sigmoid" href="mlx.core.sigmoid.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.sin.html b/docs/build/html/python/_autosummary/mlx.core.sin.html
index 04802634a..ea1bf29b2 100644
--- a/docs/build/html/python/_autosummary/mlx.core.sin.html
+++ b/docs/build/html/python/_autosummary/mlx.core.sin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.sin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.sin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.sign" href="mlx.core.sign.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.sinh.html b/docs/build/html/python/_autosummary/mlx.core.sinh.html
index ba2882361..72ea18f3c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.sinh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.sinh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.sinh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.sinh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.sin" href="mlx.core.sin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.softmax.html b/docs/build/html/python/_autosummary/mlx.core.softmax.html
index ecddacb30..8143ce0c4 100644
--- a/docs/build/html/python/_autosummary/mlx.core.softmax.html
+++ b/docs/build/html/python/_autosummary/mlx.core.softmax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.softmax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.softmax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.sinh" href="mlx.core.sinh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.sort.html b/docs/build/html/python/_autosummary/mlx.core.sort.html
index 2fee164eb..2178bc208 100644
--- a/docs/build/html/python/_autosummary/mlx.core.sort.html
+++ b/docs/build/html/python/_autosummary/mlx.core.sort.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.sort &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.sort &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.softmax" href="mlx.core.softmax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.split.html b/docs/build/html/python/_autosummary/mlx.core.split.html
index 089ca9ab2..d4d5a2ad9 100644
--- a/docs/build/html/python/_autosummary/mlx.core.split.html
+++ b/docs/build/html/python/_autosummary/mlx.core.split.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.split &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.split &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.sort" href="mlx.core.sort.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.sqrt.html b/docs/build/html/python/_autosummary/mlx.core.sqrt.html
index 17b71396c..049f15b9c 100644
--- a/docs/build/html/python/_autosummary/mlx.core.sqrt.html
+++ b/docs/build/html/python/_autosummary/mlx.core.sqrt.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.sqrt &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.sqrt &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.split" href="mlx.core.split.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.square.html b/docs/build/html/python/_autosummary/mlx.core.square.html
index 2c1744cc8..afd87c2b6 100644
--- a/docs/build/html/python/_autosummary/mlx.core.square.html
+++ b/docs/build/html/python/_autosummary/mlx.core.square.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.square &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.square &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.sqrt" href="mlx.core.sqrt.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.squeeze.html b/docs/build/html/python/_autosummary/mlx.core.squeeze.html
index bc7872dda..b931d91c0 100644
--- a/docs/build/html/python/_autosummary/mlx.core.squeeze.html
+++ b/docs/build/html/python/_autosummary/mlx.core.squeeze.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.squeeze &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.squeeze &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.square" href="mlx.core.square.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.stack.html b/docs/build/html/python/_autosummary/mlx.core.stack.html
index 5c0abc8dd..de1b8e82f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.stack.html
+++ b/docs/build/html/python/_autosummary/mlx.core.stack.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.stack &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.stack &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.squeeze" href="mlx.core.squeeze.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.std.html b/docs/build/html/python/_autosummary/mlx.core.std.html
index de426f720..e8c2cdd72 100644
--- a/docs/build/html/python/_autosummary/mlx.core.std.html
+++ b/docs/build/html/python/_autosummary/mlx.core.std.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.std &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.std &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.stack" href="mlx.core.stack.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.stop_gradient.html b/docs/build/html/python/_autosummary/mlx.core.stop_gradient.html
index 179024325..aed67e20f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.stop_gradient.html
+++ b/docs/build/html/python/_autosummary/mlx.core.stop_gradient.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.stop_gradient &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.stop_gradient &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.std" href="mlx.core.std.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.stream.html b/docs/build/html/python/_autosummary/mlx.core.stream.html
index 4873de9ff..5240a01fb 100644
--- a/docs/build/html/python/_autosummary/mlx.core.stream.html
+++ b/docs/build/html/python/_autosummary/mlx.core.stream.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.stream &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.stream &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.set_default_stream" href="mlx.core.set_default_stream.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.subtract.html b/docs/build/html/python/_autosummary/mlx.core.subtract.html
index 045ec832d..e61c94a51 100644
--- a/docs/build/html/python/_autosummary/mlx.core.subtract.html
+++ b/docs/build/html/python/_autosummary/mlx.core.subtract.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.subtract &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.subtract &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.stop_gradient" href="mlx.core.stop_gradient.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.sum.html b/docs/build/html/python/_autosummary/mlx.core.sum.html
index 634b87a0d..c4ad7a65b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.sum.html
+++ b/docs/build/html/python/_autosummary/mlx.core.sum.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.sum &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.sum &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.subtract" href="mlx.core.subtract.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.swapaxes.html b/docs/build/html/python/_autosummary/mlx.core.swapaxes.html
index c8d04d959..2e0de6a74 100644
--- a/docs/build/html/python/_autosummary/mlx.core.swapaxes.html
+++ b/docs/build/html/python/_autosummary/mlx.core.swapaxes.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.swapaxes &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.swapaxes &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.sum" href="mlx.core.sum.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.synchronize.html b/docs/build/html/python/_autosummary/mlx.core.synchronize.html
index d9fadf6f3..cfa0d2b86 100644
--- a/docs/build/html/python/_autosummary/mlx.core.synchronize.html
+++ b/docs/build/html/python/_autosummary/mlx.core.synchronize.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.synchronize &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.synchronize &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.stream" href="mlx.core.stream.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.take.html b/docs/build/html/python/_autosummary/mlx.core.take.html
index f1ca89535..89a73d77a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.take.html
+++ b/docs/build/html/python/_autosummary/mlx.core.take.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.take &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.take &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.swapaxes" href="mlx.core.swapaxes.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.take_along_axis.html b/docs/build/html/python/_autosummary/mlx.core.take_along_axis.html
index d2de8adad..3eeea1ceb 100644
--- a/docs/build/html/python/_autosummary/mlx.core.take_along_axis.html
+++ b/docs/build/html/python/_autosummary/mlx.core.take_along_axis.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.take_along_axis &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.take_along_axis &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.take" href="mlx.core.take.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.tan.html b/docs/build/html/python/_autosummary/mlx.core.tan.html
index a076304c6..4ca135839 100644
--- a/docs/build/html/python/_autosummary/mlx.core.tan.html
+++ b/docs/build/html/python/_autosummary/mlx.core.tan.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.tan &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.tan &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.take_along_axis" href="mlx.core.take_along_axis.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.tanh.html b/docs/build/html/python/_autosummary/mlx.core.tanh.html
index 2af7b83a0..8dc142955 100644
--- a/docs/build/html/python/_autosummary/mlx.core.tanh.html
+++ b/docs/build/html/python/_autosummary/mlx.core.tanh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.tanh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.tanh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.tan" href="mlx.core.tan.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.tensordot.html b/docs/build/html/python/_autosummary/mlx.core.tensordot.html
index 3b3d25e5c..b64ef0565 100644
--- a/docs/build/html/python/_autosummary/mlx.core.tensordot.html
+++ b/docs/build/html/python/_autosummary/mlx.core.tensordot.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.tensordot &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.tensordot &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.tanh" href="mlx.core.tanh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.tile.html b/docs/build/html/python/_autosummary/mlx.core.tile.html
index fad037fef..87ac74c16 100644
--- a/docs/build/html/python/_autosummary/mlx.core.tile.html
+++ b/docs/build/html/python/_autosummary/mlx.core.tile.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.tile &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.tile &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.tensordot" href="mlx.core.tensordot.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.topk.html b/docs/build/html/python/_autosummary/mlx.core.topk.html
index e73e7a924..b2410d020 100644
--- a/docs/build/html/python/_autosummary/mlx.core.topk.html
+++ b/docs/build/html/python/_autosummary/mlx.core.topk.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.topk &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.topk &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.tile" href="mlx.core.tile.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.trace.html b/docs/build/html/python/_autosummary/mlx.core.trace.html
index 5a74d9859..ff19ad578 100644
--- a/docs/build/html/python/_autosummary/mlx.core.trace.html
+++ b/docs/build/html/python/_autosummary/mlx.core.trace.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.trace &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.trace &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.topk" href="mlx.core.topk.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.transpose.html b/docs/build/html/python/_autosummary/mlx.core.transpose.html
index 8d8b566e9..6b6f8f760 100644
--- a/docs/build/html/python/_autosummary/mlx.core.transpose.html
+++ b/docs/build/html/python/_autosummary/mlx.core.transpose.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.transpose &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.transpose &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.trace" href="mlx.core.trace.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.tri.html b/docs/build/html/python/_autosummary/mlx.core.tri.html
index 2478717ce..f795c230f 100644
--- a/docs/build/html/python/_autosummary/mlx.core.tri.html
+++ b/docs/build/html/python/_autosummary/mlx.core.tri.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.tri &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.tri &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.transpose" href="mlx.core.transpose.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.tril.html b/docs/build/html/python/_autosummary/mlx.core.tril.html
index 0edbb8b44..4de21ff78 100644
--- a/docs/build/html/python/_autosummary/mlx.core.tril.html
+++ b/docs/build/html/python/_autosummary/mlx.core.tril.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.tril &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.tril &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.tri" href="mlx.core.tri.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.triu.html b/docs/build/html/python/_autosummary/mlx.core.triu.html
index 68b97a59e..ea3797fe9 100644
--- a/docs/build/html/python/_autosummary/mlx.core.triu.html
+++ b/docs/build/html/python/_autosummary/mlx.core.triu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.triu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.triu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.tril" href="mlx.core.tril.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.value_and_grad.html b/docs/build/html/python/_autosummary/mlx.core.value_and_grad.html
index 0fe2616a9..97a5c09f7 100644
--- a/docs/build/html/python/_autosummary/mlx.core.value_and_grad.html
+++ b/docs/build/html/python/_autosummary/mlx.core.value_and_grad.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.value_and_grad &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.value_and_grad &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.grad" href="mlx.core.grad.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.var.html b/docs/build/html/python/_autosummary/mlx.core.var.html
index ad97c98f3..1032ee0bd 100644
--- a/docs/build/html/python/_autosummary/mlx.core.var.html
+++ b/docs/build/html/python/_autosummary/mlx.core.var.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.var &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.var &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.triu" href="mlx.core.triu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.view.html b/docs/build/html/python/_autosummary/mlx.core.view.html
index b89041a29..1689f53a8 100644
--- a/docs/build/html/python/_autosummary/mlx.core.view.html
+++ b/docs/build/html/python/_autosummary/mlx.core.view.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.view &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.view &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.var" href="mlx.core.var.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.vjp.html b/docs/build/html/python/_autosummary/mlx.core.vjp.html
index b13115fa3..11b92db8e 100644
--- a/docs/build/html/python/_autosummary/mlx.core.vjp.html
+++ b/docs/build/html/python/_autosummary/mlx.core.vjp.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.vjp &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.vjp &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.jvp" href="mlx.core.jvp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.vmap.html b/docs/build/html/python/_autosummary/mlx.core.vmap.html
index 8052b6cb6..93c10e501 100644
--- a/docs/build/html/python/_autosummary/mlx.core.vmap.html
+++ b/docs/build/html/python/_autosummary/mlx.core.vmap.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.vmap &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.vmap &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.vjp" href="mlx.core.vjp.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.where.html b/docs/build/html/python/_autosummary/mlx.core.where.html
index 1813d2c26..a64a3b50a 100644
--- a/docs/build/html/python/_autosummary/mlx.core.where.html
+++ b/docs/build/html/python/_autosummary/mlx.core.where.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.where &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.where &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.view" href="mlx.core.view.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.zeros.html b/docs/build/html/python/_autosummary/mlx.core.zeros.html
index c1520db51..de6306b3b 100644
--- a/docs/build/html/python/_autosummary/mlx.core.zeros.html
+++ b/docs/build/html/python/_autosummary/mlx.core.zeros.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.zeros &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.zeros &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.where" href="mlx.core.where.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.core.zeros_like.html b/docs/build/html/python/_autosummary/mlx.core.zeros_like.html
index e054c8a45..00b237fcf 100644
--- a/docs/build/html/python/_autosummary/mlx.core.zeros_like.html
+++ b/docs/build/html/python/_autosummary/mlx.core.zeros_like.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.zeros_like &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.zeros_like &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.zeros" href="mlx.core.zeros.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.nn.quantize.html b/docs/build/html/python/_autosummary/mlx.nn.quantize.html
index 9a10f872d..1590da631 100644
--- a/docs/build/html/python/_autosummary/mlx.nn.quantize.html
+++ b/docs/build/html/python/_autosummary/mlx.nn.quantize.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.quantize &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.quantize &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.value_and_grad" href="mlx.nn.value_and_grad.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.nn.value_and_grad.html b/docs/build/html/python/_autosummary/mlx.nn.value_and_grad.html
index 028185ba1..1db6357bc 100644
--- a/docs/build/html/python/_autosummary/mlx.nn.value_and_grad.html
+++ b/docs/build/html/python/_autosummary/mlx.nn.value_and_grad.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.value_and_grad &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.value_and_grad &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Neural Networks" href="../nn.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.optimizers.clip_grad_norm.html b/docs/build/html/python/_autosummary/mlx.optimizers.clip_grad_norm.html
index 349275974..beb942b1e 100644
--- a/docs/build/html/python/_autosummary/mlx.optimizers.clip_grad_norm.html
+++ b/docs/build/html/python/_autosummary/mlx.optimizers.clip_grad_norm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.clip_grad_norm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.clip_grad_norm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.step_decay" href="../optimizers/_autosummary/mlx.optimizers.step_decay.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.utils.tree_flatten.html b/docs/build/html/python/_autosummary/mlx.utils.tree_flatten.html
index 92a1e43d3..bce5080db 100644
--- a/docs/build/html/python/_autosummary/mlx.utils.tree_flatten.html
+++ b/docs/build/html/python/_autosummary/mlx.utils.tree_flatten.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.utils.tree_flatten &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.utils.tree_flatten &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Tree Utils" href="../tree_utils.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.utils.tree_map.html b/docs/build/html/python/_autosummary/mlx.utils.tree_map.html
index caa881e54..78daab29d 100644
--- a/docs/build/html/python/_autosummary/mlx.utils.tree_map.html
+++ b/docs/build/html/python/_autosummary/mlx.utils.tree_map.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.utils.tree_map &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.utils.tree_map &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.utils.tree_unflatten" href="mlx.utils.tree_unflatten.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.utils.tree_map_with_path.html b/docs/build/html/python/_autosummary/mlx.utils.tree_map_with_path.html
index 8a2ca794b..6a9a3c210 100644
--- a/docs/build/html/python/_autosummary/mlx.utils.tree_map_with_path.html
+++ b/docs/build/html/python/_autosummary/mlx.utils.tree_map_with_path.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.utils.tree_map_with_path &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.utils.tree_map_with_path &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.utils.tree_map" href="mlx.utils.tree_map.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.utils.tree_reduce.html b/docs/build/html/python/_autosummary/mlx.utils.tree_reduce.html
index a0ba468d8..3570b4266 100644
--- a/docs/build/html/python/_autosummary/mlx.utils.tree_reduce.html
+++ b/docs/build/html/python/_autosummary/mlx.utils.tree_reduce.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.utils.tree_reduce &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.utils.tree_reduce &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.utils.tree_map_with_path" href="mlx.utils.tree_map_with_path.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/mlx.utils.tree_unflatten.html b/docs/build/html/python/_autosummary/mlx.utils.tree_unflatten.html
index b9d1b59c3..61309f4fd 100644
--- a/docs/build/html/python/_autosummary/mlx.utils.tree_unflatten.html
+++ b/docs/build/html/python/_autosummary/mlx.utils.tree_unflatten.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.utils.tree_unflatten &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.utils.tree_unflatten &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.utils.tree_flatten" href="mlx.utils.tree_flatten.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/_autosummary/stream_class.html b/docs/build/html/python/_autosummary/stream_class.html
index c277ce614..6600af163 100644
--- a/docs/build/html/python/_autosummary/stream_class.html
+++ b/docs/build/html/python/_autosummary/stream_class.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.core.Stream &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.core.Stream &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.Device" href="mlx.core.Device.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/array.html b/docs/build/html/python/array.html
index 0609bafdf..64bd5bb4c 100644
--- a/docs/build/html/python/array.html
+++ b/docs/build/html/python/array.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Array &#8212; MLX 0.20.0 documentation</title>
+    <title>Array &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="LLM inference" href="../examples/llama-inference.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/data_types.html b/docs/build/html/python/data_types.html
index 11d36e95a..f3d768073 100644
--- a/docs/build/html/python/data_types.html
+++ b/docs/build/html/python/data_types.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Data Types &#8212; MLX 0.20.0 documentation</title>
+    <title>Data Types &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.array.view" href="_autosummary/mlx.core.array.view.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/devices_and_streams.html b/docs/build/html/python/devices_and_streams.html
index 2c708cfcb..e7a828196 100644
--- a/docs/build/html/python/devices_and_streams.html
+++ b/docs/build/html/python/devices_and_streams.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Devices and Streams &#8212; MLX 0.20.0 documentation</title>
+    <title>Devices and Streams &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.issubdtype" href="_autosummary/mlx.core.issubdtype.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/distributed.html b/docs/build/html/python/distributed.html
index e7990efdf..55787a80e 100644
--- a/docs/build/html/python/distributed.html
+++ b/docs/build/html/python/distributed.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Distributed Communication &#8212; MLX 0.20.0 documentation</title>
+    <title>Distributed Communication &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.clip_grad_norm" href="_autosummary/mlx.optimizers.clip_grad_norm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/fast.html b/docs/build/html/python/fast.html
index f296cc90e..c27b9023e 100644
--- a/docs/build/html/python/fast.html
+++ b/docs/build/html/python/fast.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Fast &#8212; MLX 0.20.0 documentation</title>
+    <title>Fast &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,11 +39,10 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
-    <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
     <script>DOCUMENTATION_OPTIONS.pagename = 'python/fast';</script>
     <link rel="icon" href="../_static/mlx_logo.png"/>
     <link rel="index" title="Index" href="../genindex.html" />
@@ -52,7 +51,7 @@
     <link rel="prev" title="mlx.core.vmap" href="_autosummary/mlx.core.vmap.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -868,10 +868,7 @@
 <tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html#mlx.core.fast.scaled_dot_product_attention" title="mlx.core.fast.scaled_dot_product_attention"><code class="xref py py-obj docutils literal notranslate"><span class="pre">scaled_dot_product_attention</span></code></a>(q, k, v, *, scale)</p></td>
 <td><p>A fast implementation of multi-head attention: <code class="docutils literal notranslate"><span class="pre">O</span> <span class="pre">=</span> <span class="pre">softmax(Q</span> <span class="pre">&#64;</span> <span class="pre">K.T,</span> <span class="pre">dim=-1)</span> <span class="pre">&#64;</span> <span class="pre">V</span></code>.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html#mlx.core.fast.affine_quantize" title="mlx.core.fast.affine_quantize"><code class="xref py py-obj docutils literal notranslate"><span class="pre">affine_quantize</span></code></a>(w, /, scales, biases[, ...])</p></td>
-<td><p>Quantize the matrix <code class="docutils literal notranslate"><span class="pre">w</span></code> using the provided <code class="docutils literal notranslate"><span class="pre">scales</span></code> and <code class="docutils literal notranslate"><span class="pre">biases</span></code> and the <code class="docutils literal notranslate"><span class="pre">group_size</span></code> and <code class="docutils literal notranslate"><span class="pre">bits</span></code> configuration.</p></td>
-</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html#mlx.core.fast.metal_kernel" title="mlx.core.fast.metal_kernel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">metal_kernel</span></code></a>(name, input_names, ...[, ...])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html#mlx.core.fast.metal_kernel" title="mlx.core.fast.metal_kernel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">metal_kernel</span></code></a>(name, input_names, ...[, ...])</p></td>
 <td><p>A jit-compiled custom Metal kernel defined from a source string.</p></td>
 </tr>
 </tbody>
diff --git a/docs/build/html/python/fft.html b/docs/build/html/python/fft.html
index 3ada9a8e1..256538c7d 100644
--- a/docs/build/html/python/fft.html
+++ b/docs/build/html/python/fft.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>FFT &#8212; MLX 0.20.0 documentation</title>
+    <title>FFT &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.fast.metal_kernel" href="_autosummary/mlx.core.fast.metal_kernel.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/linalg.html b/docs/build/html/python/linalg.html
index ba425dbf0..8c27548b7 100644
--- a/docs/build/html/python/linalg.html
+++ b/docs/build/html/python/linalg.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Linear Algebra &#8212; MLX 0.20.0 documentation</title>
+    <title>Linear Algebra &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.fft.irfftn" href="_autosummary/mlx.core.fft.irfftn.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/metal.html b/docs/build/html/python/metal.html
index a9ef81111..eae7ebda6 100644
--- a/docs/build/html/python/metal.html
+++ b/docs/build/html/python/metal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Metal &#8212; MLX 0.20.0 documentation</title>
+    <title>Metal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.linalg.eigh" href="_autosummary/mlx.core.linalg.eigh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn.html b/docs/build/html/python/nn.html
index 3c1b19ff6..afa53bd05 100644
--- a/docs/build/html/python/nn.html
+++ b/docs/build/html/python/nn.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Neural Networks &#8212; MLX 0.20.0 documentation</title>
+    <title>Neural Networks &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.metal.stop_capture" href="_autosummary/mlx.core.metal.stop_capture.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -1121,6 +1122,10 @@ parameters as the first argument to the function returned by
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html#mlx.nn.AvgPool2d"><code class="docutils literal notranslate"><span class="pre">AvgPool2d</span></code></a></li>
 </ul>
 </li>
+<li class="toctree-l2"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html#mlx.nn.AvgPool3d"><code class="docutils literal notranslate"><span class="pre">AvgPool3d</span></code></a></li>
+</ul>
+</li>
 <li class="toctree-l2"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a><ul>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html#mlx.nn.BatchNorm"><code class="docutils literal notranslate"><span class="pre">BatchNorm</span></code></a></li>
 </ul>
@@ -1237,6 +1242,10 @@ parameters as the first argument to the function returned by
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html#mlx.nn.MaxPool2d"><code class="docutils literal notranslate"><span class="pre">MaxPool2d</span></code></a></li>
 </ul>
 </li>
+<li class="toctree-l2"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a><ul>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html#mlx.nn.MaxPool3d"><code class="docutils literal notranslate"><span class="pre">MaxPool3d</span></code></a></li>
+</ul>
+</li>
 <li class="toctree-l2"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a><ul>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html#mlx.nn.Mish"><code class="docutils literal notranslate"><span class="pre">Mish</span></code></a></li>
 </ul>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.ALiBi.html b/docs/build/html/python/nn/_autosummary/mlx.nn.ALiBi.html
index aaf20607c..48e8650d9 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.ALiBi.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.ALiBi.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.ALiBi &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.ALiBi &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Layers" href="../layers.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool1d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool1d.html
index 1a1be97bd..762afeb6f 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool1d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool1d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.AvgPool1d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.AvgPool1d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,11 +39,10 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
-    <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
     <script>DOCUMENTATION_OPTIONS.pagename = 'python/nn/_autosummary/mlx.nn.AvgPool1d';</script>
     <link rel="icon" href="../../../_static/mlx_logo.png"/>
     <link rel="index" title="Index" href="../../../genindex.html" />
@@ -52,7 +51,7 @@
     <link rel="prev" title="mlx.nn.ALiBi" href="mlx.nn.ALiBi.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -868,16 +868,8 @@
 <dt class="sig sig-object py" id="mlx.nn.AvgPool1d">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">AvgPool1d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" 
href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.AvgPool1d" title="Link to this definition">#</a></dt>
 <dd><p>Applies 1-dimensional average pooling.</p>
-<p>Assuming an input of shape <span class="math notranslate nohighlight">\((N, L, C)\)</span> and <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> is
-<span class="math notranslate nohighlight">\(k\)</span>, the output is a tensor of shape <span class="math notranslate nohighlight">\((N, L_{out}, C)\)</span>, given
-by:</p>
-<blockquote>
-<div><div class="math notranslate nohighlight">
-\[\text{out}(N_i, t, C_j) = \frac{1}{k} \sum_{m=0, \ldots, k - 1}
-        \text{input}(N_i, \text{stride} \times t + m, C_j),\]</div>
-</div></blockquote>
-<p>where <span class="math notranslate nohighlight">\(L_{out} = \left\lfloor \frac{L + 2 \times \text{padding} -
-\text{kernel\_size}}{\text{stride}}\right\rfloor + 1\)</span>.</p>
+<p>Spatially downsamples the input by taking the average of a sliding window
+of size <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> and sliding stride <code class="docutils literal notranslate"><span class="pre">stride</span></code>.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool2d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool2d.html
index 0f3ac611c..4ecff76d7 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool2d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.AvgPool2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.AvgPool2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,20 +39,19 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
-    <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
     <script>DOCUMENTATION_OPTIONS.pagename = 'python/nn/_autosummary/mlx.nn.AvgPool2d';</script>
     <link rel="icon" href="../../../_static/mlx_logo.png"/>
     <link rel="index" title="Index" href="../../../genindex.html" />
     <link rel="search" title="Search" href="../../../search.html" />
-    <link rel="next" title="mlx.nn.BatchNorm" href="mlx.nn.BatchNorm.html" />
+    <link rel="next" title="mlx.nn.AvgPool3d" href="mlx.nn.AvgPool3d.html" />
     <link rel="prev" title="mlx.nn.AvgPool1d" href="mlx.nn.AvgPool1d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -868,26 +868,15 @@
 <dt class="sig sig-object py" id="mlx.nn.AvgPool2d">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">AvgPool2d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span 
class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span 
class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.AvgPool2d" title="Link to this definition">#</a></dt>
 <dd><p>Applies 2-dimensional average pooling.</p>
-<p>Assuming an input of shape <span class="math notranslate nohighlight">\((N, H, W, C)\)</span> and <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> is
-<span class="math notranslate nohighlight">\((k_H, k_W)\)</span>, the output is a tensor of shape <span class="math notranslate nohighlight">\((N, H_{out},
-W_{out}, C)\)</span>, given by:</p>
-<div class="math notranslate nohighlight">
-\[\begin{split}\begin{aligned}
-    \text{out}(N_i, h, w, C_j) = &amp; \frac{1}{k_H k_W} \sum_{m=0, \ldots, k_H-1} \sum_{n=0, \ldots, k_W-1} \\
-                            &amp; \text{input}(N_i, \text{stride[0]} \times h + m,
-                                        \text{stride[1]} \times w + n, C_j),
-\end{aligned}\end{split}\]</div>
-<p>where <span class="math notranslate nohighlight">\(H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel\_size[0]}}{\text{stride[0]}}\right\rfloor + 1\)</span>,
-<span class="math notranslate nohighlight">\(W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel\_size[1]}}{\text{stride[1]}}\right\rfloor + 1\)</span>.</p>
-<p>The parameters <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>, <code class="docutils literal notranslate"><span class="pre">stride</span></code>, <code class="docutils literal notranslate"><span class="pre">padding</span></code>, can either be:</p>
-<blockquote>
-<div><ul class="simple">
+<p>Spatially downsamples the input by taking the average of a sliding window
+of size <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> and sliding stride <code class="docutils literal notranslate"><span class="pre">stride</span></code>.</p>
+<p>The parameters <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>, <code class="docutils literal notranslate"><span class="pre">stride</span></code>, and <code class="docutils literal notranslate"><span class="pre">padding</span></code> can either be:</p>
+<ul class="simple">
 <li><p>a single <code class="docutils literal notranslate"><span class="pre">int</span></code> – in which case the same value is used for both the
-height and width axis;</p></li>
+height and width axis.</p></li>
 <li><p>a <code class="docutils literal notranslate"><span class="pre">tuple</span></code> of two <code class="docutils literal notranslate"><span class="pre">int</span></code> s – in which case, the first <code class="docutils literal notranslate"><span class="pre">int</span></code> is
 used for the height axis, the second <code class="docutils literal notranslate"><span class="pre">int</span></code> for the width axis.</p></li>
 </ul>
-</div></blockquote>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
@@ -939,11 +928,11 @@ of the height and width axis. Default: <code class="docutils literal notranslate
       </div>
     </a>
     <a class="right-next"
-       href="mlx.nn.BatchNorm.html"
+       href="mlx.nn.AvgPool3d.html"
        title="next page">
       <div class="prev-next-info">
         <p class="prev-next-subtitle">next</p>
-        <p class="prev-next-title">mlx.nn.BatchNorm</p>
+        <p class="prev-next-title">mlx.nn.AvgPool3d</p>
       </div>
       <i class="fa-solid fa-angle-right"></i>
     </a>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool3d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool3d.html
new file mode 100644
index 000000000..0a744a047
--- /dev/null
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.AvgPool3d.html
@@ -0,0 +1,1012 @@
+
+<!DOCTYPE html>
+
+
+<html lang="en" data-content_root="../../../" >
+
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+
+    <title>mlx.nn.AvgPool3d &#8212; MLX 0.21.0 documentation</title>
+  
+  
+  
+  <script data-cfasync="false">
+    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
+    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
+  </script>
+  <!-- 
+    this give us a css class that will be invisible only if js is disabled 
+  -->
+  <noscript>
+    <style>
+      .pst-js-only { display: none !important; }
+
+    </style>
+  </noscript>
+  
+  <!-- Loaded before other Sphinx assets -->
+  <link href="../../../_static/styles/theme.css?digest=26a4bc78f4c0ddb94549" rel="stylesheet" />
+<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=26a4bc78f4c0ddb94549" rel="stylesheet" />
+
+    <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=fa44fd50" />
+    <link rel="stylesheet" type="text/css" href="../../../_static/styles/sphinx-book-theme.css?v=a3416100" />
+  
+  <!-- So that users can add custom icons -->
+  <script src="../../../_static/scripts/fontawesome.js?digest=26a4bc78f4c0ddb94549"></script>
+  <!-- Pre-loaded scripts that we'll load fully later -->
+  <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
+<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
+
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
+    <script src="../../../_static/doctools.js?v=9a2dae69"></script>
+    <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
+    <script>DOCUMENTATION_OPTIONS.pagename = 'python/nn/_autosummary/mlx.nn.AvgPool3d';</script>
+    <link rel="icon" href="../../../_static/mlx_logo.png"/>
+    <link rel="index" title="Index" href="../../../genindex.html" />
+    <link rel="search" title="Search" href="../../../search.html" />
+    <link rel="next" title="mlx.nn.BatchNorm" href="mlx.nn.BatchNorm.html" />
+    <link rel="prev" title="mlx.nn.AvgPool2d" href="mlx.nn.AvgPool2d.html" />
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <meta name="docsearch:language" content="en"/>
+  <meta name="docsearch:version" content="0.21.0" />
+  </head>
+  
+  
+  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
+
+  
+  
+  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
+  
+  <div id="pst-scroll-pixel-helper"></div>
+  
+  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
+    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
+
+  
+  <dialog id="pst-search-dialog">
+    
+<form class="bd-search d-flex align-items-center"
+      action="../../../search.html"
+      method="get">
+  <i class="fa-solid fa-magnifying-glass"></i>
+  <input type="search"
+         class="form-control"
+         name="q"
+         placeholder="Search..."
+         aria-label="Search..."
+         autocomplete="off"
+         autocorrect="off"
+         autocapitalize="off"
+         spellcheck="false"/>
+  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
+</form>
+  </dialog>
+
+  <div class="pst-async-banner-revealer d-none">
+  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
+</div>
+
+  
+    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
+    </header>
+  
+
+  <div class="bd-container">
+    <div class="bd-container__inner bd-page-width">
+      
+      
+      
+      <dialog id="pst-primary-sidebar-modal"></dialog>
+      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
+        
+
+  
+  <div class="sidebar-header-items sidebar-primary__section">
+    
+    
+    
+    
+  </div>
+  
+    <div class="sidebar-primary-items__start sidebar-primary__section">
+        <div class="sidebar-primary-item">
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../../../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
+  
+  
+</a></div>
+        <div class="sidebar-primary-item">
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button></div>
+        <div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
+    <div class="bd-toc-item navbar-nav active">
+        <p aria-level="2" class="caption" role="heading"><span class="caption-text">Install</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../install.html">Build and Install</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Usage</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/quick_start.html">Quick Start Guide</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/lazy_evaluation.html">Lazy Evaluation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/unified_memory.html">Unified Memory</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/indexing.html">Indexing Arrays</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/saving_and_loading.html">Saving and Loading Arrays</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/function_transforms.html">Function Transforms</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/compile.html">Compilation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/numpy.html">Conversion to NumPy and Other Frameworks</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/distributed.html">Distributed Communication</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/using_streams.html">Using Streams</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../examples/linear_regression.html">Linear Regression</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../examples/mlp.html">Multi-Layer Perceptron</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../examples/llama-inference.html">LLM inference</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Python API Reference</span></p>
+<ul class="current nav bd-sidenav">
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../array.html">Array</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.html">mlx.core.array</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.astype.html">mlx.core.array.astype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.at.html">mlx.core.array.at</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.item.html">mlx.core.array.item</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.tolist.html">mlx.core.array.tolist</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.dtype.html">mlx.core.array.dtype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.itemsize.html">mlx.core.array.itemsize</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.nbytes.html">mlx.core.array.nbytes</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.ndim.html">mlx.core.array.ndim</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.shape.html">mlx.core.array.shape</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.size.html">mlx.core.array.size</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.abs.html">mlx.core.array.abs</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.all.html">mlx.core.array.all</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.any.html">mlx.core.array.any</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.argmax.html">mlx.core.array.argmax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.argmin.html">mlx.core.array.argmin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.conj.html">mlx.core.array.conj</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cos.html">mlx.core.array.cos</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cummax.html">mlx.core.array.cummax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cummin.html">mlx.core.array.cummin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cumprod.html">mlx.core.array.cumprod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cumsum.html">mlx.core.array.cumsum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.diag.html">mlx.core.array.diag</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.diagonal.html">mlx.core.array.diagonal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.exp.html">mlx.core.array.exp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.flatten.html">mlx.core.array.flatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log.html">mlx.core.array.log</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log10.html">mlx.core.array.log10</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log1p.html">mlx.core.array.log1p</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log2.html">mlx.core.array.log2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.logsumexp.html">mlx.core.array.logsumexp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.max.html">mlx.core.array.max</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.mean.html">mlx.core.array.mean</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.min.html">mlx.core.array.min</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.moveaxis.html">mlx.core.array.moveaxis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.prod.html">mlx.core.array.prod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.reciprocal.html">mlx.core.array.reciprocal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.reshape.html">mlx.core.array.reshape</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.round.html">mlx.core.array.round</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.rsqrt.html">mlx.core.array.rsqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.sin.html">mlx.core.array.sin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.split.html">mlx.core.array.split</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.sqrt.html">mlx.core.array.sqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.square.html">mlx.core.array.square</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.squeeze.html">mlx.core.array.squeeze</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.std.html">mlx.core.array.std</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.sum.html">mlx.core.array.sum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.swapaxes.html">mlx.core.array.swapaxes</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.transpose.html">mlx.core.array.transpose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.T.html">mlx.core.array.T</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.var.html">mlx.core.array.var</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.view.html">mlx.core.array.view</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../data_types.html">Data Types</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.Dtype.html">mlx.core.Dtype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.DtypeCategory.html">mlx.core.DtypeCategory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.issubdtype.html">mlx.core.issubdtype</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../devices_and_streams.html">Devices and Streams</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.Device.html">mlx.core.Device</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/stream_class.html">mlx.core.Stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.default_device.html">mlx.core.default_device</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.set_default_device.html">mlx.core.set_default_device</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.default_stream.html">mlx.core.default_stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.new_stream.html">mlx.core.new_stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.set_default_stream.html">mlx.core.set_default_stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.stream.html">mlx.core.stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.synchronize.html">mlx.core.synchronize</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../ops.html">Operations</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.abs.html">mlx.core.abs</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.add.html">mlx.core.add</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.addmm.html">mlx.core.addmm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.all.html">mlx.core.all</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.allclose.html">mlx.core.allclose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.any.html">mlx.core.any</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arange.html">mlx.core.arange</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arccos.html">mlx.core.arccos</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arccosh.html">mlx.core.arccosh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arcsin.html">mlx.core.arcsin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arcsinh.html">mlx.core.arcsinh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arctan.html">mlx.core.arctan</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arctan2.html">mlx.core.arctan2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arctanh.html">mlx.core.arctanh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argmax.html">mlx.core.argmax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argmin.html">mlx.core.argmin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argpartition.html">mlx.core.argpartition</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argsort.html">mlx.core.argsort</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array_equal.html">mlx.core.array_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.as_strided.html">mlx.core.as_strided</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.atleast_1d.html">mlx.core.atleast_1d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.atleast_2d.html">mlx.core.atleast_2d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.atleast_3d.html">mlx.core.atleast_3d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.bitwise_and.html">mlx.core.bitwise_and</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.bitwise_or.html">mlx.core.bitwise_or</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.bitwise_xor.html">mlx.core.bitwise_xor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.block_masked_mm.html">mlx.core.block_masked_mm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.broadcast_to.html">mlx.core.broadcast_to</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.ceil.html">mlx.core.ceil</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.clip.html">mlx.core.clip</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.concatenate.html">mlx.core.concatenate</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conj.html">mlx.core.conj</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conjugate.html">mlx.core.conjugate</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.convolve.html">mlx.core.convolve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv1d.html">mlx.core.conv1d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv2d.html">mlx.core.conv2d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv3d.html">mlx.core.conv3d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_transpose1d.html">mlx.core.conv_transpose1d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_transpose2d.html">mlx.core.conv_transpose2d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_transpose3d.html">mlx.core.conv_transpose3d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_general.html">mlx.core.conv_general</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cos.html">mlx.core.cos</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cosh.html">mlx.core.cosh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cummax.html">mlx.core.cummax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cummin.html">mlx.core.cummin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cumprod.html">mlx.core.cumprod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cumsum.html">mlx.core.cumsum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.degrees.html">mlx.core.degrees</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.dequantize.html">mlx.core.dequantize</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.diag.html">mlx.core.diag</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.diagonal.html">mlx.core.diagonal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.divide.html">mlx.core.divide</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.divmod.html">mlx.core.divmod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.einsum.html">mlx.core.einsum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.einsum_path.html">mlx.core.einsum_path</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.equal.html">mlx.core.equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.erf.html">mlx.core.erf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.erfinv.html">mlx.core.erfinv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.exp.html">mlx.core.exp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.expm1.html">mlx.core.expm1</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.expand_dims.html">mlx.core.expand_dims</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.eye.html">mlx.core.eye</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.flatten.html">mlx.core.flatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.floor.html">mlx.core.floor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.floor_divide.html">mlx.core.floor_divide</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.full.html">mlx.core.full</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.gather_mm.html">mlx.core.gather_mm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.gather_qmm.html">mlx.core.gather_qmm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.greater.html">mlx.core.greater</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.greater_equal.html">mlx.core.greater_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.hadamard_transform.html">mlx.core.hadamard_transform</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.identity.html">mlx.core.identity</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.imag.html">mlx.core.imag</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.inner.html">mlx.core.inner</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isfinite.html">mlx.core.isfinite</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isclose.html">mlx.core.isclose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isinf.html">mlx.core.isinf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isnan.html">mlx.core.isnan</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isneginf.html">mlx.core.isneginf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isposinf.html">mlx.core.isposinf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.issubdtype.html">mlx.core.issubdtype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.left_shift.html">mlx.core.left_shift</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.less.html">mlx.core.less</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.less_equal.html">mlx.core.less_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linspace.html">mlx.core.linspace</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.load.html">mlx.core.load</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log.html">mlx.core.log</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log2.html">mlx.core.log2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log10.html">mlx.core.log10</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log1p.html">mlx.core.log1p</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logaddexp.html">mlx.core.logaddexp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logical_not.html">mlx.core.logical_not</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logical_and.html">mlx.core.logical_and</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logical_or.html">mlx.core.logical_or</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logsumexp.html">mlx.core.logsumexp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.matmul.html">mlx.core.matmul</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.max.html">mlx.core.max</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.maximum.html">mlx.core.maximum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.mean.html">mlx.core.mean</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.meshgrid.html">mlx.core.meshgrid</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.min.html">mlx.core.min</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.minimum.html">mlx.core.minimum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.moveaxis.html">mlx.core.moveaxis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.multiply.html">mlx.core.multiply</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.nan_to_num.html">mlx.core.nan_to_num</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.negative.html">mlx.core.negative</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.not_equal.html">mlx.core.not_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.ones.html">mlx.core.ones</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.ones_like.html">mlx.core.ones_like</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.outer.html">mlx.core.outer</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.partition.html">mlx.core.partition</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.pad.html">mlx.core.pad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.power.html">mlx.core.power</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.prod.html">mlx.core.prod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.put_along_axis.html">mlx.core.put_along_axis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.quantize.html">mlx.core.quantize</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.quantized_matmul.html">mlx.core.quantized_matmul</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.radians.html">mlx.core.radians</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.real.html">mlx.core.real</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.reciprocal.html">mlx.core.reciprocal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.remainder.html">mlx.core.remainder</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.repeat.html">mlx.core.repeat</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.reshape.html">mlx.core.reshape</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.right_shift.html">mlx.core.right_shift</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.roll.html">mlx.core.roll</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.round.html">mlx.core.round</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.rsqrt.html">mlx.core.rsqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.save.html">mlx.core.save</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.savez.html">mlx.core.savez</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.savez_compressed.html">mlx.core.savez_compressed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.save_gguf.html">mlx.core.save_gguf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.save_safetensors.html">mlx.core.save_safetensors</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sigmoid.html">mlx.core.sigmoid</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sign.html">mlx.core.sign</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sin.html">mlx.core.sin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sinh.html">mlx.core.sinh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.softmax.html">mlx.core.softmax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sort.html">mlx.core.sort</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.split.html">mlx.core.split</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sqrt.html">mlx.core.sqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.square.html">mlx.core.square</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.squeeze.html">mlx.core.squeeze</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.stack.html">mlx.core.stack</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.std.html">mlx.core.std</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.stop_gradient.html">mlx.core.stop_gradient</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.subtract.html">mlx.core.subtract</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sum.html">mlx.core.sum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.swapaxes.html">mlx.core.swapaxes</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.take.html">mlx.core.take</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.take_along_axis.html">mlx.core.take_along_axis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tan.html">mlx.core.tan</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tanh.html">mlx.core.tanh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tensordot.html">mlx.core.tensordot</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tile.html">mlx.core.tile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.topk.html">mlx.core.topk</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.trace.html">mlx.core.trace</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.transpose.html">mlx.core.transpose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tri.html">mlx.core.tri</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tril.html">mlx.core.tril</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.triu.html">mlx.core.triu</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.var.html">mlx.core.var</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.view.html">mlx.core.view</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.where.html">mlx.core.where</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.zeros.html">mlx.core.zeros</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.zeros_like.html">mlx.core.zeros_like</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../random.html">Random</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.bernoulli.html">mlx.core.random.bernoulli</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.categorical.html">mlx.core.random.categorical</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.gumbel.html">mlx.core.random.gumbel</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.key.html">mlx.core.random.key</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.normal.html">mlx.core.random.normal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.multivariate_normal.html">mlx.core.random.multivariate_normal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.randint.html">mlx.core.random.randint</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.seed.html">mlx.core.random.seed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.split.html">mlx.core.random.split</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.truncated_normal.html">mlx.core.random.truncated_normal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.uniform.html">mlx.core.random.uniform</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.laplace.html">mlx.core.random.laplace</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.permutation.html">mlx.core.random.permutation</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../transforms.html">Transforms</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.eval.html">mlx.core.eval</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.compile.html">mlx.core.compile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.custom_function.html">mlx.core.custom_function</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.disable_compile.html">mlx.core.disable_compile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.enable_compile.html">mlx.core.enable_compile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.grad.html">mlx.core.grad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.value_and_grad.html">mlx.core.value_and_grad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.jvp.html">mlx.core.jvp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.vjp.html">mlx.core.vjp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.vmap.html">mlx.core.vmap</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../fast.html">Fast</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rms_norm.html">mlx.core.fast.rms_norm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../fft.html">FFT</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.fft.html">mlx.core.fft.fft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.ifft.html">mlx.core.fft.ifft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.fft2.html">mlx.core.fft.fft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.ifft2.html">mlx.core.fft.ifft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.fftn.html">mlx.core.fft.fftn</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.ifftn.html">mlx.core.fft.ifftn</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.rfft.html">mlx.core.fft.rfft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.irfft.html">mlx.core.fft.irfft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.rfft2.html">mlx.core.fft.rfft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.irfft2.html">mlx.core.fft.irfft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.rfftn.html">mlx.core.fft.rfftn</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.irfftn.html">mlx.core.fft.irfftn</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../linalg.html">Linear Algebra</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.inv.html">mlx.core.linalg.inv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.tri_inv.html">mlx.core.linalg.tri_inv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.norm.html">mlx.core.linalg.norm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.cholesky.html">mlx.core.linalg.cholesky</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.cholesky_inv.html">mlx.core.linalg.cholesky_inv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.cross.html">mlx.core.linalg.cross</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.qr.html">mlx.core.linalg.qr</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.svd.html">mlx.core.linalg.svd</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.eigvalsh.html">mlx.core.linalg.eigvalsh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.eigh.html">mlx.core.linalg.eigh</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../metal.html">Metal</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.is_available.html">mlx.core.metal.is_available</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.device_info.html">mlx.core.metal.device_info</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.get_active_memory.html">mlx.core.metal.get_active_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.get_peak_memory.html">mlx.core.metal.get_peak_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.reset_peak_memory.html">mlx.core.metal.reset_peak_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.get_cache_memory.html">mlx.core.metal.get_cache_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.set_memory_limit.html">mlx.core.metal.set_memory_limit</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.set_cache_limit.html">mlx.core.metal.set_cache_limit</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.set_wired_limit.html">mlx.core.metal.set_wired_limit</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.clear_cache.html">mlx.core.metal.clear_cache</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.start_capture.html">mlx.core.metal.start_capture</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.stop_capture.html">mlx.core.metal.stop_capture</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 current active has-children"><a class="reference internal" href="../../nn.html">Neural Networks</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.nn.value_and_grad.html">mlx.nn.value_and_grad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.nn.quantize.html">mlx.nn.quantize</a></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../module.html">Module</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.training.html">mlx.nn.Module.training</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.state.html">mlx.nn.Module.state</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.apply.html">mlx.nn.Module.apply</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.apply_to_modules.html">mlx.nn.Module.apply_to_modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.children.html">mlx.nn.Module.children</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.eval.html">mlx.nn.Module.eval</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.filter_and_map.html">mlx.nn.Module.filter_and_map</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.freeze.html">mlx.nn.Module.freeze</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.leaf_modules.html">mlx.nn.Module.leaf_modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.load_weights.html">mlx.nn.Module.load_weights</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.modules.html">mlx.nn.Module.modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.named_modules.html">mlx.nn.Module.named_modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.parameters.html">mlx.nn.Module.parameters</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.save_weights.html">mlx.nn.Module.save_weights</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.set_dtype.html">mlx.nn.Module.set_dtype</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.train.html">mlx.nn.Module.train</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.trainable_parameters.html">mlx.nn.Module.trainable_parameters</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.unfreeze.html">mlx.nn.Module.unfreeze</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.update.html">mlx.nn.Module.update</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.update_modules.html">mlx.nn.Module.update_modules</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 current active has-children"><a class="reference internal" href="../layers.html">Layers</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.AvgPool3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv2d.html">mlx.nn.Conv2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv3d.html">mlx.nn.Conv3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ConvTranspose1d.html">mlx.nn.ConvTranspose1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ConvTranspose2d.html">mlx.nn.ConvTranspose2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ConvTranspose3d.html">mlx.nn.ConvTranspose3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Dropout.html">mlx.nn.Dropout</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Dropout2d.html">mlx.nn.Dropout2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Dropout3d.html">mlx.nn.Dropout3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Embedding.html">mlx.nn.Embedding</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ELU.html">mlx.nn.ELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GELU.html">mlx.nn.GELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GLU.html">mlx.nn.GLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GroupNorm.html">mlx.nn.GroupNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GRU.html">mlx.nn.GRU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.HardShrink.html">mlx.nn.HardShrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.HardTanh.html">mlx.nn.HardTanh</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Hardswish.html">mlx.nn.Hardswish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.InstanceNorm.html">mlx.nn.InstanceNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LayerNorm.html">mlx.nn.LayerNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LeakyReLU.html">mlx.nn.LeakyReLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Linear.html">mlx.nn.Linear</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LogSigmoid.html">mlx.nn.LogSigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LogSoftmax.html">mlx.nn.LogSoftmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.QuantizedEmbedding.html">mlx.nn.QuantizedEmbedding</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.QuantizedLinear.html">mlx.nn.QuantizedLinear</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.RMSNorm.html">mlx.nn.RMSNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ReLU.html">mlx.nn.ReLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ReLU6.html">mlx.nn.ReLU6</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.RNN.html">mlx.nn.RNN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.RoPE.html">mlx.nn.RoPE</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.SELU.html">mlx.nn.SELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Sequential.html">mlx.nn.Sequential</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Sigmoid.html">mlx.nn.Sigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.SiLU.html">mlx.nn.SiLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.SinusoidalPositionalEncoding.html">mlx.nn.SinusoidalPositionalEncoding</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softmin.html">mlx.nn.Softmin</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softshrink.html">mlx.nn.Softshrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softsign.html">mlx.nn.Softsign</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softmax.html">mlx.nn.Softmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softplus.html">mlx.nn.Softplus</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Step.html">mlx.nn.Step</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Tanh.html">mlx.nn.Tanh</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Transformer.html">mlx.nn.Transformer</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Upsample.html">mlx.nn.Upsample</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../functions.html">Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.elu.html">mlx.nn.elu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.celu.html">mlx.nn.celu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.gelu.html">mlx.nn.gelu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.gelu_approx.html">mlx.nn.gelu_approx</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.gelu_fast_approx.html">mlx.nn.gelu_fast_approx</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.glu.html">mlx.nn.glu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.hard_shrink.html">mlx.nn.hard_shrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.hard_tanh.html">mlx.nn.hard_tanh</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.hardswish.html">mlx.nn.hardswish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.leaky_relu.html">mlx.nn.leaky_relu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.log_sigmoid.html">mlx.nn.log_sigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.log_softmax.html">mlx.nn.log_softmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.mish.html">mlx.nn.mish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.prelu.html">mlx.nn.prelu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.relu.html">mlx.nn.relu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.relu6.html">mlx.nn.relu6</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.selu.html">mlx.nn.selu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.sigmoid.html">mlx.nn.sigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.silu.html">mlx.nn.silu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softmax.html">mlx.nn.softmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softmin.html">mlx.nn.softmin</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softplus.html">mlx.nn.softplus</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softshrink.html">mlx.nn.softshrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.step.html">mlx.nn.step</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.tanh.html">mlx.nn.tanh</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../losses.html">Loss Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.binary_cross_entropy.html">mlx.nn.losses.binary_cross_entropy</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html">mlx.nn.losses.cosine_similarity_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.cross_entropy.html">mlx.nn.losses.cross_entropy</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.html">mlx.nn.losses.gaussian_nll_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.hinge_loss.html">mlx.nn.losses.hinge_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.huber_loss.html">mlx.nn.losses.huber_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.kl_div_loss.html">mlx.nn.losses.kl_div_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.l1_loss.html">mlx.nn.losses.l1_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.log_cosh_loss.html">mlx.nn.losses.log_cosh_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.margin_ranking_loss.html">mlx.nn.losses.margin_ranking_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.mse_loss.html">mlx.nn.losses.mse_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.nll_loss.html">mlx.nn.losses.nll_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.smooth_l1_loss.html">mlx.nn.losses.smooth_l1_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.triplet_loss.html">mlx.nn.losses.triplet_loss</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../init.html">Initializers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.constant.html">mlx.nn.init.constant</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.normal.html">mlx.nn.init.normal</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.uniform.html">mlx.nn.init.uniform</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.identity.html">mlx.nn.init.identity</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.glorot_normal.html">mlx.nn.init.glorot_normal</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.glorot_uniform.html">mlx.nn.init.glorot_uniform</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.he_normal.html">mlx.nn.init.he_normal</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.he_uniform.html">mlx.nn.init.he_uniform</a></li>
+</ul>
+</details></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../optimizers.html">Optimizers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../../optimizers/optimizer.html">Optimizer</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.state.html">mlx.optimizers.Optimizer.state</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.html">mlx.optimizers.Optimizer.apply_gradients</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.init.html">mlx.optimizers.Optimizer.init</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.update.html">mlx.optimizers.Optimizer.update</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../../optimizers/common_optimizers.html">Common Optimizers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.SGD.html">mlx.optimizers.SGD</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.RMSprop.html">mlx.optimizers.RMSprop</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adagrad.html">mlx.optimizers.Adagrad</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adafactor.html">mlx.optimizers.Adafactor</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.AdaDelta.html">mlx.optimizers.AdaDelta</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adam.html">mlx.optimizers.Adam</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.AdamW.html">mlx.optimizers.AdamW</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adamax.html">mlx.optimizers.Adamax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Lion.html">mlx.optimizers.Lion</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../../optimizers/schedulers.html">Schedulers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.cosine_decay.html">mlx.optimizers.cosine_decay</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.exponential_decay.html">mlx.optimizers.exponential_decay</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.join_schedules.html">mlx.optimizers.join_schedules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.linear_schedule.html">mlx.optimizers.linear_schedule</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.step_decay.html">mlx.optimizers.step_decay</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.optimizers.clip_grad_norm.html">mlx.optimizers.clip_grad_norm</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../distributed.html">Distributed Communication</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.Group.html">mlx.core.distributed.Group</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.is_available.html">mlx.core.distributed.is_available</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.init.html">mlx.core.distributed.init</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.all_sum.html">mlx.core.distributed.all_sum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.all_gather.html">mlx.core.distributed.all_gather</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.send.html">mlx.core.distributed.send</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.recv.html">mlx.core.distributed.recv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.recv_like.html">mlx.core.distributed.recv_like</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../tree_utils.html">Tree Utils</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_flatten.html">mlx.utils.tree_flatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_unflatten.html">mlx.utils.tree_unflatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_map.html">mlx.utils.tree_map</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_map_with_path.html">mlx.utils.tree_map_with_path</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_reduce.html">mlx.utils.tree_reduce</a></li>
+</ul>
+</details></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../cpp/ops.html">Operations</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Further Reading</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../dev/extensions.html">Custom Extensions in MLX</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../dev/metal_debugger.html">Metal Debugger</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../dev/custom_metal_kernels.html">Custom Metal Kernels</a></li>
+</ul>
+
+    </div>
+</nav></div>
+    </div>
+  
+  
+  <div class="sidebar-primary-items__end sidebar-primary__section">
+  </div>
+  
+  <div id="rtd-footer-container"></div>
+
+
+      </div>
+      
+      <main id="main-content" class="bd-main" role="main">
+        
+        
+
+<div class="sbt-scroll-pixel-helper"></div>
+
+          <div class="bd-content">
+            <div class="bd-article-container">
+              
+              <div class="bd-header-article d-print-none">
+<div class="header-article-items header-article__inner">
+  
+    <div class="header-article-items__start">
+      
+        <div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <span class="fa-solid fa-bars"></span>
+</button></div>
+      
+    </div>
+  
+  
+    <div class="header-article-items__end">
+      
+        <div class="header-article-item">
+
+<div class="article-header-buttons">
+
+
+<a href="https://github.com/ml-explore/mlx" target="_blank"
+   class="btn btn-sm btn-source-repository-button"
+   title="Source repository"
+   data-bs-placement="bottom" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fab fa-github"></i>
+  </span>
+
+</a>
+
+
+
+
+
+
+<div class="dropdown dropdown-download-buttons">
+  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
+    <i class="fas fa-download"></i>
+  </button>
+  <ul class="dropdown-menu">
+      
+      
+      
+      <li><a href="../../../_sources/python/nn/_autosummary/mlx.nn.AvgPool3d.rst" target="_blank"
+   class="btn btn-sm btn-download-source-button dropdown-item"
+   title="Download source file"
+   data-bs-placement="left" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fas fa-file"></i>
+  </span>
+<span class="btn__text-container">.rst</span>
+</a>
+</li>
+      
+      
+      
+      
+      <li>
+<button onclick="window.print()"
+  class="btn btn-sm btn-download-pdf-button dropdown-item"
+  title="Print to PDF"
+  data-bs-placement="left" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fas fa-file-pdf"></i>
+  </span>
+<span class="btn__text-container">.pdf</span>
+</button>
+</li>
+      
+  </ul>
+</div>
+
+
+
+
+<button onclick="toggleFullScreen()"
+  class="btn btn-sm btn-fullscreen-button"
+  title="Fullscreen mode"
+  data-bs-placement="bottom" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fas fa-expand"></i>
+  </span>
+
+</button>
+
+
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button>
+
+
+<button class="btn btn-sm pst-navbar-icon search-button search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+    <i class="fa-solid fa-magnifying-glass fa-lg"></i>
+</button>
+<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
+    <span class="fa-solid fa-list"></span>
+</button>
+</div></div>
+      
+    </div>
+  
+</div>
+</div>
+              
+              
+
+<div id="jb-print-docs-body" class="onlyprint">
+    <h1>mlx.nn.AvgPool3d</h1>
+    <!-- Table of contents -->
+    <div id="print-main-content">
+        <div id="jb-print-toc">
+            
+            <div>
+                <h2> Contents </h2>
+            </div>
+            <nav aria-label="Page">
+                <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mlx.nn.AvgPool3d"><code class="docutils literal notranslate"><span class="pre">AvgPool3d</span></code></a></li>
+</ul>
+            </nav>
+        </div>
+    </div>
+</div>
+
+              
+                
+<div id="searchbox"></div>
+                <article class="bd-article">
+                  
+  <section id="mlx-nn-avgpool3d">
+<h1>mlx.nn.AvgPool3d<a class="headerlink" href="#mlx-nn-avgpool3d" title="Link to this heading">#</a></h1>
+<dl class="py class">
+<dt class="sig sig-object py" id="mlx.nn.AvgPool3d">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">AvgPool3d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span 
class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference 
external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.AvgPool3d" title="Link to this definition">#</a></dt>
+<dd><p>Applies 3-dimensional average pooling.</p>
+<p>Spatially downsamples the input by taking the average of a sliding window
+of size <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> and sliding stride <code class="docutils literal notranslate"><span class="pre">stride</span></code>.</p>
+<p>The parameters <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>, <code class="docutils literal notranslate"><span class="pre">stride</span></code>, and <code class="docutils literal notranslate"><span class="pre">padding</span></code> can either be:</p>
+<ul class="simple">
+<li><p>a single <code class="docutils literal notranslate"><span class="pre">int</span></code> – in which case the same value is used for the depth,
+height, and width axes.</p></li>
+<li><p>a <code class="docutils literal notranslate"><span class="pre">tuple</span></code> of three <code class="docutils literal notranslate"><span class="pre">int</span></code> values – in which case, the first <code class="docutils literal notranslate"><span class="pre">int</span></code> is used
+for the depth axis, the second <code class="docutils literal notranslate"><span class="pre">int</span></code> for the height axis, and the third
+<code class="docutils literal notranslate"><span class="pre">int</span></code> for the width axis.</p></li>
+</ul>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>kernel_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>(</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>)</em>) – The size of the pooling window.</p></li>
+<li><p><strong>stride</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>(</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>)</em><em>, </em><em>optional</em>) – The stride of the pooling
+window. Default: <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>.</p></li>
+<li><p><strong>padding</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>(</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>)</em><em>, </em><em>optional</em>) – How much zero
+padding to apply to the input. The padding is applied on both sides
+of the depth, height, and width axes. Default: <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
+</ul>
+</dd>
+</dl>
+<p class="rubric">Examples</p>
+<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">mlx.core</span> <span class="k">as</span> <span class="nn">mx</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">mlx.nn.layers</span> <span class="k">as</span> <span class="nn">nn</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="mi">32</span><span class="p">,</span> <span class="mi">32</span><span class="p">,</span> <span class="mi">4</span><span class="p">))</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="n">pool</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">AvgPool3d</span><span class="p">(</span><span class="n">kernel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="n">pool</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
+</pre></div>
+</div>
+<p class="rubric">Methods</p>
+<div class="pst-scrollable-table-container"><table class="autosummary longtable table autosummary">
+<tbody>
+</tbody>
+</table>
+</div>
+</dd></dl>
+
+</section>
+
+
+                </article>
+              
+
+              
+              
+              
+              
+                <footer class="prev-next-footer d-print-none">
+                  
+<div class="prev-next-area">
+    <a class="left-prev"
+       href="mlx.nn.AvgPool2d.html"
+       title="previous page">
+      <i class="fa-solid fa-angle-left"></i>
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">previous</p>
+        <p class="prev-next-title">mlx.nn.AvgPool2d</p>
+      </div>
+    </a>
+    <a class="right-next"
+       href="mlx.nn.BatchNorm.html"
+       title="next page">
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">next</p>
+        <p class="prev-next-title">mlx.nn.BatchNorm</p>
+      </div>
+      <i class="fa-solid fa-angle-right"></i>
+    </a>
+</div>
+                </footer>
+              
+            </div>
+            
+            
+              
+                <dialog id="pst-secondary-sidebar-modal"></dialog>
+                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
+
+
+  <div class="sidebar-secondary-item">
+  <div class="page-toc tocsection onthispage">
+    <i class="fa-solid fa-list"></i> Contents
+  </div>
+  <nav class="bd-toc-nav page-toc">
+    <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mlx.nn.AvgPool3d"><code class="docutils literal notranslate"><span class="pre">AvgPool3d</span></code></a></li>
+</ul>
+  </nav></div>
+
+</div></div>
+              
+            
+          </div>
+          <footer class="bd-footer-content">
+            
+<div class="bd-footer-content__inner container">
+  
+  <div class="footer-item">
+    
+<p class="component-author">
+By MLX Contributors
+</p>
+
+  </div>
+  
+  <div class="footer-item">
+    
+
+  <p class="copyright">
+    
+      © Copyright 2023, MLX Contributors.
+      <br/>
+    
+  </p>
+
+  </div>
+  
+  <div class="footer-item">
+    
+  </div>
+  
+  <div class="footer-item">
+    
+  </div>
+  
+</div>
+          </footer>
+        
+
+      </main>
+    </div>
+  </div>
+  
+  <!-- Scripts loaded after <body> so the DOM is not blocked -->
+  <script defer src="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549"></script>
+<script defer src="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549"></script>
+
+  <footer class="bd-footer">
+  </footer>
+  </body>
+</html>
\ No newline at end of file
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.BatchNorm.html b/docs/build/html/python/nn/_autosummary/mlx.nn.BatchNorm.html
index e70d267b7..c9058c077 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.BatchNorm.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.BatchNorm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.BatchNorm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.BatchNorm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -49,10 +49,10 @@
     <link rel="index" title="Index" href="../../../genindex.html" />
     <link rel="search" title="Search" href="../../../search.html" />
     <link rel="next" title="mlx.nn.CELU" href="mlx.nn.CELU.html" />
-    <link rel="prev" title="mlx.nn.AvgPool2d" href="mlx.nn.AvgPool2d.html" />
+    <link rel="prev" title="mlx.nn.AvgPool3d" href="mlx.nn.AvgPool3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -929,12 +930,12 @@ running mean and variance. Default: <code class="docutils literal notranslate"><
                   
 <div class="prev-next-area">
     <a class="left-prev"
-       href="mlx.nn.AvgPool2d.html"
+       href="mlx.nn.AvgPool3d.html"
        title="previous page">
       <i class="fa-solid fa-angle-left"></i>
       <div class="prev-next-info">
         <p class="prev-next-subtitle">previous</p>
-        <p class="prev-next-title">mlx.nn.AvgPool2d</p>
+        <p class="prev-next-title">mlx.nn.AvgPool3d</p>
       </div>
     </a>
     <a class="right-next"
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.CELU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.CELU.html
index 2bfada290..2cfbaf981 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.CELU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.CELU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.CELU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.CELU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.BatchNorm" href="mlx.nn.BatchNorm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Conv1d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Conv1d.html
index 818eea496..a8129fe82 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Conv1d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Conv1d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Conv1d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Conv1d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.CELU" href="mlx.nn.CELU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Conv2d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Conv2d.html
index f348dd7a8..99c2f5b25 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Conv2d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Conv2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Conv2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Conv2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Conv1d" href="mlx.nn.Conv1d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -865,7 +866,7 @@
 <h1>mlx.nn.Conv2d<a class="headerlink" href="#mlx-nn-conv2d" title="Link to this heading">#</a></h1>
 <dl class="py class">
 <dt class="sig sig-object py" id="mlx.nn.Conv2d">
-<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">Conv2d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">in_channels</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">out_channels</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span class="pre">tuple</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span 
class="pre">tuple</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span class="pre">tuple</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dilation</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span class="pre">tuple</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><span class="pre">bool</span></a></span><span 
class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.Conv2d" title="Link to this definition">#</a></dt>
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">Conv2d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">in_channels</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">out_channels</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span class="pre">tuple</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span 
class="pre">tuple</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span class="pre">tuple</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dilation</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><span class="pre">tuple</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">groups</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span><span 
class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><span class="pre">bool</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.Conv2d" title="Link to this definition">#</a></dt>
 <dd><p>Applies a 2-dimensional convolution over the multi-channel input image.</p>
 <p>The channels are expected to be last i.e. the input shape should be <code class="docutils literal notranslate"><span class="pre">NHWC</span></code> where:</p>
 <ul class="simple">
@@ -885,6 +886,8 @@ applying the filter. Default: <code class="docutils literal notranslate"><span c
 <li><p><strong>padding</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>, </em><em>optional</em>) – How many positions to 0-pad
 the input with. Default: <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
 <li><p><strong>dilation</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>, </em><em>optional</em>) – The dilation of the convolution.</p></li>
+<li><p><strong>groups</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – The number of groups for the convolution.
+Default: <code class="docutils literal notranslate"><span class="pre">1</span></code>.</p></li>
 <li><p><strong>bias</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – If <code class="docutils literal notranslate"><span class="pre">True</span></code> add a learnable bias to the
 output. Default: <code class="docutils literal notranslate"><span class="pre">True</span></code></p></li>
 </ul>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Conv3d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Conv3d.html
index a8c5a859a..4cd2ff2b1 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Conv3d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Conv3d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Conv3d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Conv3d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Conv2d" href="mlx.nn.Conv2d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose1d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose1d.html
index 4b96b967e..dd149a60a 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose1d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose1d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.ConvTranspose1d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.ConvTranspose1d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Conv3d" href="mlx.nn.Conv3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose2d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose2d.html
index 71cb52491..70e389478 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose2d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.ConvTranspose2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.ConvTranspose2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.ConvTranspose1d" href="mlx.nn.ConvTranspose1d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose3d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose3d.html
index c870efc05..798ce3d7d 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose3d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.ConvTranspose3d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.ConvTranspose3d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.ConvTranspose3d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.ConvTranspose2d" href="mlx.nn.ConvTranspose2d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout.html
index 66cf6cf07..85bcffd50 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Dropout &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Dropout &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.ConvTranspose3d" href="mlx.nn.ConvTranspose3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout2d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout2d.html
index fbe6f4539..2a5fa07cf 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout2d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Dropout2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Dropout2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Dropout" href="mlx.nn.Dropout.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout3d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout3d.html
index 6cca8ff19..76d0ab41a 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout3d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Dropout3d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Dropout3d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Dropout3d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Dropout2d" href="mlx.nn.Dropout2d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.ELU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.ELU.html
index 6ad6cf84a..13bc9464a 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.ELU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.ELU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.ELU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.ELU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Embedding" href="mlx.nn.Embedding.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Embedding.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Embedding.html
index 3d3728c33..aecf5c76e 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Embedding.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Embedding.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Embedding &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Embedding &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Dropout3d" href="mlx.nn.Dropout3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.GELU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.GELU.html
index 935b65dee..5f48216cc 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.GELU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.GELU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.GELU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.GELU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.ELU" href="mlx.nn.ELU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.GLU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.GLU.html
index 247d82f2c..0782c6644 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.GLU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.GLU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.GLU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.GLU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.GELU" href="mlx.nn.GELU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.GRU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.GRU.html
index cbcf0a61d..774445ca5 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.GRU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.GRU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.GRU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.GRU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.GroupNorm" href="mlx.nn.GroupNorm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.GroupNorm.html b/docs/build/html/python/nn/_autosummary/mlx.nn.GroupNorm.html
index 70216bd4d..22f18d7d3 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.GroupNorm.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.GroupNorm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.GroupNorm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.GroupNorm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.GLU" href="mlx.nn.GLU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.HardShrink.html b/docs/build/html/python/nn/_autosummary/mlx.nn.HardShrink.html
index 61f2a2282..aa37d5054 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.HardShrink.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.HardShrink.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.HardShrink &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.HardShrink &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.GRU" href="mlx.nn.GRU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.HardTanh.html b/docs/build/html/python/nn/_autosummary/mlx.nn.HardTanh.html
index 144403ca6..89e9fa0a9 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.HardTanh.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.HardTanh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.HardTanh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.HardTanh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.HardShrink" href="mlx.nn.HardShrink.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Hardswish.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Hardswish.html
index ef845846c..28aa440e4 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Hardswish.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Hardswish.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Hardswish &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Hardswish &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.HardTanh" href="mlx.nn.HardTanh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.InstanceNorm.html b/docs/build/html/python/nn/_autosummary/mlx.nn.InstanceNorm.html
index b8a42a847..c2dfe360f 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.InstanceNorm.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.InstanceNorm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.InstanceNorm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.InstanceNorm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Hardswish" href="mlx.nn.Hardswish.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.LSTM.html b/docs/build/html/python/nn/_autosummary/mlx.nn.LSTM.html
index b89a1105d..ae82553f9 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.LSTM.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.LSTM.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.LSTM &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.LSTM &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.LogSoftmax" href="mlx.nn.LogSoftmax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.LayerNorm.html b/docs/build/html/python/nn/_autosummary/mlx.nn.LayerNorm.html
index dd7a0214d..70f7b85a1 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.LayerNorm.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.LayerNorm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.LayerNorm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.LayerNorm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.InstanceNorm" href="mlx.nn.InstanceNorm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.LeakyReLU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.LeakyReLU.html
index 704d1a175..abc49b8c3 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.LeakyReLU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.LeakyReLU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.LeakyReLU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.LeakyReLU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.LayerNorm" href="mlx.nn.LayerNorm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Linear.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Linear.html
index 92ff8f4d8..9844d84ab 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Linear.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Linear.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Linear &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Linear &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.LeakyReLU" href="mlx.nn.LeakyReLU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.LogSigmoid.html b/docs/build/html/python/nn/_autosummary/mlx.nn.LogSigmoid.html
index 7ea70fd22..36ee5b93f 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.LogSigmoid.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.LogSigmoid.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.LogSigmoid &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.LogSigmoid &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Linear" href="mlx.nn.Linear.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.LogSoftmax.html b/docs/build/html/python/nn/_autosummary/mlx.nn.LogSoftmax.html
index 9e76efb3a..41ee622d5 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.LogSoftmax.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.LogSoftmax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.LogSoftmax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.LogSoftmax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.LogSigmoid" href="mlx.nn.LogSigmoid.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool1d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool1d.html
index 174653acc..e4966cb1e 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool1d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool1d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.MaxPool1d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.MaxPool1d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,11 +39,10 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
-    <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
     <script>DOCUMENTATION_OPTIONS.pagename = 'python/nn/_autosummary/mlx.nn.MaxPool1d';</script>
     <link rel="icon" href="../../../_static/mlx_logo.png"/>
     <link rel="index" title="Index" href="../../../genindex.html" />
@@ -52,7 +51,7 @@
     <link rel="prev" title="mlx.nn.LSTM" href="mlx.nn.LSTM.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -868,16 +868,8 @@
 <dt class="sig sig-object py" id="mlx.nn.MaxPool1d">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">MaxPool1d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" 
href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.MaxPool1d" title="Link to this definition">#</a></dt>
 <dd><p>Applies 1-dimensional max pooling.</p>
-<p>Assuming an input of shape <span class="math notranslate nohighlight">\((N, L, C)\)</span> and <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> is
-<span class="math notranslate nohighlight">\(k\)</span>, the output is a tensor of shape <span class="math notranslate nohighlight">\((N, L_{out}, C)\)</span>, given
-by:</p>
-<blockquote>
-<div><div class="math notranslate nohighlight">
-\[\text{out}(N_i, t, C_j) = \max_{m=0, \ldots, k - 1}
-        \text{input}(N_i, \text{stride} \times t + m, C_j),\]</div>
-</div></blockquote>
-<p>where <span class="math notranslate nohighlight">\(L_{out} = \left\lfloor \frac{L + 2 \times \text{padding} -
-\text{kernel\_size}}{\text{stride}}\right\rfloor + 1\)</span>.</p>
+<p>Spatially downsamples the input by taking the maximum of a sliding window
+of size <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> and sliding stride <code class="docutils literal notranslate"><span class="pre">stride</span></code>.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool2d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool2d.html
index f45b3cd01..6b940f164 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool2d.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool2d.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.MaxPool2d &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.MaxPool2d &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,20 +39,19 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
-    <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
     <script>DOCUMENTATION_OPTIONS.pagename = 'python/nn/_autosummary/mlx.nn.MaxPool2d';</script>
     <link rel="icon" href="../../../_static/mlx_logo.png"/>
     <link rel="index" title="Index" href="../../../genindex.html" />
     <link rel="search" title="Search" href="../../../search.html" />
-    <link rel="next" title="mlx.nn.Mish" href="mlx.nn.Mish.html" />
+    <link rel="next" title="mlx.nn.MaxPool3d" href="mlx.nn.MaxPool3d.html" />
     <link rel="prev" title="mlx.nn.MaxPool1d" href="mlx.nn.MaxPool1d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -868,26 +868,15 @@
 <dt class="sig sig-object py" id="mlx.nn.MaxPool2d">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">MaxPool2d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span 
class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span 
class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.MaxPool2d" title="Link to this definition">#</a></dt>
 <dd><p>Applies 2-dimensional max pooling.</p>
-<p>Assuming an input of shape <span class="math notranslate nohighlight">\((N, H, W, C)\)</span> and <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> is
-<span class="math notranslate nohighlight">\((k_H, k_W)\)</span>, the output is a tensor of shape <span class="math notranslate nohighlight">\((N, H_{out},
-W_{out}, C)\)</span>, given by:</p>
-<div class="math notranslate nohighlight">
-\[\begin{split}\begin{aligned}
-    \text{out}(N_i, h, w, C_j) = &amp; \max_{m=0, \ldots, k_H-1} \max_{n=0, \ldots, k_W-1} \\
-                            &amp; \text{input}(N_i, \text{stride[0]} \times h + m,
-                                        \text{stride[1]} \times w + n, C_j),
-\end{aligned}\end{split}\]</div>
-<p>where <span class="math notranslate nohighlight">\(H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel\_size[0]}}{\text{stride[0]}}\right\rfloor + 1\)</span>,
-<span class="math notranslate nohighlight">\(W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel\_size[1]}}{\text{stride[1]}}\right\rfloor + 1\)</span>.</p>
-<p>The parameters <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>, <code class="docutils literal notranslate"><span class="pre">stride</span></code>, <code class="docutils literal notranslate"><span class="pre">padding</span></code>, can either be:</p>
-<blockquote>
-<div><ul class="simple">
+<p>Spatially downsamples the input by taking the maximum of a sliding window
+of size <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> and sliding stride <code class="docutils literal notranslate"><span class="pre">stride</span></code>.</p>
+<p>The parameters <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>, <code class="docutils literal notranslate"><span class="pre">stride</span></code>, and <code class="docutils literal notranslate"><span class="pre">padding</span></code> can either be:</p>
+<ul class="simple">
 <li><p>a single <code class="docutils literal notranslate"><span class="pre">int</span></code> – in which case the same value is used for both the
-height and width axis;</p></li>
+height and width axis.</p></li>
 <li><p>a <code class="docutils literal notranslate"><span class="pre">tuple</span></code> of two <code class="docutils literal notranslate"><span class="pre">int</span></code> s – in which case, the first <code class="docutils literal notranslate"><span class="pre">int</span></code> is
 used for the height axis, the second <code class="docutils literal notranslate"><span class="pre">int</span></code> for the width axis.</p></li>
 </ul>
-</div></blockquote>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
@@ -939,11 +928,11 @@ of the height and width axis. Default: <code class="docutils literal notranslate
       </div>
     </a>
     <a class="right-next"
-       href="mlx.nn.Mish.html"
+       href="mlx.nn.MaxPool3d.html"
        title="next page">
       <div class="prev-next-info">
         <p class="prev-next-subtitle">next</p>
-        <p class="prev-next-title">mlx.nn.Mish</p>
+        <p class="prev-next-title">mlx.nn.MaxPool3d</p>
       </div>
       <i class="fa-solid fa-angle-right"></i>
     </a>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool3d.html b/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool3d.html
new file mode 100644
index 000000000..c1bd13f32
--- /dev/null
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.MaxPool3d.html
@@ -0,0 +1,1012 @@
+
+<!DOCTYPE html>
+
+
+<html lang="en" data-content_root="../../../" >
+
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+
+    <title>mlx.nn.MaxPool3d &#8212; MLX 0.21.0 documentation</title>
+  
+  
+  
+  <script data-cfasync="false">
+    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
+    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
+  </script>
+  <!-- 
+    this give us a css class that will be invisible only if js is disabled 
+  -->
+  <noscript>
+    <style>
+      .pst-js-only { display: none !important; }
+
+    </style>
+  </noscript>
+  
+  <!-- Loaded before other Sphinx assets -->
+  <link href="../../../_static/styles/theme.css?digest=26a4bc78f4c0ddb94549" rel="stylesheet" />
+<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=26a4bc78f4c0ddb94549" rel="stylesheet" />
+
+    <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css?v=fa44fd50" />
+    <link rel="stylesheet" type="text/css" href="../../../_static/styles/sphinx-book-theme.css?v=a3416100" />
+  
+  <!-- So that users can add custom icons -->
+  <script src="../../../_static/scripts/fontawesome.js?digest=26a4bc78f4c0ddb94549"></script>
+  <!-- Pre-loaded scripts that we'll load fully later -->
+  <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
+<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
+
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
+    <script src="../../../_static/doctools.js?v=9a2dae69"></script>
+    <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
+    <script>DOCUMENTATION_OPTIONS.pagename = 'python/nn/_autosummary/mlx.nn.MaxPool3d';</script>
+    <link rel="icon" href="../../../_static/mlx_logo.png"/>
+    <link rel="index" title="Index" href="../../../genindex.html" />
+    <link rel="search" title="Search" href="../../../search.html" />
+    <link rel="next" title="mlx.nn.Mish" href="mlx.nn.Mish.html" />
+    <link rel="prev" title="mlx.nn.MaxPool2d" href="mlx.nn.MaxPool2d.html" />
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <meta name="docsearch:language" content="en"/>
+  <meta name="docsearch:version" content="0.21.0" />
+  </head>
+  
+  
+  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
+
+  
+  
+  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
+  
+  <div id="pst-scroll-pixel-helper"></div>
+  
+  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
+    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
+
+  
+  <dialog id="pst-search-dialog">
+    
+<form class="bd-search d-flex align-items-center"
+      action="../../../search.html"
+      method="get">
+  <i class="fa-solid fa-magnifying-glass"></i>
+  <input type="search"
+         class="form-control"
+         name="q"
+         placeholder="Search..."
+         aria-label="Search..."
+         autocomplete="off"
+         autocorrect="off"
+         autocapitalize="off"
+         spellcheck="false"/>
+  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
+</form>
+  </dialog>
+
+  <div class="pst-async-banner-revealer d-none">
+  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
+</div>
+
+  
+    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
+    </header>
+  
+
+  <div class="bd-container">
+    <div class="bd-container__inner bd-page-width">
+      
+      
+      
+      <dialog id="pst-primary-sidebar-modal"></dialog>
+      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
+        
+
+  
+  <div class="sidebar-header-items sidebar-primary__section">
+    
+    
+    
+    
+  </div>
+  
+    <div class="sidebar-primary-items__start sidebar-primary__section">
+        <div class="sidebar-primary-item">
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../../../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
+  
+  
+</a></div>
+        <div class="sidebar-primary-item">
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button></div>
+        <div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
+    <div class="bd-toc-item navbar-nav active">
+        <p aria-level="2" class="caption" role="heading"><span class="caption-text">Install</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../install.html">Build and Install</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Usage</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/quick_start.html">Quick Start Guide</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/lazy_evaluation.html">Lazy Evaluation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/unified_memory.html">Unified Memory</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/indexing.html">Indexing Arrays</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/saving_and_loading.html">Saving and Loading Arrays</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/function_transforms.html">Function Transforms</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/compile.html">Compilation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/numpy.html">Conversion to NumPy and Other Frameworks</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/distributed.html">Distributed Communication</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../usage/using_streams.html">Using Streams</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../examples/linear_regression.html">Linear Regression</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../examples/mlp.html">Multi-Layer Perceptron</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../examples/llama-inference.html">LLM inference</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Python API Reference</span></p>
+<ul class="current nav bd-sidenav">
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../array.html">Array</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.html">mlx.core.array</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.astype.html">mlx.core.array.astype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.at.html">mlx.core.array.at</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.item.html">mlx.core.array.item</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.tolist.html">mlx.core.array.tolist</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.dtype.html">mlx.core.array.dtype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.itemsize.html">mlx.core.array.itemsize</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.nbytes.html">mlx.core.array.nbytes</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.ndim.html">mlx.core.array.ndim</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.shape.html">mlx.core.array.shape</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.size.html">mlx.core.array.size</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.abs.html">mlx.core.array.abs</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.all.html">mlx.core.array.all</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.any.html">mlx.core.array.any</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.argmax.html">mlx.core.array.argmax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.argmin.html">mlx.core.array.argmin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.conj.html">mlx.core.array.conj</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cos.html">mlx.core.array.cos</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cummax.html">mlx.core.array.cummax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cummin.html">mlx.core.array.cummin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cumprod.html">mlx.core.array.cumprod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.cumsum.html">mlx.core.array.cumsum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.diag.html">mlx.core.array.diag</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.diagonal.html">mlx.core.array.diagonal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.exp.html">mlx.core.array.exp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.flatten.html">mlx.core.array.flatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log.html">mlx.core.array.log</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log10.html">mlx.core.array.log10</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log1p.html">mlx.core.array.log1p</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.log2.html">mlx.core.array.log2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.logsumexp.html">mlx.core.array.logsumexp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.max.html">mlx.core.array.max</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.mean.html">mlx.core.array.mean</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.min.html">mlx.core.array.min</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.moveaxis.html">mlx.core.array.moveaxis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.prod.html">mlx.core.array.prod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.reciprocal.html">mlx.core.array.reciprocal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.reshape.html">mlx.core.array.reshape</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.round.html">mlx.core.array.round</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.rsqrt.html">mlx.core.array.rsqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.sin.html">mlx.core.array.sin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.split.html">mlx.core.array.split</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.sqrt.html">mlx.core.array.sqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.square.html">mlx.core.array.square</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.squeeze.html">mlx.core.array.squeeze</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.std.html">mlx.core.array.std</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.sum.html">mlx.core.array.sum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.swapaxes.html">mlx.core.array.swapaxes</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.transpose.html">mlx.core.array.transpose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.T.html">mlx.core.array.T</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.var.html">mlx.core.array.var</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array.view.html">mlx.core.array.view</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../data_types.html">Data Types</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.Dtype.html">mlx.core.Dtype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.DtypeCategory.html">mlx.core.DtypeCategory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.issubdtype.html">mlx.core.issubdtype</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../devices_and_streams.html">Devices and Streams</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.Device.html">mlx.core.Device</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/stream_class.html">mlx.core.Stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.default_device.html">mlx.core.default_device</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.set_default_device.html">mlx.core.set_default_device</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.default_stream.html">mlx.core.default_stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.new_stream.html">mlx.core.new_stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.set_default_stream.html">mlx.core.set_default_stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.stream.html">mlx.core.stream</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.synchronize.html">mlx.core.synchronize</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../ops.html">Operations</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.abs.html">mlx.core.abs</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.add.html">mlx.core.add</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.addmm.html">mlx.core.addmm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.all.html">mlx.core.all</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.allclose.html">mlx.core.allclose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.any.html">mlx.core.any</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arange.html">mlx.core.arange</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arccos.html">mlx.core.arccos</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arccosh.html">mlx.core.arccosh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arcsin.html">mlx.core.arcsin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arcsinh.html">mlx.core.arcsinh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arctan.html">mlx.core.arctan</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arctan2.html">mlx.core.arctan2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.arctanh.html">mlx.core.arctanh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argmax.html">mlx.core.argmax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argmin.html">mlx.core.argmin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argpartition.html">mlx.core.argpartition</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.argsort.html">mlx.core.argsort</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.array_equal.html">mlx.core.array_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.as_strided.html">mlx.core.as_strided</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.atleast_1d.html">mlx.core.atleast_1d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.atleast_2d.html">mlx.core.atleast_2d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.atleast_3d.html">mlx.core.atleast_3d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.bitwise_and.html">mlx.core.bitwise_and</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.bitwise_or.html">mlx.core.bitwise_or</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.bitwise_xor.html">mlx.core.bitwise_xor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.block_masked_mm.html">mlx.core.block_masked_mm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.broadcast_to.html">mlx.core.broadcast_to</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.ceil.html">mlx.core.ceil</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.clip.html">mlx.core.clip</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.concatenate.html">mlx.core.concatenate</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conj.html">mlx.core.conj</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conjugate.html">mlx.core.conjugate</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.convolve.html">mlx.core.convolve</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv1d.html">mlx.core.conv1d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv2d.html">mlx.core.conv2d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv3d.html">mlx.core.conv3d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_transpose1d.html">mlx.core.conv_transpose1d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_transpose2d.html">mlx.core.conv_transpose2d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_transpose3d.html">mlx.core.conv_transpose3d</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.conv_general.html">mlx.core.conv_general</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cos.html">mlx.core.cos</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cosh.html">mlx.core.cosh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cummax.html">mlx.core.cummax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cummin.html">mlx.core.cummin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cumprod.html">mlx.core.cumprod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.cumsum.html">mlx.core.cumsum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.degrees.html">mlx.core.degrees</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.dequantize.html">mlx.core.dequantize</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.diag.html">mlx.core.diag</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.diagonal.html">mlx.core.diagonal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.divide.html">mlx.core.divide</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.divmod.html">mlx.core.divmod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.einsum.html">mlx.core.einsum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.einsum_path.html">mlx.core.einsum_path</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.equal.html">mlx.core.equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.erf.html">mlx.core.erf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.erfinv.html">mlx.core.erfinv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.exp.html">mlx.core.exp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.expm1.html">mlx.core.expm1</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.expand_dims.html">mlx.core.expand_dims</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.eye.html">mlx.core.eye</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.flatten.html">mlx.core.flatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.floor.html">mlx.core.floor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.floor_divide.html">mlx.core.floor_divide</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.full.html">mlx.core.full</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.gather_mm.html">mlx.core.gather_mm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.gather_qmm.html">mlx.core.gather_qmm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.greater.html">mlx.core.greater</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.greater_equal.html">mlx.core.greater_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.hadamard_transform.html">mlx.core.hadamard_transform</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.identity.html">mlx.core.identity</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.imag.html">mlx.core.imag</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.inner.html">mlx.core.inner</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isfinite.html">mlx.core.isfinite</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isclose.html">mlx.core.isclose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isinf.html">mlx.core.isinf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isnan.html">mlx.core.isnan</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isneginf.html">mlx.core.isneginf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.isposinf.html">mlx.core.isposinf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.issubdtype.html">mlx.core.issubdtype</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.left_shift.html">mlx.core.left_shift</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.less.html">mlx.core.less</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.less_equal.html">mlx.core.less_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linspace.html">mlx.core.linspace</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.load.html">mlx.core.load</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log.html">mlx.core.log</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log2.html">mlx.core.log2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log10.html">mlx.core.log10</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.log1p.html">mlx.core.log1p</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logaddexp.html">mlx.core.logaddexp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logical_not.html">mlx.core.logical_not</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logical_and.html">mlx.core.logical_and</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logical_or.html">mlx.core.logical_or</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.logsumexp.html">mlx.core.logsumexp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.matmul.html">mlx.core.matmul</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.max.html">mlx.core.max</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.maximum.html">mlx.core.maximum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.mean.html">mlx.core.mean</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.meshgrid.html">mlx.core.meshgrid</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.min.html">mlx.core.min</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.minimum.html">mlx.core.minimum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.moveaxis.html">mlx.core.moveaxis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.multiply.html">mlx.core.multiply</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.nan_to_num.html">mlx.core.nan_to_num</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.negative.html">mlx.core.negative</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.not_equal.html">mlx.core.not_equal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.ones.html">mlx.core.ones</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.ones_like.html">mlx.core.ones_like</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.outer.html">mlx.core.outer</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.partition.html">mlx.core.partition</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.pad.html">mlx.core.pad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.power.html">mlx.core.power</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.prod.html">mlx.core.prod</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.put_along_axis.html">mlx.core.put_along_axis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.quantize.html">mlx.core.quantize</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.quantized_matmul.html">mlx.core.quantized_matmul</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.radians.html">mlx.core.radians</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.real.html">mlx.core.real</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.reciprocal.html">mlx.core.reciprocal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.remainder.html">mlx.core.remainder</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.repeat.html">mlx.core.repeat</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.reshape.html">mlx.core.reshape</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.right_shift.html">mlx.core.right_shift</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.roll.html">mlx.core.roll</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.round.html">mlx.core.round</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.rsqrt.html">mlx.core.rsqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.save.html">mlx.core.save</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.savez.html">mlx.core.savez</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.savez_compressed.html">mlx.core.savez_compressed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.save_gguf.html">mlx.core.save_gguf</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.save_safetensors.html">mlx.core.save_safetensors</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sigmoid.html">mlx.core.sigmoid</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sign.html">mlx.core.sign</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sin.html">mlx.core.sin</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sinh.html">mlx.core.sinh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.softmax.html">mlx.core.softmax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sort.html">mlx.core.sort</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.split.html">mlx.core.split</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sqrt.html">mlx.core.sqrt</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.square.html">mlx.core.square</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.squeeze.html">mlx.core.squeeze</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.stack.html">mlx.core.stack</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.std.html">mlx.core.std</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.stop_gradient.html">mlx.core.stop_gradient</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.subtract.html">mlx.core.subtract</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.sum.html">mlx.core.sum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.swapaxes.html">mlx.core.swapaxes</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.take.html">mlx.core.take</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.take_along_axis.html">mlx.core.take_along_axis</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tan.html">mlx.core.tan</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tanh.html">mlx.core.tanh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tensordot.html">mlx.core.tensordot</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tile.html">mlx.core.tile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.topk.html">mlx.core.topk</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.trace.html">mlx.core.trace</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.transpose.html">mlx.core.transpose</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tri.html">mlx.core.tri</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.tril.html">mlx.core.tril</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.triu.html">mlx.core.triu</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.var.html">mlx.core.var</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.view.html">mlx.core.view</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.where.html">mlx.core.where</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.zeros.html">mlx.core.zeros</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.zeros_like.html">mlx.core.zeros_like</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../random.html">Random</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.bernoulli.html">mlx.core.random.bernoulli</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.categorical.html">mlx.core.random.categorical</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.gumbel.html">mlx.core.random.gumbel</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.key.html">mlx.core.random.key</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.normal.html">mlx.core.random.normal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.multivariate_normal.html">mlx.core.random.multivariate_normal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.randint.html">mlx.core.random.randint</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.seed.html">mlx.core.random.seed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.split.html">mlx.core.random.split</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.truncated_normal.html">mlx.core.random.truncated_normal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.uniform.html">mlx.core.random.uniform</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.laplace.html">mlx.core.random.laplace</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.random.permutation.html">mlx.core.random.permutation</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../transforms.html">Transforms</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.eval.html">mlx.core.eval</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.compile.html">mlx.core.compile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.custom_function.html">mlx.core.custom_function</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.disable_compile.html">mlx.core.disable_compile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.enable_compile.html">mlx.core.enable_compile</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.grad.html">mlx.core.grad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.value_and_grad.html">mlx.core.value_and_grad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.jvp.html">mlx.core.jvp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.vjp.html">mlx.core.vjp</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.vmap.html">mlx.core.vmap</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../fast.html">Fast</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rms_norm.html">mlx.core.fast.rms_norm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../fft.html">FFT</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.fft.html">mlx.core.fft.fft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.ifft.html">mlx.core.fft.ifft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.fft2.html">mlx.core.fft.fft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.ifft2.html">mlx.core.fft.ifft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.fftn.html">mlx.core.fft.fftn</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.ifftn.html">mlx.core.fft.ifftn</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.rfft.html">mlx.core.fft.rfft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.irfft.html">mlx.core.fft.irfft</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.rfft2.html">mlx.core.fft.rfft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.irfft2.html">mlx.core.fft.irfft2</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.rfftn.html">mlx.core.fft.rfftn</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fft.irfftn.html">mlx.core.fft.irfftn</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../linalg.html">Linear Algebra</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.inv.html">mlx.core.linalg.inv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.tri_inv.html">mlx.core.linalg.tri_inv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.norm.html">mlx.core.linalg.norm</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.cholesky.html">mlx.core.linalg.cholesky</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.cholesky_inv.html">mlx.core.linalg.cholesky_inv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.cross.html">mlx.core.linalg.cross</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.qr.html">mlx.core.linalg.qr</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.svd.html">mlx.core.linalg.svd</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.eigvalsh.html">mlx.core.linalg.eigvalsh</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.linalg.eigh.html">mlx.core.linalg.eigh</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../metal.html">Metal</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.is_available.html">mlx.core.metal.is_available</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.device_info.html">mlx.core.metal.device_info</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.get_active_memory.html">mlx.core.metal.get_active_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.get_peak_memory.html">mlx.core.metal.get_peak_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.reset_peak_memory.html">mlx.core.metal.reset_peak_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.get_cache_memory.html">mlx.core.metal.get_cache_memory</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.set_memory_limit.html">mlx.core.metal.set_memory_limit</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.set_cache_limit.html">mlx.core.metal.set_cache_limit</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.set_wired_limit.html">mlx.core.metal.set_wired_limit</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.clear_cache.html">mlx.core.metal.clear_cache</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.start_capture.html">mlx.core.metal.start_capture</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.metal.stop_capture.html">mlx.core.metal.stop_capture</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 current active has-children"><a class="reference internal" href="../../nn.html">Neural Networks</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.nn.value_and_grad.html">mlx.nn.value_and_grad</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.nn.quantize.html">mlx.nn.quantize</a></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../module.html">Module</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.training.html">mlx.nn.Module.training</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.state.html">mlx.nn.Module.state</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.apply.html">mlx.nn.Module.apply</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.apply_to_modules.html">mlx.nn.Module.apply_to_modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.children.html">mlx.nn.Module.children</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.eval.html">mlx.nn.Module.eval</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.filter_and_map.html">mlx.nn.Module.filter_and_map</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.freeze.html">mlx.nn.Module.freeze</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.leaf_modules.html">mlx.nn.Module.leaf_modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.load_weights.html">mlx.nn.Module.load_weights</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.modules.html">mlx.nn.Module.modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.named_modules.html">mlx.nn.Module.named_modules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.parameters.html">mlx.nn.Module.parameters</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.save_weights.html">mlx.nn.Module.save_weights</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.set_dtype.html">mlx.nn.Module.set_dtype</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.train.html">mlx.nn.Module.train</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.trainable_parameters.html">mlx.nn.Module.trainable_parameters</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.unfreeze.html">mlx.nn.Module.unfreeze</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.update.html">mlx.nn.Module.update</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Module.update_modules.html">mlx.nn.Module.update_modules</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 current active has-children"><a class="reference internal" href="../layers.html">Layers</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv2d.html">mlx.nn.Conv2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv3d.html">mlx.nn.Conv3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ConvTranspose1d.html">mlx.nn.ConvTranspose1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ConvTranspose2d.html">mlx.nn.ConvTranspose2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ConvTranspose3d.html">mlx.nn.ConvTranspose3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Dropout.html">mlx.nn.Dropout</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Dropout2d.html">mlx.nn.Dropout2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Dropout3d.html">mlx.nn.Dropout3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Embedding.html">mlx.nn.Embedding</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ELU.html">mlx.nn.ELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GELU.html">mlx.nn.GELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GLU.html">mlx.nn.GLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GroupNorm.html">mlx.nn.GroupNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.GRU.html">mlx.nn.GRU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.HardShrink.html">mlx.nn.HardShrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.HardTanh.html">mlx.nn.HardTanh</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Hardswish.html">mlx.nn.Hardswish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.InstanceNorm.html">mlx.nn.InstanceNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LayerNorm.html">mlx.nn.LayerNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LeakyReLU.html">mlx.nn.LeakyReLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Linear.html">mlx.nn.Linear</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LogSigmoid.html">mlx.nn.LogSigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LogSoftmax.html">mlx.nn.LogSoftmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.MaxPool3d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.QuantizedEmbedding.html">mlx.nn.QuantizedEmbedding</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.QuantizedLinear.html">mlx.nn.QuantizedLinear</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.RMSNorm.html">mlx.nn.RMSNorm</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ReLU.html">mlx.nn.ReLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.ReLU6.html">mlx.nn.ReLU6</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.RNN.html">mlx.nn.RNN</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.RoPE.html">mlx.nn.RoPE</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.SELU.html">mlx.nn.SELU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Sequential.html">mlx.nn.Sequential</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Sigmoid.html">mlx.nn.Sigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.SiLU.html">mlx.nn.SiLU</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.SinusoidalPositionalEncoding.html">mlx.nn.SinusoidalPositionalEncoding</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softmin.html">mlx.nn.Softmin</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softshrink.html">mlx.nn.Softshrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softsign.html">mlx.nn.Softsign</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softmax.html">mlx.nn.Softmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Softplus.html">mlx.nn.Softplus</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Step.html">mlx.nn.Step</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Tanh.html">mlx.nn.Tanh</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Transformer.html">mlx.nn.Transformer</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.Upsample.html">mlx.nn.Upsample</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../functions.html">Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.elu.html">mlx.nn.elu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.celu.html">mlx.nn.celu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.gelu.html">mlx.nn.gelu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.gelu_approx.html">mlx.nn.gelu_approx</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.gelu_fast_approx.html">mlx.nn.gelu_fast_approx</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.glu.html">mlx.nn.glu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.hard_shrink.html">mlx.nn.hard_shrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.hard_tanh.html">mlx.nn.hard_tanh</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.hardswish.html">mlx.nn.hardswish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.leaky_relu.html">mlx.nn.leaky_relu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.log_sigmoid.html">mlx.nn.log_sigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.log_softmax.html">mlx.nn.log_softmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.mish.html">mlx.nn.mish</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.prelu.html">mlx.nn.prelu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.relu.html">mlx.nn.relu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.relu6.html">mlx.nn.relu6</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.selu.html">mlx.nn.selu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.sigmoid.html">mlx.nn.sigmoid</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.silu.html">mlx.nn.silu</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softmax.html">mlx.nn.softmax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softmin.html">mlx.nn.softmin</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softplus.html">mlx.nn.softplus</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.softshrink.html">mlx.nn.softshrink</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.step.html">mlx.nn.step</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.tanh.html">mlx.nn.tanh</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../losses.html">Loss Functions</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.binary_cross_entropy.html">mlx.nn.losses.binary_cross_entropy</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html">mlx.nn.losses.cosine_similarity_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.cross_entropy.html">mlx.nn.losses.cross_entropy</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.html">mlx.nn.losses.gaussian_nll_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.hinge_loss.html">mlx.nn.losses.hinge_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.huber_loss.html">mlx.nn.losses.huber_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.kl_div_loss.html">mlx.nn.losses.kl_div_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.l1_loss.html">mlx.nn.losses.l1_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.log_cosh_loss.html">mlx.nn.losses.log_cosh_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.margin_ranking_loss.html">mlx.nn.losses.margin_ranking_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.mse_loss.html">mlx.nn.losses.mse_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.nll_loss.html">mlx.nn.losses.nll_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.smooth_l1_loss.html">mlx.nn.losses.smooth_l1_loss</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary_functions/mlx.nn.losses.triplet_loss.html">mlx.nn.losses.triplet_loss</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../init.html">Initializers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.constant.html">mlx.nn.init.constant</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.normal.html">mlx.nn.init.normal</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.uniform.html">mlx.nn.init.uniform</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.identity.html">mlx.nn.init.identity</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.glorot_normal.html">mlx.nn.init.glorot_normal</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.glorot_uniform.html">mlx.nn.init.glorot_uniform</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.he_normal.html">mlx.nn.init.he_normal</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.init.he_uniform.html">mlx.nn.init.he_uniform</a></li>
+</ul>
+</details></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../optimizers.html">Optimizers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../../optimizers/optimizer.html">Optimizer</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.state.html">mlx.optimizers.Optimizer.state</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.html">mlx.optimizers.Optimizer.apply_gradients</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.init.html">mlx.optimizers.Optimizer.init</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Optimizer.update.html">mlx.optimizers.Optimizer.update</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../../optimizers/common_optimizers.html">Common Optimizers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.SGD.html">mlx.optimizers.SGD</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.RMSprop.html">mlx.optimizers.RMSprop</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adagrad.html">mlx.optimizers.Adagrad</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adafactor.html">mlx.optimizers.Adafactor</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.AdaDelta.html">mlx.optimizers.AdaDelta</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adam.html">mlx.optimizers.Adam</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.AdamW.html">mlx.optimizers.AdamW</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Adamax.html">mlx.optimizers.Adamax</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.Lion.html">mlx.optimizers.Lion</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2 has-children"><a class="reference internal" href="../../optimizers/schedulers.html">Schedulers</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.cosine_decay.html">mlx.optimizers.cosine_decay</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.exponential_decay.html">mlx.optimizers.exponential_decay</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.join_schedules.html">mlx.optimizers.join_schedules</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.linear_schedule.html">mlx.optimizers.linear_schedule</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../optimizers/_autosummary/mlx.optimizers.step_decay.html">mlx.optimizers.step_decay</a></li>
+</ul>
+</details></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.optimizers.clip_grad_norm.html">mlx.optimizers.clip_grad_norm</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../distributed.html">Distributed Communication</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.Group.html">mlx.core.distributed.Group</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.is_available.html">mlx.core.distributed.is_available</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.init.html">mlx.core.distributed.init</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.all_sum.html">mlx.core.distributed.all_sum</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.all_gather.html">mlx.core.distributed.all_gather</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.send.html">mlx.core.distributed.send</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.recv.html">mlx.core.distributed.recv</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.distributed.recv_like.html">mlx.core.distributed.recv_like</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../tree_utils.html">Tree Utils</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_flatten.html">mlx.utils.tree_flatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_unflatten.html">mlx.utils.tree_unflatten</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_map.html">mlx.utils.tree_map</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_map_with_path.html">mlx.utils.tree_map_with_path</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.utils.tree_reduce.html">mlx.utils.tree_reduce</a></li>
+</ul>
+</details></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../cpp/ops.html">Operations</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Further Reading</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../../dev/extensions.html">Custom Extensions in MLX</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../dev/metal_debugger.html">Metal Debugger</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../dev/custom_metal_kernels.html">Custom Metal Kernels</a></li>
+</ul>
+
+    </div>
+</nav></div>
+    </div>
+  
+  
+  <div class="sidebar-primary-items__end sidebar-primary__section">
+  </div>
+  
+  <div id="rtd-footer-container"></div>
+
+
+      </div>
+      
+      <main id="main-content" class="bd-main" role="main">
+        
+        
+
+<div class="sbt-scroll-pixel-helper"></div>
+
+          <div class="bd-content">
+            <div class="bd-article-container">
+              
+              <div class="bd-header-article d-print-none">
+<div class="header-article-items header-article__inner">
+  
+    <div class="header-article-items__start">
+      
+        <div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <span class="fa-solid fa-bars"></span>
+</button></div>
+      
+    </div>
+  
+  
+    <div class="header-article-items__end">
+      
+        <div class="header-article-item">
+
+<div class="article-header-buttons">
+
+
+<a href="https://github.com/ml-explore/mlx" target="_blank"
+   class="btn btn-sm btn-source-repository-button"
+   title="Source repository"
+   data-bs-placement="bottom" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fab fa-github"></i>
+  </span>
+
+</a>
+
+
+
+
+
+
+<div class="dropdown dropdown-download-buttons">
+  <button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
+    <i class="fas fa-download"></i>
+  </button>
+  <ul class="dropdown-menu">
+      
+      
+      
+      <li><a href="../../../_sources/python/nn/_autosummary/mlx.nn.MaxPool3d.rst" target="_blank"
+   class="btn btn-sm btn-download-source-button dropdown-item"
+   title="Download source file"
+   data-bs-placement="left" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fas fa-file"></i>
+  </span>
+<span class="btn__text-container">.rst</span>
+</a>
+</li>
+      
+      
+      
+      
+      <li>
+<button onclick="window.print()"
+  class="btn btn-sm btn-download-pdf-button dropdown-item"
+  title="Print to PDF"
+  data-bs-placement="left" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fas fa-file-pdf"></i>
+  </span>
+<span class="btn__text-container">.pdf</span>
+</button>
+</li>
+      
+  </ul>
+</div>
+
+
+
+
+<button onclick="toggleFullScreen()"
+  class="btn btn-sm btn-fullscreen-button"
+  title="Fullscreen mode"
+  data-bs-placement="bottom" data-bs-toggle="tooltip"
+>
+  
+
+<span class="btn__icon-container">
+  <i class="fas fa-expand"></i>
+  </span>
+
+</button>
+
+
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button>
+
+
+<button class="btn btn-sm pst-navbar-icon search-button search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+    <i class="fa-solid fa-magnifying-glass fa-lg"></i>
+</button>
+<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
+    <span class="fa-solid fa-list"></span>
+</button>
+</div></div>
+      
+    </div>
+  
+</div>
+</div>
+              
+              
+
+<div id="jb-print-docs-body" class="onlyprint">
+    <h1>mlx.nn.MaxPool3d</h1>
+    <!-- Table of contents -->
+    <div id="print-main-content">
+        <div id="jb-print-toc">
+            
+            <div>
+                <h2> Contents </h2>
+            </div>
+            <nav aria-label="Page">
+                <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mlx.nn.MaxPool3d"><code class="docutils literal notranslate"><span class="pre">MaxPool3d</span></code></a></li>
+</ul>
+            </nav>
+        </div>
+    </div>
+</div>
+
+              
+                
+<div id="searchbox"></div>
+                <article class="bd-article">
+                  
+  <section id="mlx-nn-maxpool3d">
+<h1>mlx.nn.MaxPool3d<a class="headerlink" href="#mlx-nn-maxpool3d" title="Link to this heading">#</a></h1>
+<dl class="py class">
+<dt class="sig sig-object py" id="mlx.nn.MaxPool3d">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">MaxPool3d</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">kernel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">stride</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span 
class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">padding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.13)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference 
external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#mlx.nn.MaxPool3d" title="Link to this definition">#</a></dt>
+<dd><p>Applies 3-dimensional max pooling.</p>
+<p>Spatially downsamples the input by taking the maximum of a sliding window
+of size <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code> and sliding stride <code class="docutils literal notranslate"><span class="pre">stride</span></code>.</p>
+<p>The parameters <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>, <code class="docutils literal notranslate"><span class="pre">stride</span></code>, and <code class="docutils literal notranslate"><span class="pre">padding</span></code> can either be:</p>
+<ul class="simple">
+<li><p>a single <code class="docutils literal notranslate"><span class="pre">int</span></code> – in which case the same value is used for the depth,
+height, and width axis.</p></li>
+<li><p>a <code class="docutils literal notranslate"><span class="pre">tuple</span></code> of three <code class="docutils literal notranslate"><span class="pre">int</span></code> s – in which case, the first <code class="docutils literal notranslate"><span class="pre">int</span></code> is used
+for the depth axis, the second <code class="docutils literal notranslate"><span class="pre">int</span></code> for the height axis, and the third
+<code class="docutils literal notranslate"><span class="pre">int</span></code> for the width axis.</p></li>
+</ul>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>kernel_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>(</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>)</em>) – The size of the pooling window.</p></li>
+<li><p><strong>stride</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>(</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>)</em><em>, </em><em>optional</em>) – The stride of the pooling
+window. Default: <code class="docutils literal notranslate"><span class="pre">kernel_size</span></code>.</p></li>
+<li><p><strong>padding</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>(</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>)</em><em>, </em><em>optional</em>) – How much negative infinity
+padding to apply to the input. The padding is applied on both sides
+of the depth, height and width axis. Default: <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
+</ul>
+</dd>
+</dl>
+<p class="rubric">Examples</p>
+<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">mlx.core</span> <span class="k">as</span> <span class="nn">mx</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">mlx.nn.layers</span> <span class="k">as</span> <span class="nn">nn</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="n">x</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">normal</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="mi">32</span><span class="p">,</span> <span class="mi">32</span><span class="p">,</span> <span class="mi">4</span><span class="p">))</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="n">pool</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">MaxPool3d</span><span class="p">(</span><span class="n">kernel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
+<span class="gp">&gt;&gt;&gt; </span><span class="n">pool</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
+</pre></div>
+</div>
+<p class="rubric">Methods</p>
+<div class="pst-scrollable-table-container"><table class="autosummary longtable table autosummary">
+<tbody>
+</tbody>
+</table>
+</div>
+</dd></dl>
+
+</section>
+
+
+                </article>
+              
+
+              
+              
+              
+              
+                <footer class="prev-next-footer d-print-none">
+                  
+<div class="prev-next-area">
+    <a class="left-prev"
+       href="mlx.nn.MaxPool2d.html"
+       title="previous page">
+      <i class="fa-solid fa-angle-left"></i>
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">previous</p>
+        <p class="prev-next-title">mlx.nn.MaxPool2d</p>
+      </div>
+    </a>
+    <a class="right-next"
+       href="mlx.nn.Mish.html"
+       title="next page">
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">next</p>
+        <p class="prev-next-title">mlx.nn.Mish</p>
+      </div>
+      <i class="fa-solid fa-angle-right"></i>
+    </a>
+</div>
+                </footer>
+              
+            </div>
+            
+            
+              
+                <dialog id="pst-secondary-sidebar-modal"></dialog>
+                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
+
+
+  <div class="sidebar-secondary-item">
+  <div class="page-toc tocsection onthispage">
+    <i class="fa-solid fa-list"></i> Contents
+  </div>
+  <nav class="bd-toc-nav page-toc">
+    <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mlx.nn.MaxPool3d"><code class="docutils literal notranslate"><span class="pre">MaxPool3d</span></code></a></li>
+</ul>
+  </nav></div>
+
+</div></div>
+              
+            
+          </div>
+          <footer class="bd-footer-content">
+            
+<div class="bd-footer-content__inner container">
+  
+  <div class="footer-item">
+    
+<p class="component-author">
+By MLX Contributors
+</p>
+
+  </div>
+  
+  <div class="footer-item">
+    
+
+  <p class="copyright">
+    
+      © Copyright 2023, MLX Contributors.
+      <br/>
+    
+  </p>
+
+  </div>
+  
+  <div class="footer-item">
+    
+  </div>
+  
+  <div class="footer-item">
+    
+  </div>
+  
+</div>
+          </footer>
+        
+
+      </main>
+    </div>
+  </div>
+  
+  <!-- Scripts loaded after <body> so the DOM is not blocked -->
+  <script defer src="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549"></script>
+<script defer src="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549"></script>
+
+  <footer class="bd-footer">
+  </footer>
+  </body>
+</html>
\ No newline at end of file
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Mish.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Mish.html
index 8a5620d37..d31270c35 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Mish.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Mish.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Mish &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Mish &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -49,10 +49,10 @@
     <link rel="index" title="Index" href="../../../genindex.html" />
     <link rel="search" title="Search" href="../../../search.html" />
     <link rel="next" title="mlx.nn.MultiHeadAttention" href="mlx.nn.MultiHeadAttention.html" />
-    <link rel="prev" title="mlx.nn.MaxPool2d" href="mlx.nn.MaxPool2d.html" />
+    <link rel="prev" title="mlx.nn.MaxPool3d" href="mlx.nn.MaxPool3d.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -893,12 +894,12 @@
                   
 <div class="prev-next-area">
     <a class="left-prev"
-       href="mlx.nn.MaxPool2d.html"
+       href="mlx.nn.MaxPool3d.html"
        title="previous page">
       <i class="fa-solid fa-angle-left"></i>
       <div class="prev-next-info">
         <p class="prev-next-subtitle">previous</p>
-        <p class="prev-next-title">mlx.nn.MaxPool2d</p>
+        <p class="prev-next-title">mlx.nn.MaxPool3d</p>
       </div>
     </a>
     <a class="right-next"
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply.html
index 72c9e0e6f..e1c31c646 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.apply &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.apply &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.state" href="mlx.nn.Module.state.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply_to_modules.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply_to_modules.html
index afcf06ddb..4b9d3b5db 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply_to_modules.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.apply_to_modules.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.apply_to_modules &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.apply_to_modules &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.apply" href="mlx.nn.Module.apply.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.children.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.children.html
index 9db4ac9b4..b000446e0 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.children.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.children.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.children &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.children &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.apply_to_modules" href="mlx.nn.Module.apply_to_modules.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.eval.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.eval.html
index fc68eebb8..7227830d1 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.eval.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.eval.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.eval &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.eval &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.children" href="mlx.nn.Module.children.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.filter_and_map.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.filter_and_map.html
index 8f14e3eb9..b5dd1e4e6 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.filter_and_map.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.filter_and_map.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.filter_and_map &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.filter_and_map &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.eval" href="mlx.nn.Module.eval.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.freeze.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.freeze.html
index 1033693e1..83221a7c1 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.freeze.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.freeze.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.freeze &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.freeze &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.filter_and_map" href="mlx.nn.Module.filter_and_map.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.leaf_modules.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.leaf_modules.html
index 2ed019938..799c3718f 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.leaf_modules.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.leaf_modules.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.leaf_modules &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.leaf_modules &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.freeze" href="mlx.nn.Module.freeze.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.load_weights.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.load_weights.html
index a698e2adb..f8c3a25ab 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.load_weights.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.load_weights.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.load_weights &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.load_weights &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.leaf_modules" href="mlx.nn.Module.leaf_modules.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.modules.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.modules.html
index c114d99f7..48d4b6192 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.modules.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.modules.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.modules &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.modules &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.load_weights" href="mlx.nn.Module.load_weights.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.named_modules.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.named_modules.html
index d3202cc4c..a334c79dd 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.named_modules.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.named_modules.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.named_modules &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.named_modules &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.modules" href="mlx.nn.Module.modules.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.parameters.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.parameters.html
index bef2a9619..62825d8b4 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.parameters.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.parameters.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.parameters &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.parameters &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.named_modules" href="mlx.nn.Module.named_modules.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.save_weights.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.save_weights.html
index 338a37bdd..6e0115384 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.save_weights.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.save_weights.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.save_weights &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.save_weights &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.parameters" href="mlx.nn.Module.parameters.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.set_dtype.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.set_dtype.html
index a4eb5b94a..499f7de56 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.set_dtype.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.set_dtype.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.set_dtype &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.set_dtype &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.save_weights" href="mlx.nn.Module.save_weights.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.state.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.state.html
index 6c5361b5a..902f7b626 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.state.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.state.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.state &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.state &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.training" href="mlx.nn.Module.training.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.train.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.train.html
index 488da0acf..24cce0132 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.train.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.train.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.train &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.train &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.set_dtype" href="mlx.nn.Module.set_dtype.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.trainable_parameters.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.trainable_parameters.html
index ccc43907f..e8a61e710 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.trainable_parameters.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.trainable_parameters.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.trainable_parameters &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.trainable_parameters &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.train" href="mlx.nn.Module.train.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.training.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.training.html
index d6904b197..0a59a1cbe 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.training.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.training.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.training &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.training &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Module" href="../module.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.unfreeze.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.unfreeze.html
index 03e0a1c18..66d16983f 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.unfreeze.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.unfreeze.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.unfreeze &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.unfreeze &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.trainable_parameters" href="mlx.nn.Module.trainable_parameters.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update.html
index 86ce32eab..6e3168cc7 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.update &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.update &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.unfreeze" href="mlx.nn.Module.unfreeze.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update_modules.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update_modules.html
index d90e7f800..e213b30a1 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update_modules.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Module.update_modules.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Module.update_modules &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Module.update_modules &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Module.update" href="mlx.nn.Module.update.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.MultiHeadAttention.html b/docs/build/html/python/nn/_autosummary/mlx.nn.MultiHeadAttention.html
index 449c0fe62..0951d2165 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.MultiHeadAttention.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.MultiHeadAttention.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.MultiHeadAttention &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.MultiHeadAttention &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Mish" href="mlx.nn.Mish.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.PReLU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.PReLU.html
index 1449d66b6..3836d66f9 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.PReLU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.PReLU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.PReLU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.PReLU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.MultiHeadAttention" href="mlx.nn.MultiHeadAttention.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3 current active"><a class="current reference internal" href="#">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedEmbedding.html b/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedEmbedding.html
index ba0ff6184..66d4abb8e 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedEmbedding.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedEmbedding.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.QuantizedEmbedding &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.QuantizedEmbedding &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.PReLU" href="mlx.nn.PReLU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedLinear.html b/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedLinear.html
index 4a6ab03a8..d26ff9795 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedLinear.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.QuantizedLinear.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.QuantizedLinear &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.QuantizedLinear &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.QuantizedEmbedding" href="mlx.nn.QuantizedEmbedding.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.RMSNorm.html b/docs/build/html/python/nn/_autosummary/mlx.nn.RMSNorm.html
index 8b3a11ac4..76b5531d6 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.RMSNorm.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.RMSNorm.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.RMSNorm &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.RMSNorm &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.QuantizedLinear" href="mlx.nn.QuantizedLinear.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.RNN.html b/docs/build/html/python/nn/_autosummary/mlx.nn.RNN.html
index a4ee9e210..29e0c2d6c 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.RNN.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.RNN.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.RNN &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.RNN &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.ReLU6" href="mlx.nn.ReLU6.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU.html
index f64346d75..0ec51dd81 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.ReLU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.ReLU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.RMSNorm" href="mlx.nn.RMSNorm.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU6.html b/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU6.html
index 43c2042b1..a28ac22ef 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU6.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.ReLU6.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.ReLU6 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.ReLU6 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.ReLU" href="mlx.nn.ReLU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.RoPE.html b/docs/build/html/python/nn/_autosummary/mlx.nn.RoPE.html
index cdfba246a..479a981c1 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.RoPE.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.RoPE.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.RoPE &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.RoPE &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.RNN" href="mlx.nn.RNN.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.SELU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.SELU.html
index 028e8fbf2..6b69eb2ed 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.SELU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.SELU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.SELU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.SELU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.RoPE" href="mlx.nn.RoPE.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Sequential.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Sequential.html
index 3889266da..d2f8c49da 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Sequential.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Sequential.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Sequential &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Sequential &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.SELU" href="mlx.nn.SELU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.SiLU.html b/docs/build/html/python/nn/_autosummary/mlx.nn.SiLU.html
index cafb4fbe6..6a004c01a 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.SiLU.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.SiLU.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.SiLU &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.SiLU &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Sigmoid" href="mlx.nn.Sigmoid.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Sigmoid.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Sigmoid.html
index 7e7047400..5c1326527 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Sigmoid.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Sigmoid.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Sigmoid &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Sigmoid &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Sequential" href="mlx.nn.Sequential.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding.html b/docs/build/html/python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding.html
index 1800ffffb..08872ef64 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.SinusoidalPositionalEncoding &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.SinusoidalPositionalEncoding &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.SiLU" href="mlx.nn.SiLU.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Softmax.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Softmax.html
index 3fef69ddf..a392a2baa 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Softmax.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Softmax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Softmax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Softmax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Softsign" href="mlx.nn.Softsign.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Softmin.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Softmin.html
index 39ecaa3bc..60b68bda7 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Softmin.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Softmin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Softmin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Softmin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.SinusoidalPositionalEncoding" href="mlx.nn.SinusoidalPositionalEncoding.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Softplus.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Softplus.html
index a59b2fc5a..61a7b4b9c 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Softplus.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Softplus.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Softplus &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Softplus &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Softmax" href="mlx.nn.Softmax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Softshrink.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Softshrink.html
index dbfb8992d..dfdf9f2fa 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Softshrink.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Softshrink.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Softshrink &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Softshrink &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Softmin" href="mlx.nn.Softmin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Softsign.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Softsign.html
index 34f4fbaee..95ece6203 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Softsign.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Softsign.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Softsign &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Softsign &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Softshrink" href="mlx.nn.Softshrink.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Step.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Step.html
index 3fce344c9..6b79d2b14 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Step.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Step.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Step &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Step &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Softplus" href="mlx.nn.Softplus.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Tanh.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Tanh.html
index e5ac75e4b..634538f47 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Tanh.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Tanh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Tanh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Tanh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Step" href="mlx.nn.Step.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Transformer.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Transformer.html
index 430bf2b0f..55a667654 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Transformer.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Transformer.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Transformer &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Transformer &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Tanh" href="mlx.nn.Tanh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.Upsample.html b/docs/build/html/python/nn/_autosummary/mlx.nn.Upsample.html
index f9ad04a4e..2efd1b5cc 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.Upsample.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.Upsample.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.Upsample &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.Upsample &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.Transformer" href="mlx.nn.Transformer.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.constant.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.constant.html
index 6c4102294..5542f139d 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.constant.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.constant.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.constant &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.constant &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Initializers" href="../init.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_normal.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_normal.html
index b8a2268de..ab915a167 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_normal.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_normal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.glorot_normal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.glorot_normal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.init.identity" href="mlx.nn.init.identity.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_uniform.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_uniform.html
index c43f839f2..0a6ab3e09 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_uniform.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.glorot_uniform.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.glorot_uniform &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.glorot_uniform &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.init.glorot_normal" href="mlx.nn.init.glorot_normal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_normal.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_normal.html
index 04fb56f62..5dbea1045 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_normal.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_normal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.he_normal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.he_normal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.init.glorot_uniform" href="mlx.nn.init.glorot_uniform.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_uniform.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_uniform.html
index ac2dff523..29474534b 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_uniform.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.he_uniform.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.he_uniform &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.he_uniform &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.init.he_normal" href="mlx.nn.init.he_normal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.identity.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.identity.html
index 101d0d30e..beca6ab6e 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.identity.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.identity.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.identity &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.identity &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.init.uniform" href="mlx.nn.init.uniform.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.normal.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.normal.html
index 4ef3016a0..71431a740 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.normal.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.normal.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.normal &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.normal &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.init.constant" href="mlx.nn.init.constant.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary/mlx.nn.init.uniform.html b/docs/build/html/python/nn/_autosummary/mlx.nn.init.uniform.html
index a9314d349..db68625ab 100644
--- a/docs/build/html/python/nn/_autosummary/mlx.nn.init.uniform.html
+++ b/docs/build/html/python/nn/_autosummary/mlx.nn.init.uniform.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.init.uniform &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.init.uniform &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.init.normal" href="mlx.nn.init.normal.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.celu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.celu.html
index bcfd24674..dd784037f 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.celu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.celu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.celu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.celu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.elu" href="mlx.nn.elu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.elu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.elu.html
index 1767b8151..860592e4a 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.elu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.elu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.elu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.elu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Functions" href="../functions.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu.html
index e6fdf4a4d..68726795b 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.gelu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.gelu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.celu" href="mlx.nn.celu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_approx.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_approx.html
index 4ea327edd..ff7ad66f5 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_approx.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_approx.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.gelu_approx &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.gelu_approx &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.gelu" href="mlx.nn.gelu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx.html
index 390b4457e..de376385f 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.gelu_fast_approx &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.gelu_fast_approx &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.gelu_approx" href="mlx.nn.gelu_approx.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.glu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.glu.html
index 486555aab..777b762ea 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.glu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.glu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.glu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.glu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.gelu_fast_approx" href="mlx.nn.gelu_fast_approx.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_shrink.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_shrink.html
index 3562c4eb1..19e54485d 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_shrink.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_shrink.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.hard_shrink &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.hard_shrink &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.glu" href="mlx.nn.glu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_tanh.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_tanh.html
index 77baca5fa..c204c54aa 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_tanh.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hard_tanh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.hard_tanh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.hard_tanh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.hard_shrink" href="mlx.nn.hard_shrink.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hardswish.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hardswish.html
index d48443836..d2ecf4822 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hardswish.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.hardswish.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.hardswish &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.hardswish &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.hard_tanh" href="mlx.nn.hard_tanh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.leaky_relu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.leaky_relu.html
index 3a79f3b29..64d3cc571 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.leaky_relu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.leaky_relu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.leaky_relu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.leaky_relu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.hardswish" href="mlx.nn.hardswish.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_sigmoid.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_sigmoid.html
index abd40ae13..51b9a9872 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_sigmoid.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_sigmoid.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.log_sigmoid &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.log_sigmoid &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.leaky_relu" href="mlx.nn.leaky_relu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_softmax.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_softmax.html
index c9a4f9e6a..0c8f04a97 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_softmax.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.log_softmax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.log_softmax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.log_softmax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.log_sigmoid" href="mlx.nn.log_sigmoid.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy.html
index 004324f9f..7a359861d 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.binary_cross_entropy &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.binary_cross_entropy &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Loss Functions" href="../losses.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html
index f8ec2e8f9..c6e23f6bb 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.cosine_similarity_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.cosine_similarity_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.binary_cross_entropy" href="mlx.nn.losses.binary_cross_entropy.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy.html
index 491630b5b..0752aa7c6 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.cross_entropy &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.cross_entropy &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.losses.cosine_similarity_loss" href="mlx.nn.losses.cosine_similarity_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.html
index 719c74740..05ec94449 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.gaussian_nll_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.gaussian_nll_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.cross_entropy" href="mlx.nn.losses.cross_entropy.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss.html
index 039f394f2..683b0f9a4 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.hinge_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.hinge_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.gaussian_nll_loss" href="mlx.nn.losses.gaussian_nll_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.huber_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.huber_loss.html
index cc1371e1e..a6627b7c5 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.huber_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.huber_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.huber_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.huber_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.hinge_loss" href="mlx.nn.losses.hinge_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss.html
index 9f3d6989d..971c11929 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.kl_div_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.kl_div_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.losses.huber_loss" href="mlx.nn.losses.huber_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.l1_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.l1_loss.html
index a9dd67cf0..cc0497871 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.l1_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.l1_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.l1_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.l1_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.losses.kl_div_loss" href="mlx.nn.losses.kl_div_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss.html
index c156be499..7e04217e4 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.log_cosh_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.log_cosh_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.l1_loss" href="mlx.nn.losses.l1_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss.html
index f9c6e9614..10d1c64f8 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.margin_ranking_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.margin_ranking_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.log_cosh_loss" href="mlx.nn.losses.log_cosh_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.mse_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.mse_loss.html
index 2a006bb42..c2c1128ed 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.mse_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.mse_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.mse_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.mse_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.losses.margin_ranking_loss" href="mlx.nn.losses.margin_ranking_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.nll_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.nll_loss.html
index 494c5d00e..d514134ba 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.nll_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.nll_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.nll_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.nll_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.losses.mse_loss" href="mlx.nn.losses.mse_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss.html
index e7ee4d69a..db573a9b3 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.smooth_l1_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.smooth_l1_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.nll_loss" href="mlx.nn.losses.nll_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss.html
index f4f6ecd65..568ece30e 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.losses.triplet_loss &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.losses.triplet_loss &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.smooth_l1_loss" href="mlx.nn.losses.smooth_l1_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.mish.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.mish.html
index e9c24b3e9..ffbeae9ed 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.mish.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.mish.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.mish &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.mish &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.log_softmax" href="mlx.nn.log_softmax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.prelu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.prelu.html
index 8f8541b56..978588293 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.prelu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.prelu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.prelu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.prelu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.mish" href="mlx.nn.mish.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu.html
index 8e0d623df..af4cd6f8b 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.relu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.relu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.prelu" href="mlx.nn.prelu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu6.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu6.html
index eb551ed04..9ac117097 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu6.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.relu6.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.relu6 &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.relu6 &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.relu" href="mlx.nn.relu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.selu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.selu.html
index fb410c874..b1375a673 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.selu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.selu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.selu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.selu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.relu6" href="mlx.nn.relu6.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.sigmoid.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.sigmoid.html
index 7faaa8172..140916c61 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.sigmoid.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.sigmoid.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.sigmoid &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.sigmoid &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.selu" href="mlx.nn.selu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.silu.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.silu.html
index 86f496ee1..cb1d71541 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.silu.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.silu.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.silu &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.silu &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.sigmoid" href="mlx.nn.sigmoid.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmax.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmax.html
index b733d85fc..ec4f035b2 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmax.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.softmax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.softmax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.silu" href="mlx.nn.silu.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmin.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmin.html
index 004fa6c3b..a33c66275 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmin.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softmin.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.softmin &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.softmin &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.softmax" href="mlx.nn.softmax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softplus.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softplus.html
index 05b431fc6..c786292b7 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softplus.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softplus.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.softplus &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.softplus &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.softmin" href="mlx.nn.softmin.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softshrink.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softshrink.html
index 727fd2fa9..3fe365074 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softshrink.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.softshrink.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.softshrink &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.softshrink &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.softplus" href="mlx.nn.softplus.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.step.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.step.html
index 706e78f28..ea4bccaa1 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.step.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.step.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.step &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.step &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.softshrink" href="mlx.nn.softshrink.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.tanh.html b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.tanh.html
index 3211a1751..d7bf4d498 100644
--- a/docs/build/html/python/nn/_autosummary_functions/mlx.nn.tanh.html
+++ b/docs/build/html/python/nn/_autosummary_functions/mlx.nn.tanh.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.nn.tanh &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.nn.tanh &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.step" href="mlx.nn.step.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/functions.html b/docs/build/html/python/nn/functions.html
index 063f25e9a..6bcb3e2d2 100644
--- a/docs/build/html/python/nn/functions.html
+++ b/docs/build/html/python/nn/functions.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Functions &#8212; MLX 0.20.0 documentation</title>
+    <title>Functions &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Upsample" href="_autosummary/mlx.nn.Upsample.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/init.html b/docs/build/html/python/nn/init.html
index aae6a250b..67d369607 100644
--- a/docs/build/html/python/nn/init.html
+++ b/docs/build/html/python/nn/init.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Initializers &#8212; MLX 0.20.0 documentation</title>
+    <title>Initializers &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.losses.triplet_loss" href="_autosummary_functions/mlx.nn.losses.triplet_loss.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/layers.html b/docs/build/html/python/nn/layers.html
index 6e5de370e..5724cd061 100644
--- a/docs/build/html/python/nn/layers.html
+++ b/docs/build/html/python/nn/layers.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Layers &#8212; MLX 0.20.0 documentation</title>
+    <title>Layers &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.Module.update_modules" href="_autosummary/mlx.nn.Module.update_modules.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -865,93 +866,99 @@
 <tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.AvgPool2d.html#mlx.nn.AvgPool2d" title="mlx.nn.AvgPool2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">AvgPool2d</span></code></a>(kernel_size[, stride, padding])</p></td>
 <td><p>Applies 2-dimensional average pooling.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.BatchNorm.html#mlx.nn.BatchNorm" title="mlx.nn.BatchNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">BatchNorm</span></code></a>(num_features[, eps, momentum, ...])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.AvgPool3d.html#mlx.nn.AvgPool3d" title="mlx.nn.AvgPool3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">AvgPool3d</span></code></a>(kernel_size[, stride, padding])</p></td>
+<td><p>Applies 3-dimensional average pooling.</p></td>
+</tr>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.BatchNorm.html#mlx.nn.BatchNorm" title="mlx.nn.BatchNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">BatchNorm</span></code></a>(num_features[, eps, momentum, ...])</p></td>
 <td><p>Applies Batch Normalization over a 2D or 3D input.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.CELU.html#mlx.nn.CELU" title="mlx.nn.CELU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CELU</span></code></a>([alpha])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.CELU.html#mlx.nn.CELU" title="mlx.nn.CELU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CELU</span></code></a>([alpha])</p></td>
 <td><p>Applies the Continuously Differentiable Exponential Linear Unit.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Conv1d.html#mlx.nn.Conv1d" title="mlx.nn.Conv1d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Conv1d</span></code></a>(in_channels, out_channels, kernel_size)</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Conv1d.html#mlx.nn.Conv1d" title="mlx.nn.Conv1d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Conv1d</span></code></a>(in_channels, out_channels, kernel_size)</p></td>
 <td><p>Applies a 1-dimensional convolution over the multi-channel input sequence.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Conv2d.html#mlx.nn.Conv2d" title="mlx.nn.Conv2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Conv2d</span></code></a>(in_channels, out_channels, kernel_size)</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Conv2d.html#mlx.nn.Conv2d" title="mlx.nn.Conv2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Conv2d</span></code></a>(in_channels, out_channels, kernel_size)</p></td>
 <td><p>Applies a 2-dimensional convolution over the multi-channel input image.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Conv3d.html#mlx.nn.Conv3d" title="mlx.nn.Conv3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Conv3d</span></code></a>(in_channels, out_channels, kernel_size)</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Conv3d.html#mlx.nn.Conv3d" title="mlx.nn.Conv3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Conv3d</span></code></a>(in_channels, out_channels, kernel_size)</p></td>
 <td><p>Applies a 3-dimensional convolution over the multi-channel input image.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ConvTranspose1d.html#mlx.nn.ConvTranspose1d" title="mlx.nn.ConvTranspose1d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ConvTranspose1d</span></code></a>(in_channels, out_channels, ...)</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ConvTranspose1d.html#mlx.nn.ConvTranspose1d" title="mlx.nn.ConvTranspose1d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ConvTranspose1d</span></code></a>(in_channels, out_channels, ...)</p></td>
 <td><p>Applies a 1-dimensional transposed convolution over the multi-channel input sequence.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ConvTranspose2d.html#mlx.nn.ConvTranspose2d" title="mlx.nn.ConvTranspose2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ConvTranspose2d</span></code></a>(in_channels, out_channels, ...)</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ConvTranspose2d.html#mlx.nn.ConvTranspose2d" title="mlx.nn.ConvTranspose2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ConvTranspose2d</span></code></a>(in_channels, out_channels, ...)</p></td>
 <td><p>Applies a 2-dimensional transposed convolution over the multi-channel input image.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ConvTranspose3d.html#mlx.nn.ConvTranspose3d" title="mlx.nn.ConvTranspose3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ConvTranspose3d</span></code></a>(in_channels, out_channels, ...)</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ConvTranspose3d.html#mlx.nn.ConvTranspose3d" title="mlx.nn.ConvTranspose3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ConvTranspose3d</span></code></a>(in_channels, out_channels, ...)</p></td>
 <td><p>Applies a 3-dimensional transposed convolution over the multi-channel input image.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Dropout.html#mlx.nn.Dropout" title="mlx.nn.Dropout"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Dropout</span></code></a>([p])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Dropout.html#mlx.nn.Dropout" title="mlx.nn.Dropout"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Dropout</span></code></a>([p])</p></td>
 <td><p>Randomly zero a portion of the elements during training.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Dropout2d.html#mlx.nn.Dropout2d" title="mlx.nn.Dropout2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Dropout2d</span></code></a>([p])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Dropout2d.html#mlx.nn.Dropout2d" title="mlx.nn.Dropout2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Dropout2d</span></code></a>([p])</p></td>
 <td><p>Apply 2D channel-wise dropout during training.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Dropout3d.html#mlx.nn.Dropout3d" title="mlx.nn.Dropout3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Dropout3d</span></code></a>([p])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Dropout3d.html#mlx.nn.Dropout3d" title="mlx.nn.Dropout3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Dropout3d</span></code></a>([p])</p></td>
 <td><p>Apply 3D channel-wise dropout during training.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Embedding.html#mlx.nn.Embedding" title="mlx.nn.Embedding"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Embedding</span></code></a>(num_embeddings, dims)</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Embedding.html#mlx.nn.Embedding" title="mlx.nn.Embedding"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Embedding</span></code></a>(num_embeddings, dims)</p></td>
 <td><p>Implements a simple lookup table that maps each input integer to a high-dimensional vector.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ELU.html#mlx.nn.ELU" title="mlx.nn.ELU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ELU</span></code></a>([alpha])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.ELU.html#mlx.nn.ELU" title="mlx.nn.ELU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ELU</span></code></a>([alpha])</p></td>
 <td><p>Applies the Exponential Linear Unit.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GELU.html#mlx.nn.GELU" title="mlx.nn.GELU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GELU</span></code></a>([approx])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GELU.html#mlx.nn.GELU" title="mlx.nn.GELU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GELU</span></code></a>([approx])</p></td>
 <td><p>Applies the Gaussian Error Linear Units.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GLU.html#mlx.nn.GLU" title="mlx.nn.GLU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GLU</span></code></a>([axis])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GLU.html#mlx.nn.GLU" title="mlx.nn.GLU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GLU</span></code></a>([axis])</p></td>
 <td><p>Applies the gated linear unit function.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GroupNorm.html#mlx.nn.GroupNorm" title="mlx.nn.GroupNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GroupNorm</span></code></a>(num_groups, dims[, eps, affine, ...])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GroupNorm.html#mlx.nn.GroupNorm" title="mlx.nn.GroupNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GroupNorm</span></code></a>(num_groups, dims[, eps, affine, ...])</p></td>
 <td><p>Applies Group Normalization [1] to the inputs.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GRU.html#mlx.nn.GRU" title="mlx.nn.GRU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GRU</span></code></a>(input_size, hidden_size[, bias])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.GRU.html#mlx.nn.GRU" title="mlx.nn.GRU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GRU</span></code></a>(input_size, hidden_size[, bias])</p></td>
 <td><p>A gated recurrent unit (GRU) RNN layer.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.HardShrink.html#mlx.nn.HardShrink" title="mlx.nn.HardShrink"><code class="xref py py-obj docutils literal notranslate"><span class="pre">HardShrink</span></code></a>()</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.HardShrink.html#mlx.nn.HardShrink" title="mlx.nn.HardShrink"><code class="xref py py-obj docutils literal notranslate"><span class="pre">HardShrink</span></code></a>()</p></td>
 <td><p>Applies the HardShrink function.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.HardTanh.html#mlx.nn.HardTanh" title="mlx.nn.HardTanh"><code class="xref py py-obj docutils literal notranslate"><span class="pre">HardTanh</span></code></a>()</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.HardTanh.html#mlx.nn.HardTanh" title="mlx.nn.HardTanh"><code class="xref py py-obj docutils literal notranslate"><span class="pre">HardTanh</span></code></a>()</p></td>
 <td><p>Applies the HardTanh function.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Hardswish.html#mlx.nn.Hardswish" title="mlx.nn.Hardswish"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Hardswish</span></code></a>()</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Hardswish.html#mlx.nn.Hardswish" title="mlx.nn.Hardswish"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Hardswish</span></code></a>()</p></td>
 <td><p>Applies the hardswish function, element-wise.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.InstanceNorm.html#mlx.nn.InstanceNorm" title="mlx.nn.InstanceNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">InstanceNorm</span></code></a>(dims[, eps, affine])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.InstanceNorm.html#mlx.nn.InstanceNorm" title="mlx.nn.InstanceNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">InstanceNorm</span></code></a>(dims[, eps, affine])</p></td>
 <td><p>Applies instance normalization [1] on the inputs.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LayerNorm.html#mlx.nn.LayerNorm" title="mlx.nn.LayerNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LayerNorm</span></code></a>(dims[, eps, affine, bias])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LayerNorm.html#mlx.nn.LayerNorm" title="mlx.nn.LayerNorm"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LayerNorm</span></code></a>(dims[, eps, affine, bias])</p></td>
 <td><p>Applies layer normalization [1] on the inputs.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LeakyReLU.html#mlx.nn.LeakyReLU" title="mlx.nn.LeakyReLU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LeakyReLU</span></code></a>([negative_slope])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LeakyReLU.html#mlx.nn.LeakyReLU" title="mlx.nn.LeakyReLU"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LeakyReLU</span></code></a>([negative_slope])</p></td>
 <td><p>Applies the Leaky Rectified Linear Unit.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Linear.html#mlx.nn.Linear" title="mlx.nn.Linear"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Linear</span></code></a>(input_dims, output_dims[, bias])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Linear.html#mlx.nn.Linear" title="mlx.nn.Linear"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Linear</span></code></a>(input_dims, output_dims[, bias])</p></td>
 <td><p>Applies an affine transformation to the input.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LogSigmoid.html#mlx.nn.LogSigmoid" title="mlx.nn.LogSigmoid"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LogSigmoid</span></code></a>()</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LogSigmoid.html#mlx.nn.LogSigmoid" title="mlx.nn.LogSigmoid"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LogSigmoid</span></code></a>()</p></td>
 <td><p>Applies the Log Sigmoid function.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LogSoftmax.html#mlx.nn.LogSoftmax" title="mlx.nn.LogSoftmax"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LogSoftmax</span></code></a>()</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LogSoftmax.html#mlx.nn.LogSoftmax" title="mlx.nn.LogSoftmax"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LogSoftmax</span></code></a>()</p></td>
 <td><p>Applies the Log Softmax function.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LSTM.html#mlx.nn.LSTM" title="mlx.nn.LSTM"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LSTM</span></code></a>(input_size, hidden_size[, bias])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.LSTM.html#mlx.nn.LSTM" title="mlx.nn.LSTM"><code class="xref py py-obj docutils literal notranslate"><span class="pre">LSTM</span></code></a>(input_size, hidden_size[, bias])</p></td>
 <td><p>An LSTM recurrent layer.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.MaxPool1d.html#mlx.nn.MaxPool1d" title="mlx.nn.MaxPool1d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MaxPool1d</span></code></a>(kernel_size[, stride, padding])</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.MaxPool1d.html#mlx.nn.MaxPool1d" title="mlx.nn.MaxPool1d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MaxPool1d</span></code></a>(kernel_size[, stride, padding])</p></td>
 <td><p>Applies 1-dimensional max pooling.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.MaxPool2d.html#mlx.nn.MaxPool2d" title="mlx.nn.MaxPool2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MaxPool2d</span></code></a>(kernel_size[, stride, padding])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.MaxPool2d.html#mlx.nn.MaxPool2d" title="mlx.nn.MaxPool2d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MaxPool2d</span></code></a>(kernel_size[, stride, padding])</p></td>
 <td><p>Applies 2-dimensional max pooling.</p></td>
 </tr>
+<tr class="row-even"><td><p><a class="reference internal" href="_autosummary/mlx.nn.MaxPool3d.html#mlx.nn.MaxPool3d" title="mlx.nn.MaxPool3d"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MaxPool3d</span></code></a>(kernel_size[, stride, padding])</p></td>
+<td><p>Applies 3-dimensional max pooling.</p></td>
+</tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="_autosummary/mlx.nn.Mish.html#mlx.nn.Mish" title="mlx.nn.Mish"><code class="xref py py-obj docutils literal notranslate"><span class="pre">Mish</span></code></a>()</p></td>
 <td><p>Applies the Mish function, element-wise.</p></td>
 </tr>
diff --git a/docs/build/html/python/nn/losses.html b/docs/build/html/python/nn/losses.html
index 40c6b94d5..16b007d23 100644
--- a/docs/build/html/python/nn/losses.html
+++ b/docs/build/html/python/nn/losses.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Loss Functions &#8212; MLX 0.20.0 documentation</title>
+    <title>Loss Functions &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.tanh" href="_autosummary_functions/mlx.nn.tanh.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/nn/module.html b/docs/build/html/python/nn/module.html
index 68ba455fe..157a77583 100644
--- a/docs/build/html/python/nn/module.html
+++ b/docs/build/html/python/nn/module.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Module &#8212; MLX 0.20.0 documentation</title>
+    <title>Module &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.nn.quantize" href="../_autosummary/mlx.nn.quantize.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/ops.html b/docs/build/html/python/ops.html
index 3f77dd8b8..539ded7da 100644
--- a/docs/build/html/python/ops.html
+++ b/docs/build/html/python/ops.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Operations &#8212; MLX 0.20.0 documentation</title>
+    <title>Operations &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.core.synchronize" href="_autosummary/mlx.core.synchronize.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers.html b/docs/build/html/python/optimizers.html
index 273674033..c512a138e 100644
--- a/docs/build/html/python/optimizers.html
+++ b/docs/build/html/python/optimizers.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Optimizers &#8212; MLX 0.20.0 documentation</title>
+    <title>Optimizers &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.nn.init.he_uniform" href="nn/_autosummary/mlx.nn.init.he_uniform.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdaDelta.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdaDelta.html
index 7a270fa7d..167d5cdcc 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdaDelta.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdaDelta.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.AdaDelta &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.AdaDelta &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.Adafactor" href="mlx.optimizers.Adafactor.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adafactor.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adafactor.html
index 59a679adf..e37f18c37 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adafactor.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adafactor.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Adafactor &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Adafactor &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.Adagrad" href="mlx.optimizers.Adagrad.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adagrad.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adagrad.html
index 4e2a31bf9..1bdbab44b 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adagrad.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adagrad.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Adagrad &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Adagrad &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.RMSprop" href="mlx.optimizers.RMSprop.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adam.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adam.html
index 6b24ad2e0..8387ba6db 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adam.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adam.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Adam &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Adam &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.AdaDelta" href="mlx.optimizers.AdaDelta.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdamW.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdamW.html
index c918b564c..d064cce2a 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdamW.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.AdamW.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.AdamW &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.AdamW &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.Adam" href="mlx.optimizers.Adam.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adamax.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adamax.html
index 79ee469ad..5a0c4585a 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adamax.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Adamax.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Adamax &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Adamax &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.AdamW" href="mlx.optimizers.AdamW.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Lion.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Lion.html
index 69ca61ec1..36c98ad07 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Lion.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Lion.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Lion &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Lion &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.Adamax" href="mlx.optimizers.Adamax.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.html
index bf3501699..437527ffc 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Optimizer.apply_gradients &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Optimizer.apply_gradients &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.Optimizer.state" href="mlx.optimizers.Optimizer.state.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.init.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.init.html
index 74817b91a..2dbf4cba5 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.init.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.init.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Optimizer.init &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Optimizer.init &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.Optimizer.apply_gradients" href="mlx.optimizers.Optimizer.apply_gradients.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.state.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.state.html
index a9ab1a3ec..434586016 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.state.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.state.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Optimizer.state &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Optimizer.state &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Optimizer" href="../optimizer.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.update.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.update.html
index efbe0a18a..bffeb270f 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.update.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.Optimizer.update.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.Optimizer.update &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.Optimizer.update &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.Optimizer.init" href="mlx.optimizers.Optimizer.init.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.RMSprop.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.RMSprop.html
index e5e1aa65b..ebb608198 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.RMSprop.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.RMSprop.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.RMSprop &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.RMSprop &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.SGD" href="mlx.optimizers.SGD.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.SGD.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.SGD.html
index 8e7d1b0e4..74dfdbdb5 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.SGD.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.SGD.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.SGD &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.SGD &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="Common Optimizers" href="../common_optimizers.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.cosine_decay.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.cosine_decay.html
index 654c49769..c46a28db9 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.cosine_decay.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.cosine_decay.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.cosine_decay &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.cosine_decay &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Schedulers" href="../schedulers.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.exponential_decay.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.exponential_decay.html
index d150ccb47..21f7f3555 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.exponential_decay.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.exponential_decay.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.exponential_decay &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.exponential_decay &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.cosine_decay" href="mlx.optimizers.cosine_decay.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.join_schedules.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.join_schedules.html
index 931e6e292..3df4fbea4 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.join_schedules.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.join_schedules.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.join_schedules &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.join_schedules &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.exponential_decay" href="mlx.optimizers.exponential_decay.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.linear_schedule.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.linear_schedule.html
index e9aec67bf..807bed1f0 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.linear_schedule.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.linear_schedule.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.linear_schedule &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.linear_schedule &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.join_schedules" href="mlx.optimizers.join_schedules.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.step_decay.html b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.step_decay.html
index 0f20abc6e..3b9c5b4ee 100644
--- a/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.step_decay.html
+++ b/docs/build/html/python/optimizers/_autosummary/mlx.optimizers.step_decay.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>mlx.optimizers.step_decay &#8212; MLX 0.20.0 documentation</title>
+    <title>mlx.optimizers.step_decay &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.optimizers.linear_schedule" href="mlx.optimizers.linear_schedule.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/common_optimizers.html b/docs/build/html/python/optimizers/common_optimizers.html
index 4e40764b6..6dc07a35d 100644
--- a/docs/build/html/python/optimizers/common_optimizers.html
+++ b/docs/build/html/python/optimizers/common_optimizers.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Common Optimizers &#8212; MLX 0.20.0 documentation</title>
+    <title>Common Optimizers &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.Optimizer.update" href="_autosummary/mlx.optimizers.Optimizer.update.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/optimizer.html b/docs/build/html/python/optimizers/optimizer.html
index 4e080517e..5aa499585 100644
--- a/docs/build/html/python/optimizers/optimizer.html
+++ b/docs/build/html/python/optimizers/optimizer.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Optimizer &#8212; MLX 0.20.0 documentation</title>
+    <title>Optimizer &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Optimizers" href="../optimizers.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/optimizers/schedulers.html b/docs/build/html/python/optimizers/schedulers.html
index c38b9e941..e5c19102b 100644
--- a/docs/build/html/python/optimizers/schedulers.html
+++ b/docs/build/html/python/optimizers/schedulers.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Schedulers &#8212; MLX 0.20.0 documentation</title>
+    <title>Schedulers &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../../_static/doctools.js?v=9a2dae69"></script>
     <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -52,7 +52,7 @@
     <link rel="prev" title="mlx.optimizers.Lion" href="_autosummary/mlx.optimizers.Lion.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/random.html b/docs/build/html/python/random.html
index 82efd6af1..2d606a547 100644
--- a/docs/build/html/python/random.html
+++ b/docs/build/html/python/random.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Random &#8212; MLX 0.20.0 documentation</title>
+    <title>Random &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.zeros_like" href="_autosummary/mlx.core.zeros_like.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/transforms.html b/docs/build/html/python/transforms.html
index 4ee6f2445..ccedbe16f 100644
--- a/docs/build/html/python/transforms.html
+++ b/docs/build/html/python/transforms.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Transforms &#8212; MLX 0.20.0 documentation</title>
+    <title>Transforms &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.random.permutation" href="_autosummary/mlx.core.random.permutation.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/python/tree_utils.html b/docs/build/html/python/tree_utils.html
index 233c7296c..be8a0239a 100644
--- a/docs/build/html/python/tree_utils.html
+++ b/docs/build/html/python/tree_utils.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Tree Utils &#8212; MLX 0.20.0 documentation</title>
+    <title>Tree Utils &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="mlx.core.distributed.recv_like" href="_autosummary/mlx.core.distributed.recv_like.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/quantized_8h.html b/docs/build/html/quantized_8h.html
index c0e18903c..15c398275 100644
--- a/docs/build/html/quantized_8h.html
+++ b/docs/build/html/quantized_8h.html
@@ -194,9 +194,6 @@ Functions</h2></td></tr>
 <tr class="memitem:a47bcf4a14566e01e14bd3c155811db59" id="r_a47bcf4a14566e01e14bd3c155811db59"><td class="memTemplParams" colspan="2">template&lt;typename T , const int group_size, const int bits&gt; </td></tr>
 <tr class="memitem:a47bcf4a14566e01e14bd3c155811db59"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a47bcf4a14566e01e14bd3c155811db59">affine_quantize</a> (const device T *w, device uint8_t *out, device T *scales, device T *biases, uint2 index, uint2 grid_dim)</td></tr>
 <tr class="separator:a47bcf4a14566e01e14bd3c155811db59"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7561acefd7b55e7e2b25393be08bb99c" id="r_a7561acefd7b55e7e2b25393be08bb99c"><td class="memTemplParams" colspan="2">template&lt;typename T , const int group_size, const int bits&gt; </td></tr>
-<tr class="memitem:a7561acefd7b55e7e2b25393be08bb99c"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a7561acefd7b55e7e2b25393be08bb99c">affine_quantize_scales_biases</a> (const device T *w, const device T *scales, const device T *biases, device uint8_t *out, uint2 index, uint2 grid_dim)</td></tr>
-<tr class="separator:a7561acefd7b55e7e2b25393be08bb99c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a6076203615038eb06816158f7b3869c6" id="r_a6076203615038eb06816158f7b3869c6"><td class="memTemplParams" colspan="2">template&lt;typename T , const int group_size, const int bits&gt; </td></tr>
 <tr class="memitem:a6076203615038eb06816158f7b3869c6"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a6076203615038eb06816158f7b3869c6">affine_dequantize</a> (const device uint8_t *w, const device T *scales, const device T *biases, device T *out, uint2 index, uint2 grid_dim)</td></tr>
 <tr class="separator:a6076203615038eb06816158f7b3869c6"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -514,49 +511,6 @@ template&lt;typename T , const int group_size, const int bits&gt; </div>
       </table>
 </div><div class="memdoc">
 
-</div>
-</div>
-<a id="a7561acefd7b55e7e2b25393be08bb99c" name="a7561acefd7b55e7e2b25393be08bb99c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7561acefd7b55e7e2b25393be08bb99c">&#9670;&#160;</a></span>affine_quantize_scales_biases()</h2>
-
-<div class="memitem">
-<div class="memproto">
-<div class="memtemplate">
-template&lt;typename T , const int group_size, const int bits&gt; </div>
-      <table class="memname">
-        <tr>
-          <td class="memname">void affine_quantize_scales_biases </td>
-          <td>(</td>
-          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>w</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>scales</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>biases</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">device uint8_t *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">uint2</td>          <td class="paramname"><span class="paramname"><em>index</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">uint2</td>          <td class="paramname"><span class="paramname"><em>grid_dim</em></span>&#160;)</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
 </div>
 </div>
 <a id="a1a66b061c46383952a0f067c3848971f" name="a1a66b061c46383952a0f067c3848971f"></a>
diff --git a/docs/build/html/quantized_8h_source.html b/docs/build/html/quantized_8h_source.html
index 0a08db757..4562088ee 100644
--- a/docs/build/html/quantized_8h_source.html
+++ b/docs/build/html/quantized_8h_source.html
@@ -107,8 +107,8 @@ $(function(){ initResizable(false); });
 <div class="foldopen" id="foldopen00014" data-start="{" data-end="}">
 <div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">   14</a></span><span class="keyword">inline</span> U <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector</a>(<span class="keyword">const</span> device T* x, thread U* x_thread) {</div>
 <div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>      bits == 2 || bits == 4 || bits == 8,</div>
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 4, 8}&quot;</span>);</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>      bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 3, 4, 6, 8}&quot;</span>);</div>
 <div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span> </div>
 <div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>  U sum = 0;</div>
 <div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span> </div>
@@ -122,1803 +122,1797 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>    }</div>
 <div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  }</div>
 <div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span> </div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i += 4) {</div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>      x_thread[i] = x[i];</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>      x_thread[i + 1] = x[i + 1] / 16.0f;</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>      x_thread[i + 2] = x[i + 2] / 256.0f;</div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>      x_thread[i + 3] = x[i + 3] / 4096.0f;</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>    }</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  }</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span> </div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i++) {</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>      sum += x[i];</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>      x_thread[i] = x[i];</div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>    }</div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  }</div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span> </div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>  <span class="keywordflow">return</span> sum;</div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>}</div>
-</div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span> </div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen00052" data-start="{" data-end="}">
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno"><a class="line" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">   52</a></span><span class="keyword">inline</span> U <a class="code hl_function" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe</a>(<span class="keyword">const</span> device T* x, thread U* x_thread, <span class="keywordtype">int</span> N) {</div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>      bits == 2 || bits == 4 || bits == 8,</div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 4, 8}&quot;</span>);</div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span> </div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>  U sum = 0;</div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span> </div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i += 4) {</div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>      x_thread[i] = x[i];</div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>      x_thread[i + 1] = x[i + 1] / 4.0f;</div>
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>      x_thread[i + 2] = x[i + 2] / 16.0f;</div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>      x_thread[i + 3] = x[i + 3] / 64.0f;</div>
-<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>    }</div>
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = N; i &lt; values_per_thread; i++) {</div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>      x_thread[i] = 0;</div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>    }</div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>  }</div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span> </div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
-<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i += 4) {</div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>      x_thread[i] = x[i];</div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>      x_thread[i + 1] = x[i + 1] / 16.0f;</div>
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>      x_thread[i + 2] = x[i + 2] / 256.0f;</div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>      x_thread[i + 3] = x[i + 3] / 4096.0f;</div>
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    }</div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = N; i &lt; values_per_thread; i++) {</div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>      x_thread[i] = 0;</div>
-<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    }</div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>  }</div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span> </div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i++) {</div>
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>      sum += x[i];</div>
-<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>      x_thread[i] = x[i];</div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>    }</div>
-<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = N; i &lt; values_per_thread; i++) {</div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>      x_thread[i] = 0;</div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>    }</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>  }</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span> </div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>  <span class="keywordflow">return</span> sum;</div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>}</div>
-</div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span> </div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen00099" data-start="{" data-end="}">
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno"><a class="line" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">   99</a></span><span class="keyword">inline</span> U <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot</a>(</div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    <span class="keyword">const</span> device uint8_t* w,</div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>    <span class="keyword">const</span> thread U* x_thread,</div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    U scale,</div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    U bias,</div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    U sum) {</div>
-<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>      bits == 2 || bits == 4 || bits == 8,</div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 4, 8}&quot;</span>);</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span> </div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>  U accum = 0;</div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span> </div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>      accum +=</div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>          (x_thread[4 * i] * (w[i] &amp; 0x03) +</div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>           x_thread[4 * i + 1] * (w[i] &amp; 0x0c) +</div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>           x_thread[4 * i + 2] * (w[i] &amp; 0x30) +</div>
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>           x_thread[4 * i + 3] * (w[i] &amp; 0xc0));</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    }</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  }</div>
-<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span> </div>
-<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
-<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    <span class="keyword">const</span> device uint16_t* ws = (<span class="keyword">const</span> device uint16_t*)w;</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>      accum +=</div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>          (x_thread[4 * i] * (ws[i] &amp; 0x000f) +</div>
-<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>           x_thread[4 * i + 1] * (ws[i] &amp; 0x00f0) +</div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>           x_thread[4 * i + 2] * (ws[i] &amp; 0x0f00) +</div>
-<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>           x_thread[4 * i + 3] * (ws[i] &amp; 0xf000));</div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    }</div>
-<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  }</div>
-<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span> </div>
-<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i++) {</div>
-<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>      accum += x_thread[i] * w[i];</div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>    }</div>
-<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>  }</div>
-<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span> </div>
-<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  <span class="keywordflow">return</span> scale * accum + sum * bias;</div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>}</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 3) {</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i += 8) {</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3] + x[i + 4] + x[i + 5] +</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>          x[i + 6] + x[i + 7];</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>      x_thread[i + 1] = x[i + 1] / 8.0f;</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>      x_thread[i + 2] = x[i + 2] / 64.0f;</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>      x_thread[i + 3] = x[i + 3] / 2.0f;</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>      x_thread[i + 4] = x[i + 4] / 16.0f;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>      x_thread[i + 5] = x[i + 5] / 128.0f;</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>      x_thread[i + 6] = x[i + 6] / 4.0f;</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>      x_thread[i + 7] = x[i + 7] / 32.0f;</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>    }</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  }</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span> </div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i += 4) {</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>      x_thread[i + 1] = x[i + 1] / 16.0f;</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>      x_thread[i + 2] = x[i + 2] / 256.0f;</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>      x_thread[i + 3] = x[i + 3] / 4096.0f;</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>    }</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>  }</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span> </div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 6) {</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i += 4) {</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>      x_thread[i + 1] = x[i + 1] / 64.0f;</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>      x_thread[i + 2] = x[i + 2] / 16.0f;</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>      x_thread[i + 3] = x[i + 3] / 4.0f;</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    }</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>  }</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span> </div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i++) {</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>      sum += x[i];</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>    }</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>  }</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span> </div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>  <span class="keywordflow">return</span> sum;</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>}</div>
 </div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span> </div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen00077" data-start="{" data-end="}">
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno"><a class="line" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">   77</a></span><span class="keyword">inline</span> U <a class="code hl_function" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe</a>(<span class="keyword">const</span> device T* x, thread U* x_thread, <span class="keywordtype">int</span> N) {</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>      bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 3, 4, 6, 8}&quot;</span>);</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span> </div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>  U sum = 0;</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span> </div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i += 4) {</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>      x_thread[i + 1] = x[i + 1] / 4.0f;</div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>      x_thread[i + 2] = x[i + 2] / 16.0f;</div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>      x_thread[i + 3] = x[i + 3] / 64.0f;</div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>    }</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>  }</div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span> </div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 3) {</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i += 8) {</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3] + x[i + 4] + x[i + 5] +</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>          x[i + 6] + x[i + 7];</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span> </div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>      x_thread[i + 1] = x[i + 1] / 8.0f;</div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>      x_thread[i + 2] = x[i + 2] / 64.0f;</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>      x_thread[i + 3] = x[i + 3] / 2.0f;</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>      x_thread[i + 4] = x[i + 4] / 16.0f;</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>      x_thread[i + 5] = x[i + 5] / 128.0f;</div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>      x_thread[i + 6] = x[i + 6] / 4.0f;</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>      x_thread[i + 7] = x[i + 7] / 32.0f;</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>    }</div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  }</div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span> </div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i += 4) {</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>      x_thread[i + 1] = x[i + 1] / 16.0f;</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>      x_thread[i + 2] = x[i + 2] / 256.0f;</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>      x_thread[i + 3] = x[i + 3] / 4096.0f;</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span>    }</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>  }</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span> </div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 6) {</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i += 4) {</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>      sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>      x_thread[i + 1] = x[i + 1] / 64.0f;</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>      x_thread[i + 2] = x[i + 2] / 16.0f;</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>      x_thread[i + 3] = x[i + 3] / 4.0f;</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    }</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>  }</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span> </div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i++) {</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>      sum += x[i];</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>      x_thread[i] = x[i];</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>    }</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>  }</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span> </div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = N; i &lt; values_per_thread; i++) {</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>    x_thread[i] = 0;</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>  }</div>
 <div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span> </div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen00142" data-start="{" data-end="}">
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno"><a class="line" href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">  142</a></span><span class="keyword">inline</span> U <a class="code hl_function" href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">qdot_safe</a>(</div>
-<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>    <span class="keyword">const</span> device uint8_t* w,</div>
-<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>    <span class="keyword">const</span> thread U* x_thread,</div>
-<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>    U scale,</div>
-<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>    U bias,</div>
-<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>    U sum,</div>
-<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>    <span class="keywordtype">int</span> N) {</div>
-<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>      bits == 2 || bits == 4 || bits == 8,</div>
-<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 4, 8}&quot;</span>);</div>
-<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span> </div>
-<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>  U accum = 0;</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  <span class="keywordflow">return</span> sum;</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>}</div>
+</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span> </div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen00145" data-start="{" data-end="}">
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno"><a class="line" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">  145</a></span><span class="keyword">inline</span> U <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot</a>(</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>    <span class="keyword">const</span> device uint8_t* w,</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>    <span class="keyword">const</span> thread U* x_thread,</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>    U scale,</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>    U bias,</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>    U sum) {</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>      bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 3, 4, 6, 8}&quot;</span>);</div>
 <div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span> </div>
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>      accum +=</div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>          (x_thread[4 * i] * (w[i] &amp; 0x03) +</div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>           x_thread[4 * i + 1] * (w[i] &amp; 0x0c) +</div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>           x_thread[4 * i + 2] * (w[i] &amp; 0x30) +</div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>           x_thread[4 * i + 3] * (w[i] &amp; 0xc0));</div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>    }</div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>  }</div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span> </div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keyword">const</span> device uint16_t* ws = (<span class="keyword">const</span> device uint16_t*)w;</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>      accum +=</div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>          (x_thread[4 * i] * (ws[i] &amp; 0x000f) +</div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>           x_thread[4 * i + 1] * (ws[i] &amp; 0x00f0) +</div>
-<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>           x_thread[4 * i + 2] * (ws[i] &amp; 0x0f00) +</div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>           x_thread[4 * i + 3] * (ws[i] &amp; 0xf000));</div>
-<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    }</div>
-<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>  }</div>
-<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span> </div>
-<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i++) {</div>
-<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>      accum += x_thread[i] * w[i];</div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>    }</div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>  }</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>  U accum = 0;</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span> </div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>      accum +=</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>          (x_thread[4 * i] * (w[i] &amp; 0x03) +</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>           x_thread[4 * i + 1] * (w[i] &amp; 0x0c) +</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>           x_thread[4 * i + 2] * (w[i] &amp; 0x30) +</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>           x_thread[4 * i + 3] * (w[i] &amp; 0xc0));</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>    }</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>  }</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span> </div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 3) {</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 8); i++) {</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>      x_thread += 8 * i;</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>      w += 3 * i;</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span> </div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>      accum += (w[0] &amp; 0x07) * x_thread[0];</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>      accum += (w[0] &amp; 0x38) * x_thread[1];</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>      accum += (w[0] &amp; 0xc0) * x_thread[2];</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>      accum += (w[1] &amp; 0x01) * (x_thread[2] * 256.0f);</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span> </div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>      accum += (w[1] &amp; 0x0e) * x_thread[3];</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>      accum += (w[1] &amp; 0x70) * x_thread[4];</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>      accum += (w[1] &amp; 0x80) * x_thread[5];</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>      accum += (w[2] &amp; 0x03) * (x_thread[5] * 256.0f);</div>
 <div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span> </div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>  <span class="keywordflow">return</span> scale * accum + sum * bias;</div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>}</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>      accum += (w[2] &amp; 0x1c) * x_thread[6];</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>      accum += (w[2] &amp; 0xe0) * x_thread[7];</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>    }</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>  }</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span> </div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    <span class="keyword">const</span> device uint16_t* ws = (<span class="keyword">const</span> device uint16_t*)w;</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>      accum +=</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>          (x_thread[4 * i] * (ws[i] &amp; 0x000f) +</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>           x_thread[4 * i + 1] * (ws[i] &amp; 0x00f0) +</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>           x_thread[4 * i + 2] * (ws[i] &amp; 0x0f00) +</div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>           x_thread[4 * i + 3] * (ws[i] &amp; 0xf000));</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    }</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>  }</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span> </div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 6) {</div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>      x_thread += 4 * i;</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>      w += 3 * i;</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span> </div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>      accum += (w[0] &amp; 0x3f) * x_thread[0];</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span> </div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      accum += (w[0] &amp; 0xc0) * x_thread[1];</div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>      accum += (w[1] &amp; 0x0f) * (x_thread[1] * 256.0f);</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span> </div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>      accum += (w[1] &amp; 0xf0) * x_thread[2];</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>      accum += (w[2] &amp; 0x03) * (x_thread[2] * 256.0f);</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span> </div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>      accum += (w[2] &amp; 0xfc) * x_thread[3];</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>    }</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>  }</div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span> </div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i++) {</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>      accum += x_thread[i] * w[i];</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>    }</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>  }</div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span> </div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>  <span class="keywordflow">return</span> scale * accum + sum * bias;</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>}</div>
 </div>
-<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span> </div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span><span class="keyword">inline</span> <span class="keywordtype">void</span></div>
-<div class="foldopen" id="foldopen00187" data-start="{" data-end="}">
-<div class="line"><a id="l00187" name="l00187"></a><span class="lineno"><a class="line" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">  187</a></span><a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter</a>(<span class="keyword">const</span> thread uint8_t* w, U x, U scale, U bias, thread U* result) {</div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>      bits == 2 || bits == 4 || bits == 8,</div>
-<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 4, 8}&quot;</span>);</div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span> </div>
-<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
-<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>    U s[4] = {scale, scale / 4.0f, scale / 16.0f, scale / 64.0f};</div>
-<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
-<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>      result[4 * i] += x * (s[0] * (w[i] &amp; 0x03) + bias);</div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>      result[4 * i + 1] += x * (s[1] * (w[i] &amp; 0x0c) + bias);</div>
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>      result[4 * i + 2] += x * (s[2] * (w[i] &amp; 0x30) + bias);</div>
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>      result[4 * i + 3] += x * (s[3] * (w[i] &amp; 0xc0) + bias);</div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>    }</div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>  }</div>
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span> </div>
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
-<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>    U s[2] = {scale, scale / 16.0f};</div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 2); i++) {</div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      result[2 * i] += x * (s[0] * (w[i] &amp; 0x0f) + bias);</div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>      result[2 * i + 1] += x * (s[1] * (w[i] &amp; 0xf0) + bias);</div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>    }</div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>  }</div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span> </div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i++) {</div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>      result[i] += x * (scale * w[i] + bias);</div>
-<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>    }</div>
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>  }</div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>}</div>
-</div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span> </div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> N, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span><span class="keyword">inline</span> <span class="keywordtype">void</span></div>
-<div class="foldopen" id="foldopen00219" data-start="{" data-end="}">
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno"><a class="line" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">  219</a></span><a class="code hl_function" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize</a>(<span class="keyword">const</span> device uint8_t* w, U scale, U bias, threadgroup U* w_local) {</div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>      bits == 2 || bits == 4 || bits == 8,</div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 4, 8}&quot;</span>);</div>
 <div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span> </div>
-<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
-<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>    U s[4] = {</div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>        scale,</div>
-<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>        scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(4.0f),</div>
-<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>        scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(16.0f),</div>
-<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>        scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(64.0f)};</div>
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>      w_local[4 * i] = s[0] * (w[i] &amp; 0x03) + bias;</div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>      w_local[4 * i + 1] = s[1] * (w[i] &amp; 0x0c) + bias;</div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>      w_local[4 * i + 2] = s[2] * (w[i] &amp; 0x30) + bias;</div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>      w_local[4 * i + 3] = s[3] * (w[i] &amp; 0xc0) + bias;</div>
-<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>    }</div>
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>  }</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen00225" data-start="{" data-end="}">
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno"><a class="line" href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">  225</a></span><span class="keyword">inline</span> U <a class="code hl_function" href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">qdot_safe</a>(</div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    <span class="keyword">const</span> device uint8_t* w,</div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>    <span class="keyword">const</span> thread U* x_thread,</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>    U scale,</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>    U bias,</div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    U sum,</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>    <span class="keywordtype">int</span> N) {</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>      bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 3, 4, 6, 8}&quot;</span>);</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span> </div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>  U accum = 0;</div>
 <div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span> </div>
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    U s[2] = {scale, scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(16.0f)};</div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 2); i++) {</div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>      w_local[2 * i] = s[0] * (w[i] &amp; 0x0f) + bias;</div>
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>      w_local[2 * i + 1] = s[1] * (w[i] &amp; 0xf0) + bias;</div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>    }</div>
-<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>  }</div>
-<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span> </div>
-<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i++) {</div>
-<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>      w_local[i] = scale * w[i] + bias;</div>
-<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    }</div>
-<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>  }</div>
-<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>}</div>
-</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>      accum +=</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>          (x_thread[4 * i] * (w[i] &amp; 0x03) +</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>           x_thread[4 * i + 1] * (w[i] &amp; 0x0c) +</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>           x_thread[4 * i + 2] * (w[i] &amp; 0x30) +</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>           x_thread[4 * i + 3] * (w[i] &amp; 0xc0));</div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>    }</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>  }</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span> </div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 3) {</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 8); i++) {</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>      x_thread += 8 * i;</div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>      w += 3 * i;</div>
 <div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span> </div>
-<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>    <span class="keywordtype">short</span> BROWS,</div>
-<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>    <span class="keywordtype">short</span> BCOLS,</div>
-<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>    <span class="keywordtype">short</span> dst_ld,</div>
-<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>    <span class="keywordtype">short</span> reduction_dim,</div>
-<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>    <span class="keywordtype">short</span> tgp_size,</div>
-<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>    <span class="keywordtype">short</span> group_size,</div>
-<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>    <span class="keywordtype">short</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>&gt;</div>
-<div class="foldopen" id="foldopen00262" data-start="{" data-end="};">
-<div class="line"><a id="l00262" name="l00262"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html">  262</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a> {</div>
-<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>      BCOLS &lt;= group_size,</div>
-<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>      <span class="stringliteral">&quot;The group size should be larger than the columns&quot;</span>);</div>
-<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>      group_size % BCOLS == 0,</div>
-<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>      <span class="stringliteral">&quot;The group size should be divisible by the columns&quot;</span>);</div>
-<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>      bits == 2 || bits == 4 || bits == 8,</div>
-<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 4, 8}&quot;</span>);</div>
-<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span> </div>
-<div class="line"><a id="l00273" name="l00273"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">  273</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a> = 32 / bits;</div>
-<div class="line"><a id="l00274" name="l00274"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">  274</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> = BCOLS / <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>;</div>
-<div class="line"><a id="l00275" name="l00275"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">  275</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> =</div>
-<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>      (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS &lt; tgp_size) ? 1 : (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS) / tgp_size;</div>
-<div class="line"><a id="l00277" name="l00277"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">  277</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a> = group_size / BCOLS;</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>      accum += (w[0] &amp; 0x07) * x_thread[0];</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>      accum += (w[0] &amp; 0x38) * x_thread[1];</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>      accum += (w[0] &amp; 0xc0) * x_thread[2];</div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>      accum += (w[1] &amp; 0x01) * (x_thread[2] * 256.0f);</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span> </div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>      accum += (w[1] &amp; 0x0e) * x_thread[3];</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>      accum += (w[1] &amp; 0x70) * x_thread[4];</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>      accum += (w[1] &amp; 0x80) * x_thread[5];</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>      accum += (w[2] &amp; 0x03) * (x_thread[5] * 256.0f);</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span> </div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>      accum += (w[2] &amp; 0x1c) * x_thread[6];</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>      accum += (w[2] &amp; 0xe0) * x_thread[7];</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>    }</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>  }</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span> </div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>    <span class="keyword">const</span> device uint16_t* ws = (<span class="keyword">const</span> device uint16_t*)w;</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>      accum +=</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>          (x_thread[4 * i] * (ws[i] &amp; 0x000f) +</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>           x_thread[4 * i + 1] * (ws[i] &amp; 0x00f0) +</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>           x_thread[4 * i + 2] * (ws[i] &amp; 0x0f00) +</div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>           x_thread[4 * i + 3] * (ws[i] &amp; 0xf000));</div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>    }</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>  }</div>
 <div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span> </div>
-<div class="line"><a id="l00279" name="l00279"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">  279</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a>;</div>
-<div class="line"><a id="l00280" name="l00280"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">  280</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a>;</div>
-<div class="line"><a id="l00281" name="l00281"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">  281</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a>;</div>
-<div class="line"><a id="l00282" name="l00282"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">  282</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>;</div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 6) {</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>      x_thread += 4 * i;</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>      w += 3 * i;</div>
 <div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span> </div>
-<div class="line"><a id="l00284" name="l00284"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">  284</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a>;</div>
-<div class="line"><a id="l00285" name="l00285"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">  285</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a>;</div>
-<div class="line"><a id="l00286" name="l00286"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">  286</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a>;</div>
-<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span> </div>
-<div class="line"><a id="l00288" name="l00288"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">  288</a></span>  threadgroup T* <a class="code hl_variable" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">dst</a>;</div>
-<div class="line"><a id="l00289" name="l00289"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">  289</a></span>  <span class="keyword">const</span> device uint32_t* <a class="code hl_variable" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">src</a>;</div>
-<div class="line"><a id="l00290" name="l00290"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">  290</a></span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>;</div>
-<div class="line"><a id="l00291" name="l00291"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">  291</a></span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>;</div>
-<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span> </div>
-<div class="foldopen" id="foldopen00293" data-start="{" data-end="}">
-<div class="line"><a id="l00293" name="l00293"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93">  293</a></span>  <a class="code hl_function" href="struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93">QuantizedBlockLoader</a>(</div>
-<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span>      <span class="keyword">const</span> device uint32_t* src_,</div>
-<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>      <span class="keyword">const</span> device T* scales_,</div>
-<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>      <span class="keyword">const</span> device T* biases_,</div>
-<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> src_ld_,</div>
-<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>      threadgroup T* dst_,</div>
-<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>      ushort simd_group_id [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>      ushort simd_lane_id [[thread_index_in_simdgroup]])</div>
-<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>      : <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a>(src_ld_),</div>
-<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a>(</div>
-<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>            reduction_dim ? <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> : BROWS * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>),</div>
-<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a>(0),</div>
-<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>(BROWS * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / group_size),</div>
-<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a>(simd_group_id * 32 + simd_lane_id),</div>
-<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a>(<a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a> / <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a>),</div>
-<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a>((<a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a>) % <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a>),</div>
-<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>        dst(dst_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * dst_ld + <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>),</div>
-<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">src</a>(src_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a> + <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a>),</div>
-<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>(scales_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / group_size),</div>
-<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>(biases_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / group_size) {}</div>
-</div>
-<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span> </div>
-<div class="foldopen" id="foldopen00314" data-start="{" data-end="}">
-<div class="line"><a id="l00314" name="l00314"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">  314</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">load_unsafe</a>()<span class="keyword"> const </span>{</div>
-<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS &lt; tgp_size &amp;&amp; bi &gt;= BROWS) {</div>
-<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>      <span class="keywordflow">return</span>;</div>
-<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>    }</div>
-<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span> </div>
-<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>    T scale = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>;</div>
-<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>    T bias = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>;</div>
-<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a>; i++) {</div>
-<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>      <a class="code hl_function" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize&lt;T, pack_factor, bits&gt;</a>(</div>
-<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>          (device uint8_t*)(<a class="code hl_variable" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">src</a> + i), scale, bias, dst + i * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>);</div>
-<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>    }</div>
-<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>  }</div>
-</div>
-<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span> </div>
-<div class="foldopen" id="foldopen00327" data-start="{" data-end="}">
-<div class="line"><a id="l00327" name="l00327"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">  327</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">load_safe</a>(short2 src_tile_dim)<span class="keyword"> const </span>{</div>
-<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS &lt; tgp_size &amp;&amp; bi &gt;= BROWS) {</div>
-<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>      <span class="keywordflow">return</span>;</div>
-<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>    }</div>
-<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span> </div>
-<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span>    <span class="keywordflow">if</span> (reduction_dim == 1 &amp;&amp; <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> &gt;= src_tile_dim.y) {</div>
-<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>; i++) {</div>
-<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>        dst[i] = T(0);</div>
-<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>      }</div>
-<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>      <span class="keywordflow">return</span>;</div>
-<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>    }</div>
-<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span> </div>
-<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>    <span class="keywordflow">if</span> (reduction_dim == 0 &amp;&amp; <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> &gt;= src_tile_dim.x) {</div>
-<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>; i++) {</div>
-<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span>        dst[i] = T(0);</div>
-<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>      }</div>
-<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>      <span class="keywordflow">return</span>;</div>
-<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>    }</div>
-<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span> </div>
-<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>    T scale = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>;</div>
-<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span>    T bias = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>;</div>
-<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a>; i++) {</div>
-<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>      <a class="code hl_function" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize&lt;T, pack_factor, bits&gt;</a>(</div>
-<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>          (device uint8_t*)(<a class="code hl_variable" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">src</a> + i), scale, bias, dst + i * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>);</div>
-<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>    }</div>
-<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>  }</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>      accum += (w[0] &amp; 0x3f) * x_thread[0];</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span> </div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>      accum += (w[0] &amp; 0xc0) * x_thread[1];</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>      accum += (w[1] &amp; 0x0f) * (x_thread[1] * 256.0f);</div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span> </div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>      accum += (w[1] &amp; 0xf0) * x_thread[2];</div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>      accum += (w[2] &amp; 0x03) * (x_thread[2] * 256.0f);</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span> </div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>      accum += (w[2] &amp; 0xfc) * x_thread[3];</div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>    }</div>
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span>  }</div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span> </div>
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i++) {</div>
+<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>      accum += x_thread[i] * w[i];</div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>    }</div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>  }</div>
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span> </div>
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>  <span class="keywordflow">return</span> scale * accum + sum * bias;</div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>}</div>
 </div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span> </div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> values_per_thread, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span><span class="keyword">inline</span> <span class="keywordtype">void</span></div>
+<div class="foldopen" id="foldopen00307" data-start="{" data-end="}">
+<div class="line"><a id="l00307" name="l00307"></a><span class="lineno"><a class="line" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">  307</a></span><a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter</a>(<span class="keyword">const</span> thread uint8_t* w, U x, U scale, U bias, thread U* result) {</div>
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>      bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,</div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 3, 4, 6, 8}&quot;</span>);</div>
+<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span> </div>
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>    U s[4] = {scale, scale / 4.0f, scale / 16.0f, scale / 64.0f};</div>
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
+<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>      result[4 * i] += x * (s[0] * (w[i] &amp; 0x03) + bias);</div>
+<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>      result[4 * i + 1] += x * (s[1] * (w[i] &amp; 0x0c) + bias);</div>
+<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>      result[4 * i + 2] += x * (s[2] * (w[i] &amp; 0x30) + bias);</div>
+<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span>      result[4 * i + 3] += x * (s[3] * (w[i] &amp; 0xc0) + bias);</div>
+<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>    }</div>
+<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>  }</div>
+<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span> </div>
+<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 3) {</div>
+<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 8); i++) {</div>
+<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>      uint8_t w0 = w[3 * i];</div>
+<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>      uint8_t w1 = w[3 * i + 1];</div>
+<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span>      uint8_t w2 = w[3 * i + 2];</div>
+<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span> </div>
+<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>      result[8 * i] += x * ((w0 &amp; 0x7) * scale + bias);</div>
+<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>      result[8 * i + 1] += x * (((w0 &amp; 0x38) &gt;&gt; 3) * scale + bias);</div>
+<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>      result[8 * i + 2] +=</div>
+<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>          x * ((((w0 &amp; 0xc0) &gt;&gt; 6) + ((w1 &amp; 0x1) &lt;&lt; 2)) * scale + bias);</div>
+<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span>      result[8 * i + 3] += x * (((w1 &amp; 0xe) &gt;&gt; 1) * scale + bias);</div>
+<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>      result[8 * i + 4] += x * (((w1 &amp; 0x70) &gt;&gt; 4) * scale + bias);</div>
+<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>      result[8 * i + 5] +=</div>
+<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>          x * ((((w1 &amp; 0x80) &gt;&gt; 7) + ((w2 &amp; 0x3) &lt;&lt; 1)) * scale + bias);</div>
+<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>      result[8 * i + 6] += x * (((w2 &amp; 0x1c) &gt;&gt; 2) * scale + bias);</div>
+<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>      result[8 * i + 7] += x * (((w2 &amp; 0xe0) &gt;&gt; 5) * scale + bias);</div>
+<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span>    }</div>
+<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>  }</div>
+<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span> </div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
+<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>    U s[2] = {scale, scale / 16.0f};</div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 2); i++) {</div>
+<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>      result[2 * i] += x * (s[0] * (w[i] &amp; 0x0f) + bias);</div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>      result[2 * i + 1] += x * (s[1] * (w[i] &amp; 0xf0) + bias);</div>
+<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>    }</div>
+<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span> </div>
+<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>  } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 6) {</div>
+<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (values_per_thread / 4); i++) {</div>
+<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>      uint8_t w0 = w[3 * i];</div>
+<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>      uint8_t w1 = w[3 * i + 1];</div>
+<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>      uint8_t w2 = w[3 * i + 2];</div>
 <div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span> </div>
-<div class="foldopen" id="foldopen00354" data-start="{" data-end="}">
-<div class="line"><a id="l00354" name="l00354"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">  354</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">next</a>() {</div>
-<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>    <a class="code hl_variable" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">src</a> += <a class="code hl_variable" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a>;</div>
-<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>    <span class="keywordflow">if</span> (reduction_dim == 1) {</div>
-<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span>      <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a> &gt; 1) {</div>
-<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a>++;</div>
-<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>        <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a> == <a class="code hl_variable" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a>) {</div>
-<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>          <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a> = 0;</div>
-<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>          <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>++;</div>
-<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>          <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>++;</div>
-<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>        }</div>
-<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>      } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>++;</div>
-<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>++;</div>
-<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span>      }</div>
-<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span>      <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a> += <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>;</div>
-<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span>      <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a> += <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>;</div>
-<div class="line"><a id="l00371" name="l00371"></a><span class="lineno">  371</span>    }</div>
-<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span>  }</div>
+<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>      result[4 * i] += x * ((w0 &amp; 0x3f) * scale + bias);</div>
+<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>      result[4 * i + 1] +=</div>
+<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>          x * ((((w0 &gt;&gt; 6) &amp; 0x03) + ((w1 &amp; 0x0f) &lt;&lt; 2)) * scale + bias);</div>
+<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span>      result[4 * i + 2] +=</div>
+<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>          x * ((((w1 &gt;&gt; 4) &amp; 0x0f) + ((w2 &amp; 0x03) &lt;&lt; 4)) * scale + bias);</div>
+<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>      result[4 * i + 3] += x * (((w2 &gt;&gt; 2) &amp; 0x3f) * scale + bias);</div>
+<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>    }</div>
+<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>  }</div>
+<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span> </div>
+<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_thread; i++) {</div>
+<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span>      result[i] += x * (scale * w[i] + bias);</div>
+<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>    }</div>
+<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span>  }</div>
+<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span>}</div>
 </div>
-<div class="line"><a id="l00373" name="l00373"></a><span class="lineno">  373</span>};</div>
-</div>
-<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span> </div>
-<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">int</span> D&gt;</div>
-<div class="foldopen" id="foldopen00376" data-start="{" data-end="}">
-<div class="line"><a id="l00376" name="l00376"></a><span class="lineno"><a class="line" href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">  376</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">qmv_quad_impl</a>(</div>
-<div class="line"><a id="l00377" name="l00377"></a><span class="lineno">  377</span>    <span class="keyword">const</span> device uint32_t* w,</div>
-<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span>    <span class="keyword">const</span> device T* scales,</div>
-<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span>    <span class="keyword">const</span> device T* biases,</div>
-<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>    <span class="keyword">const</span> device T* x,</div>
-<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span>    device T* y,</div>
-<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span>    constant <span class="keywordtype">int</span>&amp; in_vec_size,</div>
-<div class="line"><a id="l00383" name="l00383"></a><span class="lineno">  383</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size,</div>
-<div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00385" name="l00385"></a><span class="lineno">  385</span>    uint quad_gid [[quadgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l00386" name="l00386"></a><span class="lineno">  386</span>    uint quad_lid [[thread_index_in_quadgroup]]) {</div>
-<div class="line"><a id="l00387" name="l00387"></a><span class="lineno">  387</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> quads_per_simd = <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a> / <a class="code hl_variable" href="quantized_8h.html#a803e4d5a1459844ba647aea5b004e133">QUAD_SIZE</a>;</div>
-<div class="line"><a id="l00388" name="l00388"></a><span class="lineno">  388</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = 32 / bits;</div>
-<div class="line"><a id="l00389" name="l00389"></a><span class="lineno">  389</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_thread = D / <a class="code hl_variable" href="quantized_8h.html#a803e4d5a1459844ba647aea5b004e133">QUAD_SIZE</a>;</div>
-<div class="line"><a id="l00390" name="l00390"></a><span class="lineno">  390</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_thread = values_per_thread / pack_factor;</div>
-<div class="line"><a id="l00391" name="l00391"></a><span class="lineno">  391</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> scale_step_per_thread = group_size / values_per_thread;</div>
-<div class="line"><a id="l00392" name="l00392"></a><span class="lineno">  392</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> results_per_quadgroup = 8;</div>
-<div class="line"><a id="l00393" name="l00393"></a><span class="lineno">  393</span> </div>
-<div class="line"><a id="l00394" name="l00394"></a><span class="lineno">  394</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span> </div>
+<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> U, <span class="keywordtype">int</span> N, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="line"><a id="l00371" name="l00371"></a><span class="lineno">  371</span><span class="keyword">inline</span> <span class="keywordtype">void</span></div>
+<div class="foldopen" id="foldopen00372" data-start="{" data-end="}">
+<div class="line"><a id="l00372" name="l00372"></a><span class="lineno"><a class="line" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">  372</a></span><a class="code hl_function" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize</a>(<span class="keyword">const</span> device uint8_t* w, U scale, U bias, threadgroup U* w_local) {</div>
+<div class="line"><a id="l00373" name="l00373"></a><span class="lineno">  373</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span>      bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,</div>
+<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 3, 4, 6, 8}&quot;</span>);</div>
+<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span> </div>
+<div class="line"><a id="l00377" name="l00377"></a><span class="lineno">  377</span>  <span class="keywordflow">if</span> (bits == 2) {</div>
+<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span>    U s[4] = {</div>
+<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span>        scale,</div>
+<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>        scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(4.0f),</div>
+<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span>        scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(16.0f),</div>
+<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span>        scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(64.0f)};</div>
+<div class="line"><a id="l00383" name="l00383"></a><span class="lineno">  383</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
+<div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span>      w_local[4 * i] = s[0] * (w[i] &amp; 0x03) + bias;</div>
+<div class="line"><a id="l00385" name="l00385"></a><span class="lineno">  385</span>      w_local[4 * i + 1] = s[1] * (w[i] &amp; 0x0c) + bias;</div>
+<div class="line"><a id="l00386" name="l00386"></a><span class="lineno">  386</span>      w_local[4 * i + 2] = s[2] * (w[i] &amp; 0x30) + bias;</div>
+<div class="line"><a id="l00387" name="l00387"></a><span class="lineno">  387</span>      w_local[4 * i + 3] = s[3] * (w[i] &amp; 0xc0) + bias;</div>
+<div class="line"><a id="l00388" name="l00388"></a><span class="lineno">  388</span>    }</div>
+<div class="line"><a id="l00389" name="l00389"></a><span class="lineno">  389</span>  }</div>
+<div class="line"><a id="l00390" name="l00390"></a><span class="lineno">  390</span> </div>
+<div class="line"><a id="l00391" name="l00391"></a><span class="lineno">  391</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 3) {</div>
+<div class="line"><a id="l00392" name="l00392"></a><span class="lineno">  392</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 8); i++) {</div>
+<div class="line"><a id="l00393" name="l00393"></a><span class="lineno">  393</span>      w_local += 8 * i;</div>
+<div class="line"><a id="l00394" name="l00394"></a><span class="lineno">  394</span>      w += 3 * i;</div>
 <div class="line"><a id="l00395" name="l00395"></a><span class="lineno">  395</span> </div>
-<div class="line"><a id="l00396" name="l00396"></a><span class="lineno">  396</span>  thread U x_thread[values_per_thread];</div>
-<div class="line"><a id="l00397" name="l00397"></a><span class="lineno">  397</span>  thread U result[results_per_quadgroup] = {0};</div>
-<div class="line"><a id="l00398" name="l00398"></a><span class="lineno">  398</span> </div>
-<div class="line"><a id="l00399" name="l00399"></a><span class="lineno">  399</span>  <span class="comment">// Adjust positions</span></div>
-<div class="line"><a id="l00400" name="l00400"></a><span class="lineno">  400</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_w = in_vec_size / pack_factor;</div>
-<div class="line"><a id="l00401" name="l00401"></a><span class="lineno">  401</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_g = in_vec_size / group_size;</div>
-<div class="line"><a id="l00402" name="l00402"></a><span class="lineno">  402</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_row = tid.x * quads_per_simd * results_per_quadgroup + quad_gid;</div>
-<div class="line"><a id="l00403" name="l00403"></a><span class="lineno">  403</span> </div>
-<div class="line"><a id="l00404" name="l00404"></a><span class="lineno">  404</span>  w += out_row * in_vec_size_w + quad_lid * packs_per_thread;</div>
-<div class="line"><a id="l00405" name="l00405"></a><span class="lineno">  405</span>  scales += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00406" name="l00406"></a><span class="lineno">  406</span>  biases += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00407" name="l00407"></a><span class="lineno">  407</span>  x += tid.y * in_vec_size + quad_lid * values_per_thread;</div>
-<div class="line"><a id="l00408" name="l00408"></a><span class="lineno">  408</span>  y += tid.y * out_vec_size + out_row;</div>
-<div class="line"><a id="l00409" name="l00409"></a><span class="lineno">  409</span> </div>
-<div class="line"><a id="l00410" name="l00410"></a><span class="lineno">  410</span>  U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
-<div class="line"><a id="l00411" name="l00411"></a><span class="lineno">  411</span> </div>
-<div class="line"><a id="l00412" name="l00412"></a><span class="lineno">  412</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_quadgroup; row++) {</div>
-<div class="line"><a id="l00413" name="l00413"></a><span class="lineno">  413</span>    <span class="keyword">const</span> device uint8_t* wl =</div>
-<div class="line"><a id="l00414" name="l00414"></a><span class="lineno">  414</span>        (<span class="keyword">const</span> device uint8_t*)(w + row * in_vec_size_w * quads_per_simd);</div>
-<div class="line"><a id="l00415" name="l00415"></a><span class="lineno">  415</span>    <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g * quads_per_simd;</div>
-<div class="line"><a id="l00416" name="l00416"></a><span class="lineno">  416</span>    <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g * quads_per_simd;</div>
-<div class="line"><a id="l00417" name="l00417"></a><span class="lineno">  417</span> </div>
-<div class="line"><a id="l00418" name="l00418"></a><span class="lineno">  418</span>    U s = sl[0];</div>
-<div class="line"><a id="l00419" name="l00419"></a><span class="lineno">  419</span>    U b = bl[0];</div>
-<div class="line"><a id="l00420" name="l00420"></a><span class="lineno">  420</span>    <span class="keywordflow">if</span> (row * quads_per_simd + out_row &lt; out_vec_size) {</div>
-<div class="line"><a id="l00421" name="l00421"></a><span class="lineno">  421</span>      result[row] += <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
-<div class="line"><a id="l00422" name="l00422"></a><span class="lineno">  422</span>    }</div>
-<div class="line"><a id="l00423" name="l00423"></a><span class="lineno">  423</span>  }</div>
-<div class="line"><a id="l00424" name="l00424"></a><span class="lineno">  424</span> </div>
-<div class="line"><a id="l00425" name="l00425"></a><span class="lineno">  425</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_quadgroup; row++) {</div>
-<div class="line"><a id="l00426" name="l00426"></a><span class="lineno">  426</span>    result[row] = quad_sum(result[row]);</div>
-<div class="line"><a id="l00427" name="l00427"></a><span class="lineno">  427</span>    <span class="keywordflow">if</span> (quad_lid == 0 &amp;&amp; row * quads_per_simd + out_row &lt; out_vec_size) {</div>
-<div class="line"><a id="l00428" name="l00428"></a><span class="lineno">  428</span>      y[row * quads_per_simd] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
-<div class="line"><a id="l00429" name="l00429"></a><span class="lineno">  429</span>    }</div>
-<div class="line"><a id="l00430" name="l00430"></a><span class="lineno">  430</span>  }</div>
-<div class="line"><a id="l00431" name="l00431"></a><span class="lineno">  431</span>}</div>
+<div class="line"><a id="l00396" name="l00396"></a><span class="lineno">  396</span>      w_local[0] = (w[0] &amp; 0x7) * scale + bias;</div>
+<div class="line"><a id="l00397" name="l00397"></a><span class="lineno">  397</span>      w_local[1] = ((w[0] &amp; 0x38) &gt;&gt; 3) * scale + bias;</div>
+<div class="line"><a id="l00398" name="l00398"></a><span class="lineno">  398</span>      w_local[2] = (((w[0] &amp; 0xc0) &gt;&gt; 6) + ((w[1] &amp; 0x1) &lt;&lt; 2)) * scale + bias;</div>
+<div class="line"><a id="l00399" name="l00399"></a><span class="lineno">  399</span>      w_local[3] = ((w[1] &amp; 0xe) &gt;&gt; 1) * scale + bias;</div>
+<div class="line"><a id="l00400" name="l00400"></a><span class="lineno">  400</span>      w_local[4] = ((w[1] &amp; 0x70) &gt;&gt; 4) * scale + bias;</div>
+<div class="line"><a id="l00401" name="l00401"></a><span class="lineno">  401</span>      w_local[5] = (((w[1] &amp; 0x80) &gt;&gt; 7) + ((w[2] &amp; 0x3) &lt;&lt; 1)) * scale + bias;</div>
+<div class="line"><a id="l00402" name="l00402"></a><span class="lineno">  402</span>      w_local[6] = ((w[2] &amp; 0x1c) &gt;&gt; 2) * scale + bias;</div>
+<div class="line"><a id="l00403" name="l00403"></a><span class="lineno">  403</span>      w_local[7] = ((w[2] &amp; 0xe0) &gt;&gt; 5) * scale + bias;</div>
+<div class="line"><a id="l00404" name="l00404"></a><span class="lineno">  404</span>    }</div>
+<div class="line"><a id="l00405" name="l00405"></a><span class="lineno">  405</span>  }</div>
+<div class="line"><a id="l00406" name="l00406"></a><span class="lineno">  406</span> </div>
+<div class="line"><a id="l00407" name="l00407"></a><span class="lineno">  407</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
+<div class="line"><a id="l00408" name="l00408"></a><span class="lineno">  408</span>    U s[2] = {scale, scale / <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(16.0f)};</div>
+<div class="line"><a id="l00409" name="l00409"></a><span class="lineno">  409</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 2); i++) {</div>
+<div class="line"><a id="l00410" name="l00410"></a><span class="lineno">  410</span>      w_local[2 * i] = s[0] * (w[i] &amp; 0x0f) + bias;</div>
+<div class="line"><a id="l00411" name="l00411"></a><span class="lineno">  411</span>      w_local[2 * i + 1] = s[1] * (w[i] &amp; 0xf0) + bias;</div>
+<div class="line"><a id="l00412" name="l00412"></a><span class="lineno">  412</span>    }</div>
+<div class="line"><a id="l00413" name="l00413"></a><span class="lineno">  413</span>  }</div>
+<div class="line"><a id="l00414" name="l00414"></a><span class="lineno">  414</span> </div>
+<div class="line"><a id="l00415" name="l00415"></a><span class="lineno">  415</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 6) {</div>
+<div class="line"><a id="l00416" name="l00416"></a><span class="lineno">  416</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; (N / 4); i++) {</div>
+<div class="line"><a id="l00417" name="l00417"></a><span class="lineno">  417</span>      w_local += 4 * i;</div>
+<div class="line"><a id="l00418" name="l00418"></a><span class="lineno">  418</span>      w += 3 * i;</div>
+<div class="line"><a id="l00419" name="l00419"></a><span class="lineno">  419</span> </div>
+<div class="line"><a id="l00420" name="l00420"></a><span class="lineno">  420</span>      w_local[0] = (w[0] &amp; 0x3f) * scale + bias;</div>
+<div class="line"><a id="l00421" name="l00421"></a><span class="lineno">  421</span>      w_local[1] = (((w[0] &gt;&gt; 6) &amp; 0x03) + ((w[1] &amp; 0x0f) &lt;&lt; 2)) * scale + bias;</div>
+<div class="line"><a id="l00422" name="l00422"></a><span class="lineno">  422</span>      w_local[2] = (((w[1] &gt;&gt; 4) &amp; 0x0f) + ((w[2] &amp; 0x03) &lt;&lt; 4)) * scale + bias;</div>
+<div class="line"><a id="l00423" name="l00423"></a><span class="lineno">  423</span>      w_local[3] = ((w[2] &gt;&gt; 2) &amp; 0x3f) * scale + bias;</div>
+<div class="line"><a id="l00424" name="l00424"></a><span class="lineno">  424</span>    }</div>
+<div class="line"><a id="l00425" name="l00425"></a><span class="lineno">  425</span>  }</div>
+<div class="line"><a id="l00426" name="l00426"></a><span class="lineno">  426</span> </div>
+<div class="line"><a id="l00427" name="l00427"></a><span class="lineno">  427</span>  <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l00428" name="l00428"></a><span class="lineno">  428</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N; i++) {</div>
+<div class="line"><a id="l00429" name="l00429"></a><span class="lineno">  429</span>      w_local[i] = scale * w[i] + bias;</div>
+<div class="line"><a id="l00430" name="l00430"></a><span class="lineno">  430</span>    }</div>
+<div class="line"><a id="l00431" name="l00431"></a><span class="lineno">  431</span>  }</div>
+<div class="line"><a id="l00432" name="l00432"></a><span class="lineno">  432</span>}</div>
 </div>
-<div class="line"><a id="l00432" name="l00432"></a><span class="lineno">  432</span> </div>
-<div class="line"><a id="l00433" name="l00433"></a><span class="lineno">  433</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen00434" data-start="{" data-end="}">
-<div class="line"><a id="l00434" name="l00434"></a><span class="lineno"><a class="line" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">  434</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl</a>(</div>
-<div class="line"><a id="l00435" name="l00435"></a><span class="lineno">  435</span>    <span class="keyword">const</span> device uint32_t* w,</div>
-<div class="line"><a id="l00436" name="l00436"></a><span class="lineno">  436</span>    <span class="keyword">const</span> device T* scales,</div>
-<div class="line"><a id="l00437" name="l00437"></a><span class="lineno">  437</span>    <span class="keyword">const</span> device T* biases,</div>
-<div class="line"><a id="l00438" name="l00438"></a><span class="lineno">  438</span>    <span class="keyword">const</span> device T* x,</div>
-<div class="line"><a id="l00439" name="l00439"></a><span class="lineno">  439</span>    device T* y,</div>
-<div class="line"><a id="l00440" name="l00440"></a><span class="lineno">  440</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size,</div>
-<div class="line"><a id="l00441" name="l00441"></a><span class="lineno">  441</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size,</div>
-<div class="line"><a id="l00442" name="l00442"></a><span class="lineno">  442</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00443" name="l00443"></a><span class="lineno">  443</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l00444" name="l00444"></a><span class="lineno">  444</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l00445" name="l00445"></a><span class="lineno">  445</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_thread = bits &gt; 2 ? 2 : 1;</div>
-<div class="line"><a id="l00446" name="l00446"></a><span class="lineno">  446</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> num_simdgroups = 2;</div>
-<div class="line"><a id="l00447" name="l00447"></a><span class="lineno">  447</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> results_per_simdgroup = 4;</div>
-<div class="line"><a id="l00448" name="l00448"></a><span class="lineno">  448</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = 32 / bits;</div>
-<div class="line"><a id="l00449" name="l00449"></a><span class="lineno">  449</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_thread = pack_factor * packs_per_thread;</div>
-<div class="line"><a id="l00450" name="l00450"></a><span class="lineno">  450</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> block_size = values_per_thread * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>;</div>
-<div class="line"><a id="l00451" name="l00451"></a><span class="lineno">  451</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> scale_step_per_thread = group_size / values_per_thread;</div>
-<div class="line"><a id="l00452" name="l00452"></a><span class="lineno">  452</span> </div>
-<div class="line"><a id="l00453" name="l00453"></a><span class="lineno">  453</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
-<div class="line"><a id="l00454" name="l00454"></a><span class="lineno">  454</span> </div>
-<div class="line"><a id="l00455" name="l00455"></a><span class="lineno">  455</span>  thread U x_thread[values_per_thread];</div>
-<div class="line"><a id="l00456" name="l00456"></a><span class="lineno">  456</span>  thread U result[results_per_simdgroup] = {0};</div>
-<div class="line"><a id="l00457" name="l00457"></a><span class="lineno">  457</span> </div>
-<div class="line"><a id="l00458" name="l00458"></a><span class="lineno">  458</span>  <span class="comment">// Adjust positions</span></div>
-<div class="line"><a id="l00459" name="l00459"></a><span class="lineno">  459</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_w = in_vec_size / pack_factor;</div>
-<div class="line"><a id="l00460" name="l00460"></a><span class="lineno">  460</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_g = in_vec_size / group_size;</div>
-<div class="line"><a id="l00461" name="l00461"></a><span class="lineno">  461</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_row = tid.x * (num_simdgroups * results_per_simdgroup) +</div>
-<div class="line"><a id="l00462" name="l00462"></a><span class="lineno">  462</span>      simd_gid * results_per_simdgroup;</div>
-<div class="line"><a id="l00463" name="l00463"></a><span class="lineno">  463</span>  w += out_row * in_vec_size_w + simd_lid * packs_per_thread;</div>
-<div class="line"><a id="l00464" name="l00464"></a><span class="lineno">  464</span>  scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00465" name="l00465"></a><span class="lineno">  465</span>  biases += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00466" name="l00466"></a><span class="lineno">  466</span>  x += tid.y * in_vec_size + simd_lid * values_per_thread;</div>
-<div class="line"><a id="l00467" name="l00467"></a><span class="lineno">  467</span>  y += tid.y * out_vec_size + out_row;</div>
-<div class="line"><a id="l00468" name="l00468"></a><span class="lineno">  468</span> </div>
-<div class="line"><a id="l00469" name="l00469"></a><span class="lineno">  469</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; in_vec_size; k += block_size) {</div>
-<div class="line"><a id="l00470" name="l00470"></a><span class="lineno">  470</span>    U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
-<div class="line"><a id="l00471" name="l00471"></a><span class="lineno">  471</span> </div>
-<div class="line"><a id="l00472" name="l00472"></a><span class="lineno">  472</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
-<div class="line"><a id="l00473" name="l00473"></a><span class="lineno">  473</span>      <span class="keyword">const</span> device uint8_t* wl =</div>
-<div class="line"><a id="l00474" name="l00474"></a><span class="lineno">  474</span>          (<span class="keyword">const</span> device uint8_t*)(w + row * in_vec_size_w);</div>
-<div class="line"><a id="l00475" name="l00475"></a><span class="lineno">  475</span>      <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
-<div class="line"><a id="l00476" name="l00476"></a><span class="lineno">  476</span>      <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
-<div class="line"><a id="l00477" name="l00477"></a><span class="lineno">  477</span> </div>
-<div class="line"><a id="l00478" name="l00478"></a><span class="lineno">  478</span>      U s = sl[0];</div>
-<div class="line"><a id="l00479" name="l00479"></a><span class="lineno">  479</span>      U b = bl[0];</div>
-<div class="line"><a id="l00480" name="l00480"></a><span class="lineno">  480</span>      result[row] += <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
-<div class="line"><a id="l00481" name="l00481"></a><span class="lineno">  481</span>    }</div>
-<div class="line"><a id="l00482" name="l00482"></a><span class="lineno">  482</span> </div>
-<div class="line"><a id="l00483" name="l00483"></a><span class="lineno">  483</span>    w += block_size / pack_factor;</div>
-<div class="line"><a id="l00484" name="l00484"></a><span class="lineno">  484</span>    scales += block_size / group_size;</div>
-<div class="line"><a id="l00485" name="l00485"></a><span class="lineno">  485</span>    biases += block_size / group_size;</div>
-<div class="line"><a id="l00486" name="l00486"></a><span class="lineno">  486</span>    x += block_size;</div>
-<div class="line"><a id="l00487" name="l00487"></a><span class="lineno">  487</span>  }</div>
-<div class="line"><a id="l00488" name="l00488"></a><span class="lineno">  488</span> </div>
-<div class="line"><a id="l00489" name="l00489"></a><span class="lineno">  489</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
-<div class="line"><a id="l00490" name="l00490"></a><span class="lineno">  490</span>    result[row] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[row]);</div>
-<div class="line"><a id="l00491" name="l00491"></a><span class="lineno">  491</span>    <span class="keywordflow">if</span> (simd_lid == 0) {</div>
-<div class="line"><a id="l00492" name="l00492"></a><span class="lineno">  492</span>      y[row] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
-<div class="line"><a id="l00493" name="l00493"></a><span class="lineno">  493</span>    }</div>
-<div class="line"><a id="l00494" name="l00494"></a><span class="lineno">  494</span>  }</div>
-<div class="line"><a id="l00495" name="l00495"></a><span class="lineno">  495</span>}</div>
+<div class="line"><a id="l00433" name="l00433"></a><span class="lineno">  433</span> </div>
+<div class="line"><a id="l00434" name="l00434"></a><span class="lineno">  434</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00435" name="l00435"></a><span class="lineno">  435</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00436" name="l00436"></a><span class="lineno">  436</span>    <span class="keywordtype">short</span> BROWS,</div>
+<div class="line"><a id="l00437" name="l00437"></a><span class="lineno">  437</span>    <span class="keywordtype">short</span> BCOLS,</div>
+<div class="line"><a id="l00438" name="l00438"></a><span class="lineno">  438</span>    <span class="keywordtype">short</span> dst_ld,</div>
+<div class="line"><a id="l00439" name="l00439"></a><span class="lineno">  439</span>    <span class="keywordtype">short</span> reduction_dim,</div>
+<div class="line"><a id="l00440" name="l00440"></a><span class="lineno">  440</span>    <span class="keywordtype">short</span> tgp_size,</div>
+<div class="line"><a id="l00441" name="l00441"></a><span class="lineno">  441</span>    <span class="keywordtype">short</span> group_size,</div>
+<div class="line"><a id="l00442" name="l00442"></a><span class="lineno">  442</span>    <span class="keywordtype">short</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>&gt;</div>
+<div class="foldopen" id="foldopen00443" data-start="{" data-end="};">
+<div class="line"><a id="l00443" name="l00443"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html">  443</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a> {</div>
+<div class="line"><a id="l00444" name="l00444"></a><span class="lineno">  444</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00445" name="l00445"></a><span class="lineno">  445</span>      BCOLS &lt;= group_size,</div>
+<div class="line"><a id="l00446" name="l00446"></a><span class="lineno">  446</span>      <span class="stringliteral">&quot;The group size should be larger than the columns&quot;</span>);</div>
+<div class="line"><a id="l00447" name="l00447"></a><span class="lineno">  447</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00448" name="l00448"></a><span class="lineno">  448</span>      group_size % BCOLS == 0,</div>
+<div class="line"><a id="l00449" name="l00449"></a><span class="lineno">  449</span>      <span class="stringliteral">&quot;The group size should be divisible by the columns&quot;</span>);</div>
+<div class="line"><a id="l00450" name="l00450"></a><span class="lineno">  450</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00451" name="l00451"></a><span class="lineno">  451</span>      bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,</div>
+<div class="line"><a id="l00452" name="l00452"></a><span class="lineno">  452</span>      <span class="stringliteral">&quot;Template undefined for bits not in {2, 3, 4, 6, 8}&quot;</span>);</div>
+<div class="line"><a id="l00453" name="l00453"></a><span class="lineno">  453</span> </div>
+<div class="line"><a id="l00454" name="l00454"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">  454</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a> = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;</div>
+<div class="line"><a id="l00455" name="l00455"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">  455</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a> = (bits == 3 || bits == 6) ? 3 : 1;</div>
+<div class="line"><a id="l00456" name="l00456"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">  456</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> = BCOLS / <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>;</div>
+<div class="line"><a id="l00457" name="l00457"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">  457</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> =</div>
+<div class="line"><a id="l00458" name="l00458"></a><span class="lineno">  458</span>      (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS &lt; tgp_size) ? 1 : (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS) / tgp_size;</div>
+<div class="line"><a id="l00459" name="l00459"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">  459</a></span>  <a class="code hl_define" href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a> = group_size / BCOLS;</div>
+<div class="line"><a id="l00460" name="l00460"></a><span class="lineno">  460</span> </div>
+<div class="line"><a id="l00461" name="l00461"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">  461</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a>;</div>
+<div class="line"><a id="l00462" name="l00462"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">  462</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a>;</div>
+<div class="line"><a id="l00463" name="l00463"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">  463</a></span>  <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a>;</div>
+<div class="line"><a id="l00464" name="l00464"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">  464</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>;</div>
+<div class="line"><a id="l00465" name="l00465"></a><span class="lineno">  465</span> </div>
+<div class="line"><a id="l00466" name="l00466"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">  466</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a>;</div>
+<div class="line"><a id="l00467" name="l00467"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">  467</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a>;</div>
+<div class="line"><a id="l00468" name="l00468"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">  468</a></span>  <span class="keyword">const</span> <span class="keywordtype">short</span> <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a>;</div>
+<div class="line"><a id="l00469" name="l00469"></a><span class="lineno">  469</span> </div>
+<div class="line"><a id="l00470" name="l00470"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">  470</a></span>  threadgroup T* <a class="code hl_variable" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">dst</a>;</div>
+<div class="line"><a id="l00471" name="l00471"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">  471</a></span>  <span class="keyword">const</span> device uint8_t* <a class="code hl_variable" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">src</a>;</div>
+<div class="line"><a id="l00472" name="l00472"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">  472</a></span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>;</div>
+<div class="line"><a id="l00473" name="l00473"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">  473</a></span>  <span class="keyword">const</span> device T* <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>;</div>
+<div class="line"><a id="l00474" name="l00474"></a><span class="lineno">  474</span> </div>
+<div class="foldopen" id="foldopen00475" data-start="{" data-end="}">
+<div class="line"><a id="l00475" name="l00475"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589">  475</a></span>  <a class="code hl_function" href="struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589">QuantizedBlockLoader</a>(</div>
+<div class="line"><a id="l00476" name="l00476"></a><span class="lineno">  476</span>      <span class="keyword">const</span> device uint8_t* src_,</div>
+<div class="line"><a id="l00477" name="l00477"></a><span class="lineno">  477</span>      <span class="keyword">const</span> device T* scales_,</div>
+<div class="line"><a id="l00478" name="l00478"></a><span class="lineno">  478</span>      <span class="keyword">const</span> device T* biases_,</div>
+<div class="line"><a id="l00479" name="l00479"></a><span class="lineno">  479</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> src_ld_,</div>
+<div class="line"><a id="l00480" name="l00480"></a><span class="lineno">  480</span>      threadgroup T* dst_,</div>
+<div class="line"><a id="l00481" name="l00481"></a><span class="lineno">  481</span>      ushort simd_group_id [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00482" name="l00482"></a><span class="lineno">  482</span>      ushort simd_lane_id [[thread_index_in_simdgroup]])</div>
+<div class="line"><a id="l00483" name="l00483"></a><span class="lineno">  483</span>      : <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a>(src_ld_),</div>
+<div class="line"><a id="l00484" name="l00484"></a><span class="lineno">  484</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a>(</div>
+<div class="line"><a id="l00485" name="l00485"></a><span class="lineno">  485</span>            reduction_dim ? <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a></div>
+<div class="line"><a id="l00486" name="l00486"></a><span class="lineno">  486</span>                          : BROWS * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a> / <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>),</div>
+<div class="line"><a id="l00487" name="l00487"></a><span class="lineno">  487</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a>(0),</div>
+<div class="line"><a id="l00488" name="l00488"></a><span class="lineno">  488</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>(BROWS * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / group_size),</div>
+<div class="line"><a id="l00489" name="l00489"></a><span class="lineno">  489</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a>(simd_group_id * 32 + simd_lane_id),</div>
+<div class="line"><a id="l00490" name="l00490"></a><span class="lineno">  490</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a>(<a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a> / <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a>),</div>
+<div class="line"><a id="l00491" name="l00491"></a><span class="lineno">  491</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a>((<a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a>) % <a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a>),</div>
+<div class="line"><a id="l00492" name="l00492"></a><span class="lineno">  492</span>        dst(dst_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * dst_ld + <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>),</div>
+<div class="line"><a id="l00493" name="l00493"></a><span class="lineno">  493</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">src</a>(src_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a> / <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a> +</div>
+<div class="line"><a id="l00494" name="l00494"></a><span class="lineno">  494</span>            <a class="code hl_variable" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a>),</div>
+<div class="line"><a id="l00495" name="l00495"></a><span class="lineno">  495</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>(scales_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / group_size),</div>
+<div class="line"><a id="l00496" name="l00496"></a><span class="lineno">  496</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>(biases_ + <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a> / group_size) {}</div>
 </div>
-<div class="line"><a id="l00496" name="l00496"></a><span class="lineno">  496</span> </div>
-<div class="line"><a id="l00497" name="l00497"></a><span class="lineno">  497</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="line"><a id="l00497" name="l00497"></a><span class="lineno">  497</span> </div>
 <div class="foldopen" id="foldopen00498" data-start="{" data-end="}">
-<div class="line"><a id="l00498" name="l00498"></a><span class="lineno"><a class="line" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">  498</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl</a>(</div>
-<div class="line"><a id="l00499" name="l00499"></a><span class="lineno">  499</span>    <span class="keyword">const</span> device uint32_t* w,</div>
-<div class="line"><a id="l00500" name="l00500"></a><span class="lineno">  500</span>    <span class="keyword">const</span> device T* scales,</div>
-<div class="line"><a id="l00501" name="l00501"></a><span class="lineno">  501</span>    <span class="keyword">const</span> device T* biases,</div>
-<div class="line"><a id="l00502" name="l00502"></a><span class="lineno">  502</span>    <span class="keyword">const</span> device T* x,</div>
-<div class="line"><a id="l00503" name="l00503"></a><span class="lineno">  503</span>    device T* y,</div>
-<div class="line"><a id="l00504" name="l00504"></a><span class="lineno">  504</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size,</div>
-<div class="line"><a id="l00505" name="l00505"></a><span class="lineno">  505</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size,</div>
-<div class="line"><a id="l00506" name="l00506"></a><span class="lineno">  506</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00507" name="l00507"></a><span class="lineno">  507</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l00508" name="l00508"></a><span class="lineno">  508</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l00509" name="l00509"></a><span class="lineno">  509</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> num_simdgroups = 2;</div>
-<div class="line"><a id="l00510" name="l00510"></a><span class="lineno">  510</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> results_per_simdgroup = 4;</div>
-<div class="line"><a id="l00511" name="l00511"></a><span class="lineno">  511</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_thread = 1;</div>
-<div class="line"><a id="l00512" name="l00512"></a><span class="lineno">  512</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = 32 / bits;</div>
-<div class="line"><a id="l00513" name="l00513"></a><span class="lineno">  513</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_thread = pack_factor * packs_per_thread;</div>
-<div class="line"><a id="l00514" name="l00514"></a><span class="lineno">  514</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> block_size = values_per_thread * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>;</div>
-<div class="line"><a id="l00515" name="l00515"></a><span class="lineno">  515</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> scale_step_per_thread = group_size / values_per_thread;</div>
-<div class="line"><a id="l00516" name="l00516"></a><span class="lineno">  516</span> </div>
-<div class="line"><a id="l00517" name="l00517"></a><span class="lineno">  517</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
-<div class="line"><a id="l00518" name="l00518"></a><span class="lineno">  518</span> </div>
-<div class="line"><a id="l00519" name="l00519"></a><span class="lineno">  519</span>  thread U x_thread[values_per_thread];</div>
-<div class="line"><a id="l00520" name="l00520"></a><span class="lineno">  520</span>  thread U result[results_per_simdgroup] = {0};</div>
-<div class="line"><a id="l00521" name="l00521"></a><span class="lineno">  521</span> </div>
-<div class="line"><a id="l00522" name="l00522"></a><span class="lineno">  522</span>  <span class="comment">// Adjust positions</span></div>
-<div class="line"><a id="l00523" name="l00523"></a><span class="lineno">  523</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_w = in_vec_size / pack_factor;</div>
-<div class="line"><a id="l00524" name="l00524"></a><span class="lineno">  524</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_g = in_vec_size / group_size;</div>
-<div class="line"><a id="l00525" name="l00525"></a><span class="lineno">  525</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_row = tid.x * (num_simdgroups * results_per_simdgroup) +</div>
-<div class="line"><a id="l00526" name="l00526"></a><span class="lineno">  526</span>      simd_gid * results_per_simdgroup;</div>
-<div class="line"><a id="l00527" name="l00527"></a><span class="lineno">  527</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> used_out_row = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(out_vec_size - results_per_simdgroup, out_row);</div>
-<div class="line"><a id="l00528" name="l00528"></a><span class="lineno">  528</span> </div>
-<div class="line"><a id="l00529" name="l00529"></a><span class="lineno">  529</span>  <span class="keywordflow">if</span> (out_row &gt;= out_vec_size) {</div>
-<div class="line"><a id="l00530" name="l00530"></a><span class="lineno">  530</span>    <span class="keywordflow">return</span>;</div>
-<div class="line"><a id="l00531" name="l00531"></a><span class="lineno">  531</span>  }</div>
-<div class="line"><a id="l00532" name="l00532"></a><span class="lineno">  532</span> </div>
-<div class="line"><a id="l00533" name="l00533"></a><span class="lineno">  533</span>  <span class="comment">// In this case we need to properly guard all our reads because there isn&#39;t</span></div>
-<div class="line"><a id="l00534" name="l00534"></a><span class="lineno">  534</span>  <span class="comment">// even 1 tile in the matrix</span></div>
-<div class="line"><a id="l00535" name="l00535"></a><span class="lineno">  535</span>  <span class="keywordflow">if</span> (out_vec_size &lt; (num_simdgroups * results_per_simdgroup)) {</div>
-<div class="line"><a id="l00536" name="l00536"></a><span class="lineno">  536</span>    w += out_row * in_vec_size_w + simd_lid * packs_per_thread;</div>
-<div class="line"><a id="l00537" name="l00537"></a><span class="lineno">  537</span>    scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00538" name="l00538"></a><span class="lineno">  538</span>    biases += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00539" name="l00539"></a><span class="lineno">  539</span>    x += tid.y * in_vec_size + simd_lid * values_per_thread;</div>
-<div class="line"><a id="l00540" name="l00540"></a><span class="lineno">  540</span>    y += tid.y * out_vec_size + out_row;</div>
-<div class="line"><a id="l00541" name="l00541"></a><span class="lineno">  541</span> </div>
-<div class="line"><a id="l00542" name="l00542"></a><span class="lineno">  542</span>    <span class="keywordtype">int</span> k = 0;</div>
-<div class="line"><a id="l00543" name="l00543"></a><span class="lineno">  543</span>    <span class="keywordflow">for</span> (; k &lt; in_vec_size - block_size; k += block_size) {</div>
-<div class="line"><a id="l00544" name="l00544"></a><span class="lineno">  544</span>      U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
-<div class="line"><a id="l00545" name="l00545"></a><span class="lineno">  545</span> </div>
-<div class="line"><a id="l00546" name="l00546"></a><span class="lineno">  546</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; out_row + row &lt; out_vec_size; row++) {</div>
-<div class="line"><a id="l00547" name="l00547"></a><span class="lineno">  547</span>        <span class="keyword">const</span> device uint8_t* wl =</div>
-<div class="line"><a id="l00548" name="l00548"></a><span class="lineno">  548</span>            (<span class="keyword">const</span> device uint8_t*)(w + row * in_vec_size_w);</div>
-<div class="line"><a id="l00549" name="l00549"></a><span class="lineno">  549</span>        <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
-<div class="line"><a id="l00550" name="l00550"></a><span class="lineno">  550</span>        <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
-<div class="line"><a id="l00551" name="l00551"></a><span class="lineno">  551</span> </div>
-<div class="line"><a id="l00552" name="l00552"></a><span class="lineno">  552</span>        U s = sl[0];</div>
-<div class="line"><a id="l00553" name="l00553"></a><span class="lineno">  553</span>        U b = bl[0];</div>
-<div class="line"><a id="l00554" name="l00554"></a><span class="lineno">  554</span>        result[row] +=</div>
-<div class="line"><a id="l00555" name="l00555"></a><span class="lineno">  555</span>            <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
-<div class="line"><a id="l00556" name="l00556"></a><span class="lineno">  556</span>      }</div>
-<div class="line"><a id="l00557" name="l00557"></a><span class="lineno">  557</span> </div>
-<div class="line"><a id="l00558" name="l00558"></a><span class="lineno">  558</span>      w += block_size / pack_factor;</div>
-<div class="line"><a id="l00559" name="l00559"></a><span class="lineno">  559</span>      scales += block_size / group_size;</div>
-<div class="line"><a id="l00560" name="l00560"></a><span class="lineno">  560</span>      biases += block_size / group_size;</div>
-<div class="line"><a id="l00561" name="l00561"></a><span class="lineno">  561</span>      x += block_size;</div>
-<div class="line"><a id="l00562" name="l00562"></a><span class="lineno">  562</span>    }</div>
-<div class="line"><a id="l00563" name="l00563"></a><span class="lineno">  563</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> remaining = clamp(</div>
-<div class="line"><a id="l00564" name="l00564"></a><span class="lineno">  564</span>        <span class="keyword">static_cast&lt;</span><span class="keywordtype">int</span><span class="keyword">&gt;</span>(in_vec_size - k - simd_lid * values_per_thread),</div>
-<div class="line"><a id="l00565" name="l00565"></a><span class="lineno">  565</span>        0,</div>
-<div class="line"><a id="l00566" name="l00566"></a><span class="lineno">  566</span>        values_per_thread);</div>
-<div class="line"><a id="l00567" name="l00567"></a><span class="lineno">  567</span>    U sum =</div>
-<div class="line"><a id="l00568" name="l00568"></a><span class="lineno">  568</span>        <a class="code hl_function" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread, remaining);</div>
-<div class="line"><a id="l00569" name="l00569"></a><span class="lineno">  569</span> </div>
-<div class="line"><a id="l00570" name="l00570"></a><span class="lineno">  570</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; out_row + row &lt; out_vec_size; row++) {</div>
-<div class="line"><a id="l00571" name="l00571"></a><span class="lineno">  571</span>      <span class="keyword">const</span> device uint8_t* wl =</div>
-<div class="line"><a id="l00572" name="l00572"></a><span class="lineno">  572</span>          (<span class="keyword">const</span> device uint8_t*)(w + row * in_vec_size_w);</div>
-<div class="line"><a id="l00573" name="l00573"></a><span class="lineno">  573</span>      <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
-<div class="line"><a id="l00574" name="l00574"></a><span class="lineno">  574</span>      <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
-<div class="line"><a id="l00575" name="l00575"></a><span class="lineno">  575</span> </div>
-<div class="line"><a id="l00576" name="l00576"></a><span class="lineno">  576</span>      U s = sl[0];</div>
-<div class="line"><a id="l00577" name="l00577"></a><span class="lineno">  577</span>      U b = bl[0];</div>
-<div class="line"><a id="l00578" name="l00578"></a><span class="lineno">  578</span>      result[row] += <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
-<div class="line"><a id="l00579" name="l00579"></a><span class="lineno">  579</span>    }</div>
+<div class="line"><a id="l00498" name="l00498"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">  498</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">load_unsafe</a>()<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00499" name="l00499"></a><span class="lineno">  499</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS &lt; tgp_size &amp;&amp; bi &gt;= BROWS) {</div>
+<div class="line"><a id="l00500" name="l00500"></a><span class="lineno">  500</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00501" name="l00501"></a><span class="lineno">  501</span>    }</div>
+<div class="line"><a id="l00502" name="l00502"></a><span class="lineno">  502</span> </div>
+<div class="line"><a id="l00503" name="l00503"></a><span class="lineno">  503</span>    T scale = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>;</div>
+<div class="line"><a id="l00504" name="l00504"></a><span class="lineno">  504</span>    T bias = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>;</div>
+<div class="line"><a id="l00505" name="l00505"></a><span class="lineno">  505</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a>; i++) {</div>
+<div class="line"><a id="l00506" name="l00506"></a><span class="lineno">  506</span>      <a class="code hl_function" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize&lt;T, pack_factor, bits&gt;</a>(</div>
+<div class="line"><a id="l00507" name="l00507"></a><span class="lineno">  507</span>          <a class="code hl_variable" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">src</a> + i * <a class="code hl_variable" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a>, scale, bias, dst + i * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>);</div>
+<div class="line"><a id="l00508" name="l00508"></a><span class="lineno">  508</span>    }</div>
+<div class="line"><a id="l00509" name="l00509"></a><span class="lineno">  509</span>  }</div>
+</div>
+<div class="line"><a id="l00510" name="l00510"></a><span class="lineno">  510</span> </div>
+<div class="foldopen" id="foldopen00511" data-start="{" data-end="}">
+<div class="line"><a id="l00511" name="l00511"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">  511</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">load_safe</a>(short2 src_tile_dim)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00512" name="l00512"></a><span class="lineno">  512</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS &lt; tgp_size &amp;&amp; bi &gt;= BROWS) {</div>
+<div class="line"><a id="l00513" name="l00513"></a><span class="lineno">  513</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00514" name="l00514"></a><span class="lineno">  514</span>    }</div>
+<div class="line"><a id="l00515" name="l00515"></a><span class="lineno">  515</span> </div>
+<div class="line"><a id="l00516" name="l00516"></a><span class="lineno">  516</span>    <span class="keywordflow">if</span> (reduction_dim == 1 &amp;&amp; <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> &gt;= src_tile_dim.y) {</div>
+<div class="line"><a id="l00517" name="l00517"></a><span class="lineno">  517</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>; i++) {</div>
+<div class="line"><a id="l00518" name="l00518"></a><span class="lineno">  518</span>        dst[i] = T(0);</div>
+<div class="line"><a id="l00519" name="l00519"></a><span class="lineno">  519</span>      }</div>
+<div class="line"><a id="l00520" name="l00520"></a><span class="lineno">  520</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00521" name="l00521"></a><span class="lineno">  521</span>    }</div>
+<div class="line"><a id="l00522" name="l00522"></a><span class="lineno">  522</span> </div>
+<div class="line"><a id="l00523" name="l00523"></a><span class="lineno">  523</span>    <span class="keywordflow">if</span> (reduction_dim == 0 &amp;&amp; <a class="code hl_variable" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a> &gt;= src_tile_dim.x) {</div>
+<div class="line"><a id="l00524" name="l00524"></a><span class="lineno">  524</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a> * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>; i++) {</div>
+<div class="line"><a id="l00525" name="l00525"></a><span class="lineno">  525</span>        dst[i] = T(0);</div>
+<div class="line"><a id="l00526" name="l00526"></a><span class="lineno">  526</span>      }</div>
+<div class="line"><a id="l00527" name="l00527"></a><span class="lineno">  527</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00528" name="l00528"></a><span class="lineno">  528</span>    }</div>
+<div class="line"><a id="l00529" name="l00529"></a><span class="lineno">  529</span> </div>
+<div class="line"><a id="l00530" name="l00530"></a><span class="lineno">  530</span>    T scale = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>;</div>
+<div class="line"><a id="l00531" name="l00531"></a><span class="lineno">  531</span>    T bias = *<a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>;</div>
+<div class="line"><a id="l00532" name="l00532"></a><span class="lineno">  532</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; <a class="code hl_variable" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a>; i++) {</div>
+<div class="line"><a id="l00533" name="l00533"></a><span class="lineno">  533</span>      <a class="code hl_function" href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize&lt;T, pack_factor, bits&gt;</a>(</div>
+<div class="line"><a id="l00534" name="l00534"></a><span class="lineno">  534</span>          (device uint8_t*)(<a class="code hl_variable" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">src</a> + i * <a class="code hl_variable" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a>),</div>
+<div class="line"><a id="l00535" name="l00535"></a><span class="lineno">  535</span>          scale,</div>
+<div class="line"><a id="l00536" name="l00536"></a><span class="lineno">  536</span>          bias,</div>
+<div class="line"><a id="l00537" name="l00537"></a><span class="lineno">  537</span>          dst + i * <a class="code hl_variable" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a>);</div>
+<div class="line"><a id="l00538" name="l00538"></a><span class="lineno">  538</span>    }</div>
+<div class="line"><a id="l00539" name="l00539"></a><span class="lineno">  539</span>  }</div>
+</div>
+<div class="line"><a id="l00540" name="l00540"></a><span class="lineno">  540</span> </div>
+<div class="foldopen" id="foldopen00541" data-start="{" data-end="}">
+<div class="line"><a id="l00541" name="l00541"></a><span class="lineno"><a class="line" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">  541</a></span>  <span class="keywordtype">void</span> <a class="code hl_function" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">next</a>() {</div>
+<div class="line"><a id="l00542" name="l00542"></a><span class="lineno">  542</span>    <a class="code hl_variable" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">src</a> += <a class="code hl_variable" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a>;</div>
+<div class="line"><a id="l00543" name="l00543"></a><span class="lineno">  543</span>    <span class="keywordflow">if</span> (reduction_dim == 1) {</div>
+<div class="line"><a id="l00544" name="l00544"></a><span class="lineno">  544</span>      <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a> &gt; 1) {</div>
+<div class="line"><a id="l00545" name="l00545"></a><span class="lineno">  545</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a>++;</div>
+<div class="line"><a id="l00546" name="l00546"></a><span class="lineno">  546</span>        <span class="keywordflow">if</span> (<a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a> == <a class="code hl_variable" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a>) {</div>
+<div class="line"><a id="l00547" name="l00547"></a><span class="lineno">  547</span>          <a class="code hl_variable" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a> = 0;</div>
+<div class="line"><a id="l00548" name="l00548"></a><span class="lineno">  548</span>          <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>++;</div>
+<div class="line"><a id="l00549" name="l00549"></a><span class="lineno">  549</span>          <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>++;</div>
+<div class="line"><a id="l00550" name="l00550"></a><span class="lineno">  550</span>        }</div>
+<div class="line"><a id="l00551" name="l00551"></a><span class="lineno">  551</span>      } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00552" name="l00552"></a><span class="lineno">  552</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a>++;</div>
+<div class="line"><a id="l00553" name="l00553"></a><span class="lineno">  553</span>        <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a>++;</div>
+<div class="line"><a id="l00554" name="l00554"></a><span class="lineno">  554</span>      }</div>
+<div class="line"><a id="l00555" name="l00555"></a><span class="lineno">  555</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00556" name="l00556"></a><span class="lineno">  556</span>      <a class="code hl_variable" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a> += <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>;</div>
+<div class="line"><a id="l00557" name="l00557"></a><span class="lineno">  557</span>      <a class="code hl_variable" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a> += <a class="code hl_variable" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a>;</div>
+<div class="line"><a id="l00558" name="l00558"></a><span class="lineno">  558</span>    }</div>
+<div class="line"><a id="l00559" name="l00559"></a><span class="lineno">  559</span>  }</div>
+</div>
+<div class="line"><a id="l00560" name="l00560"></a><span class="lineno">  560</span>};</div>
+</div>
+<div class="line"><a id="l00561" name="l00561"></a><span class="lineno">  561</span> </div>
+<div class="line"><a id="l00562" name="l00562"></a><span class="lineno">  562</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">int</span> D&gt;</div>
+<div class="foldopen" id="foldopen00563" data-start="{" data-end="}">
+<div class="line"><a id="l00563" name="l00563"></a><span class="lineno"><a class="line" href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">  563</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">qmv_quad_impl</a>(</div>
+<div class="line"><a id="l00564" name="l00564"></a><span class="lineno">  564</span>    <span class="keyword">const</span> device uint32_t* w,</div>
+<div class="line"><a id="l00565" name="l00565"></a><span class="lineno">  565</span>    <span class="keyword">const</span> device T* scales,</div>
+<div class="line"><a id="l00566" name="l00566"></a><span class="lineno">  566</span>    <span class="keyword">const</span> device T* biases,</div>
+<div class="line"><a id="l00567" name="l00567"></a><span class="lineno">  567</span>    <span class="keyword">const</span> device T* x,</div>
+<div class="line"><a id="l00568" name="l00568"></a><span class="lineno">  568</span>    device T* y,</div>
+<div class="line"><a id="l00569" name="l00569"></a><span class="lineno">  569</span>    constant <span class="keywordtype">int</span>&amp; in_vec_size,</div>
+<div class="line"><a id="l00570" name="l00570"></a><span class="lineno">  570</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size,</div>
+<div class="line"><a id="l00571" name="l00571"></a><span class="lineno">  571</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00572" name="l00572"></a><span class="lineno">  572</span>    uint quad_gid [[quadgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00573" name="l00573"></a><span class="lineno">  573</span>    uint quad_lid [[thread_index_in_quadgroup]]) {</div>
+<div class="line"><a id="l00574" name="l00574"></a><span class="lineno">  574</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> quads_per_simd = <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a> / <a class="code hl_variable" href="quantized_8h.html#a803e4d5a1459844ba647aea5b004e133">QUAD_SIZE</a>;</div>
+<div class="line"><a id="l00575" name="l00575"></a><span class="lineno">  575</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = 32 / bits;</div>
+<div class="line"><a id="l00576" name="l00576"></a><span class="lineno">  576</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_thread = D / <a class="code hl_variable" href="quantized_8h.html#a803e4d5a1459844ba647aea5b004e133">QUAD_SIZE</a>;</div>
+<div class="line"><a id="l00577" name="l00577"></a><span class="lineno">  577</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_thread = values_per_thread / pack_factor;</div>
+<div class="line"><a id="l00578" name="l00578"></a><span class="lineno">  578</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> scale_step_per_thread = group_size / values_per_thread;</div>
+<div class="line"><a id="l00579" name="l00579"></a><span class="lineno">  579</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> results_per_quadgroup = 8;</div>
 <div class="line"><a id="l00580" name="l00580"></a><span class="lineno">  580</span> </div>
-<div class="line"><a id="l00581" name="l00581"></a><span class="lineno">  581</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; out_row + row &lt; out_vec_size; row++) {</div>
-<div class="line"><a id="l00582" name="l00582"></a><span class="lineno">  582</span>      result[row] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[row]);</div>
-<div class="line"><a id="l00583" name="l00583"></a><span class="lineno">  583</span>      <span class="keywordflow">if</span> (simd_lid == 0) {</div>
-<div class="line"><a id="l00584" name="l00584"></a><span class="lineno">  584</span>        y[row] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
-<div class="line"><a id="l00585" name="l00585"></a><span class="lineno">  585</span>      }</div>
-<div class="line"><a id="l00586" name="l00586"></a><span class="lineno">  586</span>    }</div>
-<div class="line"><a id="l00587" name="l00587"></a><span class="lineno">  587</span>  }</div>
-<div class="line"><a id="l00588" name="l00588"></a><span class="lineno">  588</span> </div>
-<div class="line"><a id="l00589" name="l00589"></a><span class="lineno">  589</span>  <span class="comment">// In this case the last tile is moved back to redo some output values</span></div>
-<div class="line"><a id="l00590" name="l00590"></a><span class="lineno">  590</span>  <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00591" name="l00591"></a><span class="lineno">  591</span>    w += used_out_row * in_vec_size_w + simd_lid * packs_per_thread;</div>
-<div class="line"><a id="l00592" name="l00592"></a><span class="lineno">  592</span>    scales += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00593" name="l00593"></a><span class="lineno">  593</span>    biases += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
-<div class="line"><a id="l00594" name="l00594"></a><span class="lineno">  594</span>    x += tid.y * in_vec_size + simd_lid * values_per_thread;</div>
-<div class="line"><a id="l00595" name="l00595"></a><span class="lineno">  595</span>    y += tid.y * out_vec_size + used_out_row;</div>
+<div class="line"><a id="l00581" name="l00581"></a><span class="lineno">  581</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00582" name="l00582"></a><span class="lineno">  582</span> </div>
+<div class="line"><a id="l00583" name="l00583"></a><span class="lineno">  583</span>  thread U x_thread[values_per_thread];</div>
+<div class="line"><a id="l00584" name="l00584"></a><span class="lineno">  584</span>  thread U result[results_per_quadgroup] = {0};</div>
+<div class="line"><a id="l00585" name="l00585"></a><span class="lineno">  585</span> </div>
+<div class="line"><a id="l00586" name="l00586"></a><span class="lineno">  586</span>  <span class="comment">// Adjust positions</span></div>
+<div class="line"><a id="l00587" name="l00587"></a><span class="lineno">  587</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_w = in_vec_size / pack_factor;</div>
+<div class="line"><a id="l00588" name="l00588"></a><span class="lineno">  588</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_g = in_vec_size / group_size;</div>
+<div class="line"><a id="l00589" name="l00589"></a><span class="lineno">  589</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_row = tid.x * quads_per_simd * results_per_quadgroup + quad_gid;</div>
+<div class="line"><a id="l00590" name="l00590"></a><span class="lineno">  590</span> </div>
+<div class="line"><a id="l00591" name="l00591"></a><span class="lineno">  591</span>  w += out_row * in_vec_size_w + quad_lid * packs_per_thread;</div>
+<div class="line"><a id="l00592" name="l00592"></a><span class="lineno">  592</span>  scales += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00593" name="l00593"></a><span class="lineno">  593</span>  biases += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00594" name="l00594"></a><span class="lineno">  594</span>  x += tid.y * in_vec_size + quad_lid * values_per_thread;</div>
+<div class="line"><a id="l00595" name="l00595"></a><span class="lineno">  595</span>  y += tid.y * out_vec_size + out_row;</div>
 <div class="line"><a id="l00596" name="l00596"></a><span class="lineno">  596</span> </div>
-<div class="line"><a id="l00597" name="l00597"></a><span class="lineno">  597</span>    <span class="keywordtype">int</span> k = 0;</div>
-<div class="line"><a id="l00598" name="l00598"></a><span class="lineno">  598</span>    <span class="keywordflow">for</span> (; k &lt; in_vec_size - block_size; k += block_size) {</div>
-<div class="line"><a id="l00599" name="l00599"></a><span class="lineno">  599</span>      U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
-<div class="line"><a id="l00600" name="l00600"></a><span class="lineno">  600</span> </div>
-<div class="line"><a id="l00601" name="l00601"></a><span class="lineno">  601</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
-<div class="line"><a id="l00602" name="l00602"></a><span class="lineno">  602</span>        <span class="keyword">const</span> device uint8_t* wl =</div>
-<div class="line"><a id="l00603" name="l00603"></a><span class="lineno">  603</span>            (<span class="keyword">const</span> device uint8_t*)(w + row * in_vec_size_w);</div>
-<div class="line"><a id="l00604" name="l00604"></a><span class="lineno">  604</span>        <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
-<div class="line"><a id="l00605" name="l00605"></a><span class="lineno">  605</span>        <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
-<div class="line"><a id="l00606" name="l00606"></a><span class="lineno">  606</span> </div>
-<div class="line"><a id="l00607" name="l00607"></a><span class="lineno">  607</span>        U s = sl[0];</div>
-<div class="line"><a id="l00608" name="l00608"></a><span class="lineno">  608</span>        U b = bl[0];</div>
-<div class="line"><a id="l00609" name="l00609"></a><span class="lineno">  609</span>        result[row] +=</div>
-<div class="line"><a id="l00610" name="l00610"></a><span class="lineno">  610</span>            <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
-<div class="line"><a id="l00611" name="l00611"></a><span class="lineno">  611</span>      }</div>
-<div class="line"><a id="l00612" name="l00612"></a><span class="lineno">  612</span> </div>
-<div class="line"><a id="l00613" name="l00613"></a><span class="lineno">  613</span>      w += block_size / pack_factor;</div>
-<div class="line"><a id="l00614" name="l00614"></a><span class="lineno">  614</span>      scales += block_size / group_size;</div>
-<div class="line"><a id="l00615" name="l00615"></a><span class="lineno">  615</span>      biases += block_size / group_size;</div>
-<div class="line"><a id="l00616" name="l00616"></a><span class="lineno">  616</span>      x += block_size;</div>
-<div class="line"><a id="l00617" name="l00617"></a><span class="lineno">  617</span>    }</div>
-<div class="line"><a id="l00618" name="l00618"></a><span class="lineno">  618</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> remaining = clamp(</div>
-<div class="line"><a id="l00619" name="l00619"></a><span class="lineno">  619</span>        <span class="keyword">static_cast&lt;</span><span class="keywordtype">int</span><span class="keyword">&gt;</span>(in_vec_size - k - simd_lid * values_per_thread),</div>
-<div class="line"><a id="l00620" name="l00620"></a><span class="lineno">  620</span>        0,</div>
-<div class="line"><a id="l00621" name="l00621"></a><span class="lineno">  621</span>        values_per_thread);</div>
-<div class="line"><a id="l00622" name="l00622"></a><span class="lineno">  622</span>    U sum =</div>
-<div class="line"><a id="l00623" name="l00623"></a><span class="lineno">  623</span>        <a class="code hl_function" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread, remaining);</div>
-<div class="line"><a id="l00624" name="l00624"></a><span class="lineno">  624</span> </div>
-<div class="line"><a id="l00625" name="l00625"></a><span class="lineno">  625</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
-<div class="line"><a id="l00626" name="l00626"></a><span class="lineno">  626</span>      <span class="keyword">const</span> device uint8_t* wl =</div>
-<div class="line"><a id="l00627" name="l00627"></a><span class="lineno">  627</span>          (<span class="keyword">const</span> device uint8_t*)(w + row * in_vec_size_w);</div>
-<div class="line"><a id="l00628" name="l00628"></a><span class="lineno">  628</span>      <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
-<div class="line"><a id="l00629" name="l00629"></a><span class="lineno">  629</span>      <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
-<div class="line"><a id="l00630" name="l00630"></a><span class="lineno">  630</span> </div>
-<div class="line"><a id="l00631" name="l00631"></a><span class="lineno">  631</span>      U s = sl[0];</div>
-<div class="line"><a id="l00632" name="l00632"></a><span class="lineno">  632</span>      U b = bl[0];</div>
-<div class="line"><a id="l00633" name="l00633"></a><span class="lineno">  633</span>      result[row] += <a class="code hl_function" href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">qdot_safe&lt;U, values_per_thread, bits&gt;</a>(</div>
-<div class="line"><a id="l00634" name="l00634"></a><span class="lineno">  634</span>          wl, x_thread, s, b, sum, remaining);</div>
-<div class="line"><a id="l00635" name="l00635"></a><span class="lineno">  635</span>    }</div>
-<div class="line"><a id="l00636" name="l00636"></a><span class="lineno">  636</span> </div>
-<div class="line"><a id="l00637" name="l00637"></a><span class="lineno">  637</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
-<div class="line"><a id="l00638" name="l00638"></a><span class="lineno">  638</span>      result[row] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[row]);</div>
-<div class="line"><a id="l00639" name="l00639"></a><span class="lineno">  639</span>      <span class="keywordflow">if</span> (simd_lid == 0) {</div>
-<div class="line"><a id="l00640" name="l00640"></a><span class="lineno">  640</span>        y[row] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
-<div class="line"><a id="l00641" name="l00641"></a><span class="lineno">  641</span>      }</div>
-<div class="line"><a id="l00642" name="l00642"></a><span class="lineno">  642</span>    }</div>
-<div class="line"><a id="l00643" name="l00643"></a><span class="lineno">  643</span>  }</div>
-<div class="line"><a id="l00644" name="l00644"></a><span class="lineno">  644</span>}</div>
+<div class="line"><a id="l00597" name="l00597"></a><span class="lineno">  597</span>  U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
+<div class="line"><a id="l00598" name="l00598"></a><span class="lineno">  598</span> </div>
+<div class="line"><a id="l00599" name="l00599"></a><span class="lineno">  599</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_quadgroup; row++) {</div>
+<div class="line"><a id="l00600" name="l00600"></a><span class="lineno">  600</span>    <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)(w + row * in_vec_size_w * quads_per_simd);</div>
+<div class="line"><a id="l00601" name="l00601"></a><span class="lineno">  601</span>    <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g * quads_per_simd;</div>
+<div class="line"><a id="l00602" name="l00602"></a><span class="lineno">  602</span>    <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g * quads_per_simd;</div>
+<div class="line"><a id="l00603" name="l00603"></a><span class="lineno">  603</span> </div>
+<div class="line"><a id="l00604" name="l00604"></a><span class="lineno">  604</span>    U s = sl[0];</div>
+<div class="line"><a id="l00605" name="l00605"></a><span class="lineno">  605</span>    U b = bl[0];</div>
+<div class="line"><a id="l00606" name="l00606"></a><span class="lineno">  606</span>    <span class="keywordflow">if</span> (row * quads_per_simd + out_row &lt; out_vec_size) {</div>
+<div class="line"><a id="l00607" name="l00607"></a><span class="lineno">  607</span>      result[row] += <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
+<div class="line"><a id="l00608" name="l00608"></a><span class="lineno">  608</span>    }</div>
+<div class="line"><a id="l00609" name="l00609"></a><span class="lineno">  609</span>  }</div>
+<div class="line"><a id="l00610" name="l00610"></a><span class="lineno">  610</span> </div>
+<div class="line"><a id="l00611" name="l00611"></a><span class="lineno">  611</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_quadgroup; row++) {</div>
+<div class="line"><a id="l00612" name="l00612"></a><span class="lineno">  612</span>    result[row] = quad_sum(result[row]);</div>
+<div class="line"><a id="l00613" name="l00613"></a><span class="lineno">  613</span>    <span class="keywordflow">if</span> (quad_lid == 0 &amp;&amp; row * quads_per_simd + out_row &lt; out_vec_size) {</div>
+<div class="line"><a id="l00614" name="l00614"></a><span class="lineno">  614</span>      y[row * quads_per_simd] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
+<div class="line"><a id="l00615" name="l00615"></a><span class="lineno">  615</span>    }</div>
+<div class="line"><a id="l00616" name="l00616"></a><span class="lineno">  616</span>  }</div>
+<div class="line"><a id="l00617" name="l00617"></a><span class="lineno">  617</span>}</div>
 </div>
-<div class="line"><a id="l00645" name="l00645"></a><span class="lineno">  645</span> </div>
-<div class="line"><a id="l00646" name="l00646"></a><span class="lineno">  646</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen00647" data-start="{" data-end="}">
-<div class="line"><a id="l00647" name="l00647"></a><span class="lineno"><a class="line" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">  647</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl</a>(</div>
-<div class="line"><a id="l00648" name="l00648"></a><span class="lineno">  648</span>    <span class="keyword">const</span> device uint32_t* w,</div>
-<div class="line"><a id="l00649" name="l00649"></a><span class="lineno">  649</span>    <span class="keyword">const</span> device T* scales,</div>
-<div class="line"><a id="l00650" name="l00650"></a><span class="lineno">  650</span>    <span class="keyword">const</span> device T* biases,</div>
-<div class="line"><a id="l00651" name="l00651"></a><span class="lineno">  651</span>    <span class="keyword">const</span> device T* x,</div>
-<div class="line"><a id="l00652" name="l00652"></a><span class="lineno">  652</span>    device T* y,</div>
-<div class="line"><a id="l00653" name="l00653"></a><span class="lineno">  653</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size,</div>
-<div class="line"><a id="l00654" name="l00654"></a><span class="lineno">  654</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> out_vec_size,</div>
-<div class="line"><a id="l00655" name="l00655"></a><span class="lineno">  655</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00656" name="l00656"></a><span class="lineno">  656</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l00657" name="l00657"></a><span class="lineno">  657</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l00658" name="l00658"></a><span class="lineno">  658</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> num_simdgroups = 2;</div>
-<div class="line"><a id="l00659" name="l00659"></a><span class="lineno">  659</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = 32 / bits;</div>
-<div class="line"><a id="l00660" name="l00660"></a><span class="lineno">  660</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> tn = 32 / pack_factor;</div>
-<div class="line"><a id="l00661" name="l00661"></a><span class="lineno">  661</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> blocksize = <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>;</div>
+<div class="line"><a id="l00618" name="l00618"></a><span class="lineno">  618</span> </div>
+<div class="line"><a id="l00619" name="l00619"></a><span class="lineno">  619</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen00620" data-start="{" data-end="}">
+<div class="line"><a id="l00620" name="l00620"></a><span class="lineno"><a class="line" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">  620</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl</a>(</div>
+<div class="line"><a id="l00621" name="l00621"></a><span class="lineno">  621</span>    <span class="keyword">const</span> device uint32_t* w,</div>
+<div class="line"><a id="l00622" name="l00622"></a><span class="lineno">  622</span>    <span class="keyword">const</span> device T* scales,</div>
+<div class="line"><a id="l00623" name="l00623"></a><span class="lineno">  623</span>    <span class="keyword">const</span> device T* biases,</div>
+<div class="line"><a id="l00624" name="l00624"></a><span class="lineno">  624</span>    <span class="keyword">const</span> device T* x,</div>
+<div class="line"><a id="l00625" name="l00625"></a><span class="lineno">  625</span>    device T* y,</div>
+<div class="line"><a id="l00626" name="l00626"></a><span class="lineno">  626</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size,</div>
+<div class="line"><a id="l00627" name="l00627"></a><span class="lineno">  627</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size,</div>
+<div class="line"><a id="l00628" name="l00628"></a><span class="lineno">  628</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00629" name="l00629"></a><span class="lineno">  629</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00630" name="l00630"></a><span class="lineno">  630</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00631" name="l00631"></a><span class="lineno">  631</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> power_of_2_bits = (bits &amp; (bits - 1)) == 0;</div>
+<div class="line"><a id="l00632" name="l00632"></a><span class="lineno">  632</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_thread = bits == 2 ? 1 : 2;</div>
+<div class="line"><a id="l00633" name="l00633"></a><span class="lineno">  633</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> num_simdgroups = 2;</div>
+<div class="line"><a id="l00634" name="l00634"></a><span class="lineno">  634</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> results_per_simdgroup = 4;</div>
+<div class="line"><a id="l00635" name="l00635"></a><span class="lineno">  635</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;</div>
+<div class="line"><a id="l00636" name="l00636"></a><span class="lineno">  636</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> bytes_per_pack = power_of_2_bits ? 4 : 3;</div>
+<div class="line"><a id="l00637" name="l00637"></a><span class="lineno">  637</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_thread = pack_factor * packs_per_thread;</div>
+<div class="line"><a id="l00638" name="l00638"></a><span class="lineno">  638</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> block_size = values_per_thread * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>;</div>
+<div class="line"><a id="l00639" name="l00639"></a><span class="lineno">  639</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> scale_step_per_thread = group_size / values_per_thread;</div>
+<div class="line"><a id="l00640" name="l00640"></a><span class="lineno">  640</span> </div>
+<div class="line"><a id="l00641" name="l00641"></a><span class="lineno">  641</span>  <span class="keyword">const</span> device uint8_t* ws = (<span class="keyword">const</span> device uint8_t*)w;</div>
+<div class="line"><a id="l00642" name="l00642"></a><span class="lineno">  642</span> </div>
+<div class="line"><a id="l00643" name="l00643"></a><span class="lineno">  643</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00644" name="l00644"></a><span class="lineno">  644</span> </div>
+<div class="line"><a id="l00645" name="l00645"></a><span class="lineno">  645</span>  thread U x_thread[values_per_thread];</div>
+<div class="line"><a id="l00646" name="l00646"></a><span class="lineno">  646</span>  thread U result[results_per_simdgroup] = {0};</div>
+<div class="line"><a id="l00647" name="l00647"></a><span class="lineno">  647</span> </div>
+<div class="line"><a id="l00648" name="l00648"></a><span class="lineno">  648</span>  <span class="comment">// Adjust positions</span></div>
+<div class="line"><a id="l00649" name="l00649"></a><span class="lineno">  649</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l00650" name="l00650"></a><span class="lineno">  650</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_g = in_vec_size / group_size;</div>
+<div class="line"><a id="l00651" name="l00651"></a><span class="lineno">  651</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_row = tid.x * (num_simdgroups * results_per_simdgroup) +</div>
+<div class="line"><a id="l00652" name="l00652"></a><span class="lineno">  652</span>      simd_gid * results_per_simdgroup;</div>
+<div class="line"><a id="l00653" name="l00653"></a><span class="lineno">  653</span> </div>
+<div class="line"><a id="l00654" name="l00654"></a><span class="lineno">  654</span>  ws += out_row * in_vec_size_w + simd_lid * packs_per_thread * bytes_per_pack;</div>
+<div class="line"><a id="l00655" name="l00655"></a><span class="lineno">  655</span>  scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00656" name="l00656"></a><span class="lineno">  656</span>  biases += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00657" name="l00657"></a><span class="lineno">  657</span>  x += tid.y * in_vec_size + simd_lid * values_per_thread;</div>
+<div class="line"><a id="l00658" name="l00658"></a><span class="lineno">  658</span>  y += tid.y * out_vec_size + out_row;</div>
+<div class="line"><a id="l00659" name="l00659"></a><span class="lineno">  659</span> </div>
+<div class="line"><a id="l00660" name="l00660"></a><span class="lineno">  660</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; in_vec_size; k += block_size) {</div>
+<div class="line"><a id="l00661" name="l00661"></a><span class="lineno">  661</span>    U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
 <div class="line"><a id="l00662" name="l00662"></a><span class="lineno">  662</span> </div>
-<div class="line"><a id="l00663" name="l00663"></a><span class="lineno">  663</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
-<div class="line"><a id="l00664" name="l00664"></a><span class="lineno">  664</span>  <span class="keyword">typedef</span> <span class="keyword">struct </span>{</div>
-<div class="line"><a id="l00665" name="l00665"></a><span class="lineno">  665</span>    uint32_t wi[tn];</div>
-<div class="line"><a id="l00666" name="l00666"></a><span class="lineno">  666</span>  } vec_w;</div>
+<div class="line"><a id="l00663" name="l00663"></a><span class="lineno">  663</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
+<div class="line"><a id="l00664" name="l00664"></a><span class="lineno">  664</span>      <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)(ws + row * in_vec_size_w);</div>
+<div class="line"><a id="l00665" name="l00665"></a><span class="lineno">  665</span>      <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
+<div class="line"><a id="l00666" name="l00666"></a><span class="lineno">  666</span>      <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
 <div class="line"><a id="l00667" name="l00667"></a><span class="lineno">  667</span> </div>
-<div class="line"><a id="l00668" name="l00668"></a><span class="lineno">  668</span>  thread vec_w w_local;</div>
-<div class="line"><a id="l00669" name="l00669"></a><span class="lineno">  669</span>  thread U result[tn * pack_factor] = {0};</div>
-<div class="line"><a id="l00670" name="l00670"></a><span class="lineno">  670</span>  thread U scale = 1;</div>
-<div class="line"><a id="l00671" name="l00671"></a><span class="lineno">  671</span>  thread U bias = 0;</div>
-<div class="line"><a id="l00672" name="l00672"></a><span class="lineno">  672</span>  thread U x_local = 0;</div>
-<div class="line"><a id="l00673" name="l00673"></a><span class="lineno">  673</span> </div>
-<div class="line"><a id="l00674" name="l00674"></a><span class="lineno">  674</span>  <span class="comment">// Adjust positions</span></div>
-<div class="line"><a id="l00675" name="l00675"></a><span class="lineno">  675</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_vec_size_w = out_vec_size / pack_factor;</div>
-<div class="line"><a id="l00676" name="l00676"></a><span class="lineno">  676</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_vec_size_g = out_vec_size / group_size;</div>
-<div class="line"><a id="l00677" name="l00677"></a><span class="lineno">  677</span>  <span class="keywordtype">int</span> out_col =</div>
-<div class="line"><a id="l00678" name="l00678"></a><span class="lineno">  678</span>      tid.x * (num_simdgroups * pack_factor * tn) + simd_gid * pack_factor * tn;</div>
-<div class="line"><a id="l00679" name="l00679"></a><span class="lineno">  679</span>  w += out_col / pack_factor + simd_lid * out_vec_size_w;</div>
-<div class="line"><a id="l00680" name="l00680"></a><span class="lineno">  680</span>  scales += out_col / group_size + simd_lid * out_vec_size_g;</div>
-<div class="line"><a id="l00681" name="l00681"></a><span class="lineno">  681</span>  biases += out_col / group_size + simd_lid * out_vec_size_g;</div>
-<div class="line"><a id="l00682" name="l00682"></a><span class="lineno">  682</span>  x += tid.y * in_vec_size + simd_lid;</div>
-<div class="line"><a id="l00683" name="l00683"></a><span class="lineno">  683</span>  y += tid.y * out_vec_size + out_col;</div>
-<div class="line"><a id="l00684" name="l00684"></a><span class="lineno">  684</span> </div>
-<div class="line"><a id="l00685" name="l00685"></a><span class="lineno">  685</span>  <span class="keywordflow">if</span> (out_col &gt;= out_vec_size) {</div>
-<div class="line"><a id="l00686" name="l00686"></a><span class="lineno">  686</span>    <span class="keywordflow">return</span>;</div>
-<div class="line"><a id="l00687" name="l00687"></a><span class="lineno">  687</span>  }</div>
-<div class="line"><a id="l00688" name="l00688"></a><span class="lineno">  688</span> </div>
-<div class="line"><a id="l00689" name="l00689"></a><span class="lineno">  689</span>  <span class="comment">// Loop over in_vec in blocks of blocksize</span></div>
-<div class="line"><a id="l00690" name="l00690"></a><span class="lineno">  690</span>  <span class="keywordtype">int</span> remaining = in_vec_size % blocksize;</div>
-<div class="line"><a id="l00691" name="l00691"></a><span class="lineno">  691</span>  <span class="keywordflow">if</span> (remaining == 0) {</div>
-<div class="line"><a id="l00692" name="l00692"></a><span class="lineno">  692</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; in_vec_size; i += blocksize) {</div>
-<div class="line"><a id="l00693" name="l00693"></a><span class="lineno">  693</span>      x_local = *x;</div>
-<div class="line"><a id="l00694" name="l00694"></a><span class="lineno">  694</span>      scale = *scales;</div>
-<div class="line"><a id="l00695" name="l00695"></a><span class="lineno">  695</span>      bias = *biases;</div>
-<div class="line"><a id="l00696" name="l00696"></a><span class="lineno">  696</span>      w_local = *((device vec_w*)w);</div>
-<div class="line"><a id="l00697" name="l00697"></a><span class="lineno">  697</span> </div>
-<div class="line"><a id="l00698" name="l00698"></a><span class="lineno">  698</span>      <a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter&lt;U, tn * pack_factor, bits&gt;</a>(</div>
-<div class="line"><a id="l00699" name="l00699"></a><span class="lineno">  699</span>          (thread uint8_t*)&amp;w_local, x_local, scale, bias, result);</div>
-<div class="line"><a id="l00700" name="l00700"></a><span class="lineno">  700</span> </div>
-<div class="line"><a id="l00701" name="l00701"></a><span class="lineno">  701</span>      x += blocksize;</div>
-<div class="line"><a id="l00702" name="l00702"></a><span class="lineno">  702</span>      scales += blocksize * out_vec_size_g;</div>
-<div class="line"><a id="l00703" name="l00703"></a><span class="lineno">  703</span>      biases += blocksize * out_vec_size_g;</div>
-<div class="line"><a id="l00704" name="l00704"></a><span class="lineno">  704</span>      w += blocksize * out_vec_size_w;</div>
-<div class="line"><a id="l00705" name="l00705"></a><span class="lineno">  705</span>    }</div>
-<div class="line"><a id="l00706" name="l00706"></a><span class="lineno">  706</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00707" name="l00707"></a><span class="lineno">  707</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = blocksize; i &lt; in_vec_size; i += blocksize) {</div>
-<div class="line"><a id="l00708" name="l00708"></a><span class="lineno">  708</span>      x_local = *x;</div>
-<div class="line"><a id="l00709" name="l00709"></a><span class="lineno">  709</span>      scale = *scales;</div>
-<div class="line"><a id="l00710" name="l00710"></a><span class="lineno">  710</span>      bias = *biases;</div>
-<div class="line"><a id="l00711" name="l00711"></a><span class="lineno">  711</span>      w_local = *((device vec_w*)w);</div>
-<div class="line"><a id="l00712" name="l00712"></a><span class="lineno">  712</span> </div>
-<div class="line"><a id="l00713" name="l00713"></a><span class="lineno">  713</span>      <a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter&lt;U, tn * pack_factor, bits&gt;</a>(</div>
-<div class="line"><a id="l00714" name="l00714"></a><span class="lineno">  714</span>          (thread uint8_t*)&amp;w_local, x_local, scale, bias, result);</div>
-<div class="line"><a id="l00715" name="l00715"></a><span class="lineno">  715</span> </div>
-<div class="line"><a id="l00716" name="l00716"></a><span class="lineno">  716</span>      x += blocksize;</div>
-<div class="line"><a id="l00717" name="l00717"></a><span class="lineno">  717</span>      scales += blocksize * out_vec_size_g;</div>
-<div class="line"><a id="l00718" name="l00718"></a><span class="lineno">  718</span>      biases += blocksize * out_vec_size_g;</div>
-<div class="line"><a id="l00719" name="l00719"></a><span class="lineno">  719</span>      w += blocksize * out_vec_size_w;</div>
-<div class="line"><a id="l00720" name="l00720"></a><span class="lineno">  720</span>    }</div>
-<div class="line"><a id="l00721" name="l00721"></a><span class="lineno">  721</span>    <span class="keywordflow">if</span> (<span class="keyword">static_cast&lt;</span><span class="keywordtype">int</span><span class="keyword">&gt;</span>(simd_lid) &lt; remaining) {</div>
-<div class="line"><a id="l00722" name="l00722"></a><span class="lineno">  722</span>      x_local = *x;</div>
-<div class="line"><a id="l00723" name="l00723"></a><span class="lineno">  723</span>      scale = *scales;</div>
-<div class="line"><a id="l00724" name="l00724"></a><span class="lineno">  724</span>      bias = *biases;</div>
-<div class="line"><a id="l00725" name="l00725"></a><span class="lineno">  725</span>      w_local = *((device vec_w*)w);</div>
-<div class="line"><a id="l00726" name="l00726"></a><span class="lineno">  726</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00727" name="l00727"></a><span class="lineno">  727</span>      x_local = 0;</div>
-<div class="line"><a id="l00728" name="l00728"></a><span class="lineno">  728</span>      scale = 0;</div>
-<div class="line"><a id="l00729" name="l00729"></a><span class="lineno">  729</span>      bias = 0;</div>
-<div class="line"><a id="l00730" name="l00730"></a><span class="lineno">  730</span>    }</div>
-<div class="line"><a id="l00731" name="l00731"></a><span class="lineno">  731</span>    <a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter&lt;U, tn * pack_factor, bits&gt;</a>(</div>
-<div class="line"><a id="l00732" name="l00732"></a><span class="lineno">  732</span>        (thread uint8_t*)&amp;w_local, x_local, scale, bias, result);</div>
-<div class="line"><a id="l00733" name="l00733"></a><span class="lineno">  733</span>  }</div>
-<div class="line"><a id="l00734" name="l00734"></a><span class="lineno">  734</span> </div>
-<div class="line"><a id="l00735" name="l00735"></a><span class="lineno">  735</span><span class="comment">// Accumulate in the simdgroup</span></div>
-<div class="line"><a id="l00736" name="l00736"></a><span class="lineno">  736</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
-<div class="line"><a id="l00737" name="l00737"></a><span class="lineno">  737</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; tn * pack_factor; k++) {</div>
-<div class="line"><a id="l00738" name="l00738"></a><span class="lineno">  738</span>    result[k] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[k]);</div>
-<div class="line"><a id="l00739" name="l00739"></a><span class="lineno">  739</span>  }</div>
-<div class="line"><a id="l00740" name="l00740"></a><span class="lineno">  740</span> </div>
-<div class="line"><a id="l00741" name="l00741"></a><span class="lineno">  741</span>  <span class="comment">// Store the result</span></div>
-<div class="line"><a id="l00742" name="l00742"></a><span class="lineno">  742</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
-<div class="line"><a id="l00743" name="l00743"></a><span class="lineno">  743</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
-<div class="line"><a id="l00744" name="l00744"></a><span class="lineno">  744</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; tn * pack_factor; k++) {</div>
-<div class="line"><a id="l00745" name="l00745"></a><span class="lineno">  745</span>      y[k] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[k]);</div>
-<div class="line"><a id="l00746" name="l00746"></a><span class="lineno">  746</span>    }</div>
-<div class="line"><a id="l00747" name="l00747"></a><span class="lineno">  747</span>  }</div>
-<div class="line"><a id="l00748" name="l00748"></a><span class="lineno">  748</span>}</div>
+<div class="line"><a id="l00668" name="l00668"></a><span class="lineno">  668</span>      U s = sl[0];</div>
+<div class="line"><a id="l00669" name="l00669"></a><span class="lineno">  669</span>      U b = bl[0];</div>
+<div class="line"><a id="l00670" name="l00670"></a><span class="lineno">  670</span>      result[row] += <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
+<div class="line"><a id="l00671" name="l00671"></a><span class="lineno">  671</span>    }</div>
+<div class="line"><a id="l00672" name="l00672"></a><span class="lineno">  672</span> </div>
+<div class="line"><a id="l00673" name="l00673"></a><span class="lineno">  673</span>    ws += block_size * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l00674" name="l00674"></a><span class="lineno">  674</span>    scales += block_size / group_size;</div>
+<div class="line"><a id="l00675" name="l00675"></a><span class="lineno">  675</span>    biases += block_size / group_size;</div>
+<div class="line"><a id="l00676" name="l00676"></a><span class="lineno">  676</span>    x += block_size;</div>
+<div class="line"><a id="l00677" name="l00677"></a><span class="lineno">  677</span>  }</div>
+<div class="line"><a id="l00678" name="l00678"></a><span class="lineno">  678</span> </div>
+<div class="line"><a id="l00679" name="l00679"></a><span class="lineno">  679</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
+<div class="line"><a id="l00680" name="l00680"></a><span class="lineno">  680</span>    result[row] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[row]);</div>
+<div class="line"><a id="l00681" name="l00681"></a><span class="lineno">  681</span>    <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00682" name="l00682"></a><span class="lineno">  682</span>      y[row] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
+<div class="line"><a id="l00683" name="l00683"></a><span class="lineno">  683</span>    }</div>
+<div class="line"><a id="l00684" name="l00684"></a><span class="lineno">  684</span>  }</div>
+<div class="line"><a id="l00685" name="l00685"></a><span class="lineno">  685</span>}</div>
 </div>
-<div class="line"><a id="l00749" name="l00749"></a><span class="lineno">  749</span> </div>
-<div class="line"><a id="l00750" name="l00750"></a><span class="lineno">  750</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00751" name="l00751"></a><span class="lineno">  751</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00752" name="l00752"></a><span class="lineno">  752</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
-<div class="line"><a id="l00753" name="l00753"></a><span class="lineno">  753</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
-<div class="line"><a id="l00754" name="l00754"></a><span class="lineno">  754</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> aligned_N,</div>
-<div class="line"><a id="l00755" name="l00755"></a><span class="lineno">  755</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
-<div class="line"><a id="l00756" name="l00756"></a><span class="lineno">  756</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
-<div class="line"><a id="l00757" name="l00757"></a><span class="lineno">  757</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
-<div class="foldopen" id="foldopen00758" data-start="{" data-end="}">
-<div class="line"><a id="l00758" name="l00758"></a><span class="lineno"><a class="line" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">  758</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl</a>(</div>
-<div class="line"><a id="l00759" name="l00759"></a><span class="lineno">  759</span>    <span class="keyword">const</span> device uint32_t* w,</div>
-<div class="line"><a id="l00760" name="l00760"></a><span class="lineno">  760</span>    <span class="keyword">const</span> device T* scales,</div>
-<div class="line"><a id="l00761" name="l00761"></a><span class="lineno">  761</span>    <span class="keyword">const</span> device T* biases,</div>
-<div class="line"><a id="l00762" name="l00762"></a><span class="lineno">  762</span>    <span class="keyword">const</span> device T* x,</div>
-<div class="line"><a id="l00763" name="l00763"></a><span class="lineno">  763</span>    device T* y,</div>
-<div class="line"><a id="l00764" name="l00764"></a><span class="lineno">  764</span>    threadgroup T* Xs,</div>
-<div class="line"><a id="l00765" name="l00765"></a><span class="lineno">  765</span>    threadgroup T* Ws,</div>
-<div class="line"><a id="l00766" name="l00766"></a><span class="lineno">  766</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K,</div>
-<div class="line"><a id="l00767" name="l00767"></a><span class="lineno">  767</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N,</div>
-<div class="line"><a id="l00768" name="l00768"></a><span class="lineno">  768</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M,</div>
-<div class="line"><a id="l00769" name="l00769"></a><span class="lineno">  769</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00770" name="l00770"></a><span class="lineno">  770</span>    uint lid [[thread_index_in_threadgroup]],</div>
-<div class="line"><a id="l00771" name="l00771"></a><span class="lineno">  771</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l00772" name="l00772"></a><span class="lineno">  772</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l00773" name="l00773"></a><span class="lineno">  773</span>  <span class="keyword">static_assert</span>(BK &gt;= <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>, <span class="stringliteral">&quot;BK should be larger than SIMD_SIZE&quot;</span>);</div>
-<div class="line"><a id="l00774" name="l00774"></a><span class="lineno">  774</span>  <span class="keyword">static_assert</span>(BK % <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a> == 0, <span class="stringliteral">&quot;BK should be divisible by SIMD_SIZE&quot;</span>);</div>
-<div class="line"><a id="l00775" name="l00775"></a><span class="lineno">  775</span> </div>
-<div class="line"><a id="l00776" name="l00776"></a><span class="lineno">  776</span>  (void)lid;</div>
-<div class="line"><a id="l00777" name="l00777"></a><span class="lineno">  777</span> </div>
-<div class="line"><a id="l00778" name="l00778"></a><span class="lineno">  778</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WM = 2;</div>
-<div class="line"><a id="l00779" name="l00779"></a><span class="lineno">  779</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WN = 2;</div>
-<div class="line"><a id="l00780" name="l00780"></a><span class="lineno">  780</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = 32 / bits;</div>
-<div class="line"><a id="l00781" name="l00781"></a><span class="lineno">  781</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l00782" name="l00782"></a><span class="lineno">  782</span> </div>
-<div class="line"><a id="l00783" name="l00783"></a><span class="lineno">  783</span>  <span class="comment">// Instantiate the appropriate BlockMMA and Loader</span></div>
-<div class="line"><a id="l00784" name="l00784"></a><span class="lineno">  784</span>  <span class="keyword">using </span>mma_t = mlx::steel::</div>
-<div class="line"><a id="l00785" name="l00785"></a><span class="lineno">  785</span>      BlockMMA&lt;T, T, BM, BN, BK, WM, WN, false, true, BK_padded, BK_padded&gt;;</div>
-<div class="line"><a id="l00786" name="l00786"></a><span class="lineno">  786</span>  <span class="keyword">using </span>loader_x_t =</div>
-<div class="line"><a id="l00787" name="l00787"></a><span class="lineno">  787</span>      <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt;T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE&gt;</a>;</div>
-<div class="line"><a id="l00788" name="l00788"></a><span class="lineno">  788</span>  <span class="keyword">using </span>loader_w_t = <a class="code hl_struct" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt;</div>
-<div class="line"><a id="l00789" name="l00789"></a><span class="lineno">  789</span>      T,</div>
-<div class="line"><a id="l00790" name="l00790"></a><span class="lineno">  790</span>      BN,</div>
-<div class="line"><a id="l00791" name="l00791"></a><span class="lineno">  791</span>      BK,</div>
-<div class="line"><a id="l00792" name="l00792"></a><span class="lineno">  792</span>      BK_padded,</div>
-<div class="line"><a id="l00793" name="l00793"></a><span class="lineno">  793</span>      1,</div>
-<div class="line"><a id="l00794" name="l00794"></a><span class="lineno">  794</span>      WM * WN * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>,</div>
-<div class="line"><a id="l00795" name="l00795"></a><span class="lineno">  795</span>      group_size,</div>
-<div class="line"><a id="l00796" name="l00796"></a><span class="lineno">  796</span>      bits&gt;;</div>
+<div class="line"><a id="l00686" name="l00686"></a><span class="lineno">  686</span> </div>
+<div class="line"><a id="l00687" name="l00687"></a><span class="lineno">  687</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen00688" data-start="{" data-end="}">
+<div class="line"><a id="l00688" name="l00688"></a><span class="lineno"><a class="line" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">  688</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl</a>(</div>
+<div class="line"><a id="l00689" name="l00689"></a><span class="lineno">  689</span>    <span class="keyword">const</span> device uint32_t* w,</div>
+<div class="line"><a id="l00690" name="l00690"></a><span class="lineno">  690</span>    <span class="keyword">const</span> device T* scales,</div>
+<div class="line"><a id="l00691" name="l00691"></a><span class="lineno">  691</span>    <span class="keyword">const</span> device T* biases,</div>
+<div class="line"><a id="l00692" name="l00692"></a><span class="lineno">  692</span>    <span class="keyword">const</span> device T* x,</div>
+<div class="line"><a id="l00693" name="l00693"></a><span class="lineno">  693</span>    device T* y,</div>
+<div class="line"><a id="l00694" name="l00694"></a><span class="lineno">  694</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size,</div>
+<div class="line"><a id="l00695" name="l00695"></a><span class="lineno">  695</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size,</div>
+<div class="line"><a id="l00696" name="l00696"></a><span class="lineno">  696</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00697" name="l00697"></a><span class="lineno">  697</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00698" name="l00698"></a><span class="lineno">  698</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00699" name="l00699"></a><span class="lineno">  699</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> power_of_2_bits = (bits &amp; (bits - 1)) == 0;</div>
+<div class="line"><a id="l00700" name="l00700"></a><span class="lineno">  700</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> num_simdgroups = 2;</div>
+<div class="line"><a id="l00701" name="l00701"></a><span class="lineno">  701</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> results_per_simdgroup = 4;</div>
+<div class="line"><a id="l00702" name="l00702"></a><span class="lineno">  702</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_thread = 1;</div>
+<div class="line"><a id="l00703" name="l00703"></a><span class="lineno">  703</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;</div>
+<div class="line"><a id="l00704" name="l00704"></a><span class="lineno">  704</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> bytes_per_pack = power_of_2_bits ? 4 : 3;</div>
+<div class="line"><a id="l00705" name="l00705"></a><span class="lineno">  705</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_thread = pack_factor * packs_per_thread;</div>
+<div class="line"><a id="l00706" name="l00706"></a><span class="lineno">  706</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> block_size = values_per_thread * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>;</div>
+<div class="line"><a id="l00707" name="l00707"></a><span class="lineno">  707</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> scale_step_per_thread = group_size / values_per_thread;</div>
+<div class="line"><a id="l00708" name="l00708"></a><span class="lineno">  708</span> </div>
+<div class="line"><a id="l00709" name="l00709"></a><span class="lineno">  709</span>  <span class="keyword">const</span> device uint8_t* ws = (<span class="keyword">const</span> device uint8_t*)w;</div>
+<div class="line"><a id="l00710" name="l00710"></a><span class="lineno">  710</span> </div>
+<div class="line"><a id="l00711" name="l00711"></a><span class="lineno">  711</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00712" name="l00712"></a><span class="lineno">  712</span> </div>
+<div class="line"><a id="l00713" name="l00713"></a><span class="lineno">  713</span>  thread U x_thread[values_per_thread];</div>
+<div class="line"><a id="l00714" name="l00714"></a><span class="lineno">  714</span>  thread U result[results_per_simdgroup] = {0};</div>
+<div class="line"><a id="l00715" name="l00715"></a><span class="lineno">  715</span> </div>
+<div class="line"><a id="l00716" name="l00716"></a><span class="lineno">  716</span>  <span class="comment">// Adjust positions</span></div>
+<div class="line"><a id="l00717" name="l00717"></a><span class="lineno">  717</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l00718" name="l00718"></a><span class="lineno">  718</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size_g = in_vec_size / group_size;</div>
+<div class="line"><a id="l00719" name="l00719"></a><span class="lineno">  719</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_row = tid.x * (num_simdgroups * results_per_simdgroup) +</div>
+<div class="line"><a id="l00720" name="l00720"></a><span class="lineno">  720</span>      simd_gid * results_per_simdgroup;</div>
+<div class="line"><a id="l00721" name="l00721"></a><span class="lineno">  721</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> used_out_row = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(out_vec_size - results_per_simdgroup, out_row);</div>
+<div class="line"><a id="l00722" name="l00722"></a><span class="lineno">  722</span> </div>
+<div class="line"><a id="l00723" name="l00723"></a><span class="lineno">  723</span>  <span class="keywordflow">if</span> (out_row &gt;= out_vec_size) {</div>
+<div class="line"><a id="l00724" name="l00724"></a><span class="lineno">  724</span>    <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00725" name="l00725"></a><span class="lineno">  725</span>  }</div>
+<div class="line"><a id="l00726" name="l00726"></a><span class="lineno">  726</span> </div>
+<div class="line"><a id="l00727" name="l00727"></a><span class="lineno">  727</span>  <span class="comment">// In this case we need to properly guard all our reads because there isn&#39;t</span></div>
+<div class="line"><a id="l00728" name="l00728"></a><span class="lineno">  728</span>  <span class="comment">// even 1 tile in the matrix</span></div>
+<div class="line"><a id="l00729" name="l00729"></a><span class="lineno">  729</span>  <span class="keywordflow">if</span> (out_vec_size &lt; (num_simdgroups * results_per_simdgroup)) {</div>
+<div class="line"><a id="l00730" name="l00730"></a><span class="lineno">  730</span>    ws +=</div>
+<div class="line"><a id="l00731" name="l00731"></a><span class="lineno">  731</span>        out_row * in_vec_size_w + simd_lid * packs_per_thread * bytes_per_pack;</div>
+<div class="line"><a id="l00732" name="l00732"></a><span class="lineno">  732</span>    scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00733" name="l00733"></a><span class="lineno">  733</span>    biases += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00734" name="l00734"></a><span class="lineno">  734</span>    x += tid.y * in_vec_size + simd_lid * values_per_thread;</div>
+<div class="line"><a id="l00735" name="l00735"></a><span class="lineno">  735</span>    y += tid.y * out_vec_size + out_row;</div>
+<div class="line"><a id="l00736" name="l00736"></a><span class="lineno">  736</span> </div>
+<div class="line"><a id="l00737" name="l00737"></a><span class="lineno">  737</span>    <span class="keywordtype">int</span> k = 0;</div>
+<div class="line"><a id="l00738" name="l00738"></a><span class="lineno">  738</span>    <span class="keywordflow">for</span> (; k &lt; in_vec_size - block_size; k += block_size) {</div>
+<div class="line"><a id="l00739" name="l00739"></a><span class="lineno">  739</span>      U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
+<div class="line"><a id="l00740" name="l00740"></a><span class="lineno">  740</span> </div>
+<div class="line"><a id="l00741" name="l00741"></a><span class="lineno">  741</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; out_row + row &lt; out_vec_size; row++) {</div>
+<div class="line"><a id="l00742" name="l00742"></a><span class="lineno">  742</span>        <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)(ws + row * in_vec_size_w);</div>
+<div class="line"><a id="l00743" name="l00743"></a><span class="lineno">  743</span>        <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
+<div class="line"><a id="l00744" name="l00744"></a><span class="lineno">  744</span>        <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
+<div class="line"><a id="l00745" name="l00745"></a><span class="lineno">  745</span> </div>
+<div class="line"><a id="l00746" name="l00746"></a><span class="lineno">  746</span>        U s = sl[0];</div>
+<div class="line"><a id="l00747" name="l00747"></a><span class="lineno">  747</span>        U b = bl[0];</div>
+<div class="line"><a id="l00748" name="l00748"></a><span class="lineno">  748</span>        result[row] +=</div>
+<div class="line"><a id="l00749" name="l00749"></a><span class="lineno">  749</span>            <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
+<div class="line"><a id="l00750" name="l00750"></a><span class="lineno">  750</span>      }</div>
+<div class="line"><a id="l00751" name="l00751"></a><span class="lineno">  751</span> </div>
+<div class="line"><a id="l00752" name="l00752"></a><span class="lineno">  752</span>      ws += block_size * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l00753" name="l00753"></a><span class="lineno">  753</span>      scales += block_size / group_size;</div>
+<div class="line"><a id="l00754" name="l00754"></a><span class="lineno">  754</span>      biases += block_size / group_size;</div>
+<div class="line"><a id="l00755" name="l00755"></a><span class="lineno">  755</span>      x += block_size;</div>
+<div class="line"><a id="l00756" name="l00756"></a><span class="lineno">  756</span>    }</div>
+<div class="line"><a id="l00757" name="l00757"></a><span class="lineno">  757</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> remaining = clamp(</div>
+<div class="line"><a id="l00758" name="l00758"></a><span class="lineno">  758</span>        <span class="keyword">static_cast&lt;</span><span class="keywordtype">int</span><span class="keyword">&gt;</span>(in_vec_size - k - simd_lid * values_per_thread),</div>
+<div class="line"><a id="l00759" name="l00759"></a><span class="lineno">  759</span>        0,</div>
+<div class="line"><a id="l00760" name="l00760"></a><span class="lineno">  760</span>        values_per_thread);</div>
+<div class="line"><a id="l00761" name="l00761"></a><span class="lineno">  761</span>    <span class="keywordflow">if</span> (remaining &gt; 0) {</div>
+<div class="line"><a id="l00762" name="l00762"></a><span class="lineno">  762</span>      U sum = <a class="code hl_function" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe&lt;T, U, values_per_thread, bits&gt;</a>(</div>
+<div class="line"><a id="l00763" name="l00763"></a><span class="lineno">  763</span>          x, x_thread, remaining);</div>
+<div class="line"><a id="l00764" name="l00764"></a><span class="lineno">  764</span> </div>
+<div class="line"><a id="l00765" name="l00765"></a><span class="lineno">  765</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; out_row + row &lt; out_vec_size; row++) {</div>
+<div class="line"><a id="l00766" name="l00766"></a><span class="lineno">  766</span>        <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)(ws + row * in_vec_size_w);</div>
+<div class="line"><a id="l00767" name="l00767"></a><span class="lineno">  767</span>        <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
+<div class="line"><a id="l00768" name="l00768"></a><span class="lineno">  768</span>        <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
+<div class="line"><a id="l00769" name="l00769"></a><span class="lineno">  769</span> </div>
+<div class="line"><a id="l00770" name="l00770"></a><span class="lineno">  770</span>        U s = sl[0];</div>
+<div class="line"><a id="l00771" name="l00771"></a><span class="lineno">  771</span>        U b = bl[0];</div>
+<div class="line"><a id="l00772" name="l00772"></a><span class="lineno">  772</span>        result[row] +=</div>
+<div class="line"><a id="l00773" name="l00773"></a><span class="lineno">  773</span>            <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
+<div class="line"><a id="l00774" name="l00774"></a><span class="lineno">  774</span>      }</div>
+<div class="line"><a id="l00775" name="l00775"></a><span class="lineno">  775</span>    }</div>
+<div class="line"><a id="l00776" name="l00776"></a><span class="lineno">  776</span> </div>
+<div class="line"><a id="l00777" name="l00777"></a><span class="lineno">  777</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; out_row + row &lt; out_vec_size; row++) {</div>
+<div class="line"><a id="l00778" name="l00778"></a><span class="lineno">  778</span>      result[row] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[row]);</div>
+<div class="line"><a id="l00779" name="l00779"></a><span class="lineno">  779</span>      <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00780" name="l00780"></a><span class="lineno">  780</span>        y[row] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
+<div class="line"><a id="l00781" name="l00781"></a><span class="lineno">  781</span>      }</div>
+<div class="line"><a id="l00782" name="l00782"></a><span class="lineno">  782</span>    }</div>
+<div class="line"><a id="l00783" name="l00783"></a><span class="lineno">  783</span>  }</div>
+<div class="line"><a id="l00784" name="l00784"></a><span class="lineno">  784</span> </div>
+<div class="line"><a id="l00785" name="l00785"></a><span class="lineno">  785</span>  <span class="comment">// In this case the last tile is moved back to redo some output values</span></div>
+<div class="line"><a id="l00786" name="l00786"></a><span class="lineno">  786</span>  <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00787" name="l00787"></a><span class="lineno">  787</span>    ws += used_out_row * in_vec_size_w +</div>
+<div class="line"><a id="l00788" name="l00788"></a><span class="lineno">  788</span>        simd_lid * packs_per_thread * bytes_per_pack;</div>
+<div class="line"><a id="l00789" name="l00789"></a><span class="lineno">  789</span>    scales += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00790" name="l00790"></a><span class="lineno">  790</span>    biases += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;</div>
+<div class="line"><a id="l00791" name="l00791"></a><span class="lineno">  791</span>    x += tid.y * in_vec_size + simd_lid * values_per_thread;</div>
+<div class="line"><a id="l00792" name="l00792"></a><span class="lineno">  792</span>    y += tid.y * out_vec_size + used_out_row;</div>
+<div class="line"><a id="l00793" name="l00793"></a><span class="lineno">  793</span> </div>
+<div class="line"><a id="l00794" name="l00794"></a><span class="lineno">  794</span>    <span class="keywordtype">int</span> k = 0;</div>
+<div class="line"><a id="l00795" name="l00795"></a><span class="lineno">  795</span>    <span class="keywordflow">for</span> (; k &lt; in_vec_size - block_size; k += block_size) {</div>
+<div class="line"><a id="l00796" name="l00796"></a><span class="lineno">  796</span>      U sum = <a class="code hl_function" href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector&lt;T, U, values_per_thread, bits&gt;</a>(x, x_thread);</div>
 <div class="line"><a id="l00797" name="l00797"></a><span class="lineno">  797</span> </div>
-<div class="line"><a id="l00798" name="l00798"></a><span class="lineno">  798</span>  <span class="comment">// Set the block</span></div>
-<div class="line"><a id="l00799" name="l00799"></a><span class="lineno">  799</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> K_w = K / pack_factor;</div>
-<div class="line"><a id="l00800" name="l00800"></a><span class="lineno">  800</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> K_g = K / group_size;</div>
-<div class="line"><a id="l00801" name="l00801"></a><span class="lineno">  801</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_row = tid.y * BM;</div>
-<div class="line"><a id="l00802" name="l00802"></a><span class="lineno">  802</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_col = tid.x * BN;</div>
-<div class="line"><a id="l00803" name="l00803"></a><span class="lineno">  803</span> </div>
-<div class="line"><a id="l00804" name="l00804"></a><span class="lineno">  804</span>  x += y_row * K;</div>
-<div class="line"><a id="l00805" name="l00805"></a><span class="lineno">  805</span>  w += y_col * K_w;</div>
-<div class="line"><a id="l00806" name="l00806"></a><span class="lineno">  806</span>  scales += y_col * K_g;</div>
-<div class="line"><a id="l00807" name="l00807"></a><span class="lineno">  807</span>  biases += y_col * K_g;</div>
-<div class="line"><a id="l00808" name="l00808"></a><span class="lineno">  808</span>  y += y_row * N + y_col;</div>
-<div class="line"><a id="l00809" name="l00809"></a><span class="lineno">  809</span> </div>
-<div class="line"><a id="l00810" name="l00810"></a><span class="lineno">  810</span>  <span class="comment">// Make the x loader and mma operation</span></div>
-<div class="line"><a id="l00811" name="l00811"></a><span class="lineno">  811</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> num_els = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BM, M - y_row);</div>
-<div class="line"><a id="l00812" name="l00812"></a><span class="lineno">  812</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> num_outs = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BN, N - y_col);</div>
-<div class="line"><a id="l00813" name="l00813"></a><span class="lineno">  813</span>  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);</div>
-<div class="line"><a id="l00814" name="l00814"></a><span class="lineno">  814</span>  loader_w_t loader_w(w, scales, biases, K, Ws, simd_gid, simd_lid);</div>
-<div class="line"><a id="l00815" name="l00815"></a><span class="lineno">  815</span>  mma_t mma_op(simd_gid, simd_lid);</div>
-<div class="line"><a id="l00816" name="l00816"></a><span class="lineno">  816</span> </div>
-<div class="line"><a id="l00817" name="l00817"></a><span class="lineno">  817</span>  <span class="keywordflow">if</span> (num_els &lt; BM) {</div>
-<div class="line"><a id="l00818" name="l00818"></a><span class="lineno">  818</span>    <span class="keywordflow">if</span> (!aligned_N &amp;&amp; num_outs &lt; BN) {</div>
-<div class="line"><a id="l00819" name="l00819"></a><span class="lineno">  819</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
-<div class="line"><a id="l00820" name="l00820"></a><span class="lineno">  820</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00821" name="l00821"></a><span class="lineno">  821</span>        loader_x.load_safe(short2(BK, num_els));</div>
-<div class="line"><a id="l00822" name="l00822"></a><span class="lineno">  822</span>        loader_w.load_safe(short2(BK, num_outs));</div>
-<div class="line"><a id="l00823" name="l00823"></a><span class="lineno">  823</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00824" name="l00824"></a><span class="lineno">  824</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00825" name="l00825"></a><span class="lineno">  825</span>        loader_x.next();</div>
-<div class="line"><a id="l00826" name="l00826"></a><span class="lineno">  826</span>        loader_w.next();</div>
-<div class="line"><a id="l00827" name="l00827"></a><span class="lineno">  827</span>      }</div>
-<div class="line"><a id="l00828" name="l00828"></a><span class="lineno">  828</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00829" name="l00829"></a><span class="lineno">  829</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
-<div class="line"><a id="l00830" name="l00830"></a><span class="lineno">  830</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00831" name="l00831"></a><span class="lineno">  831</span>        loader_x.load_safe(short2(BK, num_els));</div>
-<div class="line"><a id="l00832" name="l00832"></a><span class="lineno">  832</span>        loader_w.load_unsafe();</div>
-<div class="line"><a id="l00833" name="l00833"></a><span class="lineno">  833</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00834" name="l00834"></a><span class="lineno">  834</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00835" name="l00835"></a><span class="lineno">  835</span>        loader_x.next();</div>
-<div class="line"><a id="l00836" name="l00836"></a><span class="lineno">  836</span>        loader_w.next();</div>
+<div class="line"><a id="l00798" name="l00798"></a><span class="lineno">  798</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
+<div class="line"><a id="l00799" name="l00799"></a><span class="lineno">  799</span>        <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)(ws + row * in_vec_size_w);</div>
+<div class="line"><a id="l00800" name="l00800"></a><span class="lineno">  800</span>        <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
+<div class="line"><a id="l00801" name="l00801"></a><span class="lineno">  801</span>        <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
+<div class="line"><a id="l00802" name="l00802"></a><span class="lineno">  802</span> </div>
+<div class="line"><a id="l00803" name="l00803"></a><span class="lineno">  803</span>        U s = sl[0];</div>
+<div class="line"><a id="l00804" name="l00804"></a><span class="lineno">  804</span>        U b = bl[0];</div>
+<div class="line"><a id="l00805" name="l00805"></a><span class="lineno">  805</span>        result[row] +=</div>
+<div class="line"><a id="l00806" name="l00806"></a><span class="lineno">  806</span>            <a class="code hl_function" href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot&lt;U, values_per_thread, bits&gt;</a>(wl, x_thread, s, b, sum);</div>
+<div class="line"><a id="l00807" name="l00807"></a><span class="lineno">  807</span>      }</div>
+<div class="line"><a id="l00808" name="l00808"></a><span class="lineno">  808</span> </div>
+<div class="line"><a id="l00809" name="l00809"></a><span class="lineno">  809</span>      ws += block_size * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l00810" name="l00810"></a><span class="lineno">  810</span>      scales += block_size / group_size;</div>
+<div class="line"><a id="l00811" name="l00811"></a><span class="lineno">  811</span>      biases += block_size / group_size;</div>
+<div class="line"><a id="l00812" name="l00812"></a><span class="lineno">  812</span>      x += block_size;</div>
+<div class="line"><a id="l00813" name="l00813"></a><span class="lineno">  813</span>    }</div>
+<div class="line"><a id="l00814" name="l00814"></a><span class="lineno">  814</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> remaining = clamp(</div>
+<div class="line"><a id="l00815" name="l00815"></a><span class="lineno">  815</span>        <span class="keyword">static_cast&lt;</span><span class="keywordtype">int</span><span class="keyword">&gt;</span>(in_vec_size - k - simd_lid * values_per_thread),</div>
+<div class="line"><a id="l00816" name="l00816"></a><span class="lineno">  816</span>        0,</div>
+<div class="line"><a id="l00817" name="l00817"></a><span class="lineno">  817</span>        values_per_thread);</div>
+<div class="line"><a id="l00818" name="l00818"></a><span class="lineno">  818</span>    <span class="keywordflow">if</span> (remaining &gt; 0) {</div>
+<div class="line"><a id="l00819" name="l00819"></a><span class="lineno">  819</span>      U sum = <a class="code hl_function" href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe&lt;T, U, values_per_thread, bits&gt;</a>(</div>
+<div class="line"><a id="l00820" name="l00820"></a><span class="lineno">  820</span>          x, x_thread, remaining);</div>
+<div class="line"><a id="l00821" name="l00821"></a><span class="lineno">  821</span> </div>
+<div class="line"><a id="l00822" name="l00822"></a><span class="lineno">  822</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
+<div class="line"><a id="l00823" name="l00823"></a><span class="lineno">  823</span>        <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)(ws + row * in_vec_size_w);</div>
+<div class="line"><a id="l00824" name="l00824"></a><span class="lineno">  824</span>        <span class="keyword">const</span> device T* sl = scales + row * in_vec_size_g;</div>
+<div class="line"><a id="l00825" name="l00825"></a><span class="lineno">  825</span>        <span class="keyword">const</span> device T* bl = biases + row * in_vec_size_g;</div>
+<div class="line"><a id="l00826" name="l00826"></a><span class="lineno">  826</span> </div>
+<div class="line"><a id="l00827" name="l00827"></a><span class="lineno">  827</span>        U s = sl[0];</div>
+<div class="line"><a id="l00828" name="l00828"></a><span class="lineno">  828</span>        U b = bl[0];</div>
+<div class="line"><a id="l00829" name="l00829"></a><span class="lineno">  829</span>        result[row] += <a class="code hl_function" href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">qdot_safe&lt;U, values_per_thread, bits&gt;</a>(</div>
+<div class="line"><a id="l00830" name="l00830"></a><span class="lineno">  830</span>            wl, x_thread, s, b, sum, remaining);</div>
+<div class="line"><a id="l00831" name="l00831"></a><span class="lineno">  831</span>      }</div>
+<div class="line"><a id="l00832" name="l00832"></a><span class="lineno">  832</span>    }</div>
+<div class="line"><a id="l00833" name="l00833"></a><span class="lineno">  833</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> row = 0; row &lt; results_per_simdgroup; row++) {</div>
+<div class="line"><a id="l00834" name="l00834"></a><span class="lineno">  834</span>      result[row] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[row]);</div>
+<div class="line"><a id="l00835" name="l00835"></a><span class="lineno">  835</span>      <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00836" name="l00836"></a><span class="lineno">  836</span>        y[row] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[row]);</div>
 <div class="line"><a id="l00837" name="l00837"></a><span class="lineno">  837</span>      }</div>
 <div class="line"><a id="l00838" name="l00838"></a><span class="lineno">  838</span>    }</div>
-<div class="line"><a id="l00839" name="l00839"></a><span class="lineno">  839</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00840" name="l00840"></a><span class="lineno">  840</span>    <span class="keywordflow">if</span> (!aligned_N &amp;&amp; num_outs &lt; BN) {</div>
-<div class="line"><a id="l00841" name="l00841"></a><span class="lineno">  841</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
-<div class="line"><a id="l00842" name="l00842"></a><span class="lineno">  842</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00843" name="l00843"></a><span class="lineno">  843</span>        loader_x.load_unsafe();</div>
-<div class="line"><a id="l00844" name="l00844"></a><span class="lineno">  844</span>        loader_w.load_safe(short2(BK, num_outs));</div>
-<div class="line"><a id="l00845" name="l00845"></a><span class="lineno">  845</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00846" name="l00846"></a><span class="lineno">  846</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00847" name="l00847"></a><span class="lineno">  847</span>        loader_x.next();</div>
-<div class="line"><a id="l00848" name="l00848"></a><span class="lineno">  848</span>        loader_w.next();</div>
-<div class="line"><a id="l00849" name="l00849"></a><span class="lineno">  849</span>      }</div>
-<div class="line"><a id="l00850" name="l00850"></a><span class="lineno">  850</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00851" name="l00851"></a><span class="lineno">  851</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
-<div class="line"><a id="l00852" name="l00852"></a><span class="lineno">  852</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00853" name="l00853"></a><span class="lineno">  853</span>        loader_x.load_unsafe();</div>
-<div class="line"><a id="l00854" name="l00854"></a><span class="lineno">  854</span>        loader_w.load_unsafe();</div>
-<div class="line"><a id="l00855" name="l00855"></a><span class="lineno">  855</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00856" name="l00856"></a><span class="lineno">  856</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00857" name="l00857"></a><span class="lineno">  857</span>        loader_x.next();</div>
-<div class="line"><a id="l00858" name="l00858"></a><span class="lineno">  858</span>        loader_w.next();</div>
-<div class="line"><a id="l00859" name="l00859"></a><span class="lineno">  859</span>      }</div>
-<div class="line"><a id="l00860" name="l00860"></a><span class="lineno">  860</span>    }</div>
-<div class="line"><a id="l00861" name="l00861"></a><span class="lineno">  861</span>  }</div>
+<div class="line"><a id="l00839" name="l00839"></a><span class="lineno">  839</span>  }</div>
+<div class="line"><a id="l00840" name="l00840"></a><span class="lineno">  840</span>}</div>
+</div>
+<div class="line"><a id="l00841" name="l00841"></a><span class="lineno">  841</span> </div>
+<div class="line"><a id="l00842" name="l00842"></a><span class="lineno">  842</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen00843" data-start="{" data-end="}">
+<div class="line"><a id="l00843" name="l00843"></a><span class="lineno"><a class="line" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">  843</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl</a>(</div>
+<div class="line"><a id="l00844" name="l00844"></a><span class="lineno">  844</span>    <span class="keyword">const</span> device uint32_t* w,</div>
+<div class="line"><a id="l00845" name="l00845"></a><span class="lineno">  845</span>    <span class="keyword">const</span> device T* scales,</div>
+<div class="line"><a id="l00846" name="l00846"></a><span class="lineno">  846</span>    <span class="keyword">const</span> device T* biases,</div>
+<div class="line"><a id="l00847" name="l00847"></a><span class="lineno">  847</span>    <span class="keyword">const</span> device T* x,</div>
+<div class="line"><a id="l00848" name="l00848"></a><span class="lineno">  848</span>    device T* y,</div>
+<div class="line"><a id="l00849" name="l00849"></a><span class="lineno">  849</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> in_vec_size,</div>
+<div class="line"><a id="l00850" name="l00850"></a><span class="lineno">  850</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> out_vec_size,</div>
+<div class="line"><a id="l00851" name="l00851"></a><span class="lineno">  851</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00852" name="l00852"></a><span class="lineno">  852</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00853" name="l00853"></a><span class="lineno">  853</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00854" name="l00854"></a><span class="lineno">  854</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> power_of_2_bits = (bits &amp; (bits - 1)) == 0;</div>
+<div class="line"><a id="l00855" name="l00855"></a><span class="lineno">  855</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> num_simdgroups = 2;</div>
+<div class="line"><a id="l00856" name="l00856"></a><span class="lineno">  856</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;</div>
+<div class="line"><a id="l00857" name="l00857"></a><span class="lineno">  857</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> bytes_per_pack = power_of_2_bits ? 4 : 3;</div>
+<div class="line"><a id="l00858" name="l00858"></a><span class="lineno">  858</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> tn = 32 / pack_factor;</div>
+<div class="line"><a id="l00859" name="l00859"></a><span class="lineno">  859</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> block_size = <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>;</div>
+<div class="line"><a id="l00860" name="l00860"></a><span class="lineno">  860</span> </div>
+<div class="line"><a id="l00861" name="l00861"></a><span class="lineno">  861</span>  <span class="keyword">const</span> device uint8_t* ws = (<span class="keyword">const</span> device uint8_t*)w;</div>
 <div class="line"><a id="l00862" name="l00862"></a><span class="lineno">  862</span> </div>
-<div class="line"><a id="l00863" name="l00863"></a><span class="lineno">  863</span>  <span class="comment">// Store results to device memory</span></div>
-<div class="line"><a id="l00864" name="l00864"></a><span class="lineno">  864</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00865" name="l00865"></a><span class="lineno">  865</span>  <span class="keywordflow">if</span> (num_els &lt; BM || num_outs &lt; BN) {</div>
-<div class="line"><a id="l00866" name="l00866"></a><span class="lineno">  866</span>    mma_op.store_result_safe(y, N, short2(num_outs, num_els));</div>
-<div class="line"><a id="l00867" name="l00867"></a><span class="lineno">  867</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00868" name="l00868"></a><span class="lineno">  868</span>    mma_op.store_result(y, N);</div>
-<div class="line"><a id="l00869" name="l00869"></a><span class="lineno">  869</span>  }</div>
-<div class="line"><a id="l00870" name="l00870"></a><span class="lineno">  870</span>}</div>
-</div>
-<div class="line"><a id="l00871" name="l00871"></a><span class="lineno">  871</span> </div>
-<div class="line"><a id="l00872" name="l00872"></a><span class="lineno">  872</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00873" name="l00873"></a><span class="lineno">  873</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00874" name="l00874"></a><span class="lineno">  874</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
-<div class="line"><a id="l00875" name="l00875"></a><span class="lineno">  875</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
-<div class="line"><a id="l00876" name="l00876"></a><span class="lineno">  876</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
-<div class="line"><a id="l00877" name="l00877"></a><span class="lineno">  877</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
-<div class="line"><a id="l00878" name="l00878"></a><span class="lineno">  878</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
-<div class="foldopen" id="foldopen00879" data-start="{" data-end="}">
-<div class="line"><a id="l00879" name="l00879"></a><span class="lineno"><a class="line" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">  879</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl</a>(</div>
-<div class="line"><a id="l00880" name="l00880"></a><span class="lineno">  880</span>    <span class="keyword">const</span> device uint32_t* w,</div>
-<div class="line"><a id="l00881" name="l00881"></a><span class="lineno">  881</span>    <span class="keyword">const</span> device T* scales,</div>
-<div class="line"><a id="l00882" name="l00882"></a><span class="lineno">  882</span>    <span class="keyword">const</span> device T* biases,</div>
-<div class="line"><a id="l00883" name="l00883"></a><span class="lineno">  883</span>    <span class="keyword">const</span> device T* x,</div>
-<div class="line"><a id="l00884" name="l00884"></a><span class="lineno">  884</span>    device T* y,</div>
-<div class="line"><a id="l00885" name="l00885"></a><span class="lineno">  885</span>    threadgroup T* Xs,</div>
-<div class="line"><a id="l00886" name="l00886"></a><span class="lineno">  886</span>    threadgroup T* Ws,</div>
-<div class="line"><a id="l00887" name="l00887"></a><span class="lineno">  887</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K,</div>
-<div class="line"><a id="l00888" name="l00888"></a><span class="lineno">  888</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N,</div>
-<div class="line"><a id="l00889" name="l00889"></a><span class="lineno">  889</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M,</div>
-<div class="line"><a id="l00890" name="l00890"></a><span class="lineno">  890</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00891" name="l00891"></a><span class="lineno">  891</span>    uint lid [[thread_index_in_threadgroup]],</div>
-<div class="line"><a id="l00892" name="l00892"></a><span class="lineno">  892</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l00893" name="l00893"></a><span class="lineno">  893</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l00894" name="l00894"></a><span class="lineno">  894</span>  <span class="keyword">static_assert</span>(BK &gt;= <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>, <span class="stringliteral">&quot;BK should be larger than SIMD_SIZE&quot;</span>);</div>
-<div class="line"><a id="l00895" name="l00895"></a><span class="lineno">  895</span>  <span class="keyword">static_assert</span>(BK % <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a> == 0, <span class="stringliteral">&quot;BK should be divisible by SIMD_SIZE&quot;</span>);</div>
-<div class="line"><a id="l00896" name="l00896"></a><span class="lineno">  896</span> </div>
-<div class="line"><a id="l00897" name="l00897"></a><span class="lineno">  897</span>  (void)lid;</div>
+<div class="line"><a id="l00863" name="l00863"></a><span class="lineno">  863</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00864" name="l00864"></a><span class="lineno">  864</span>  <span class="keyword">typedef</span> <span class="keyword">struct </span>{</div>
+<div class="line"><a id="l00865" name="l00865"></a><span class="lineno">  865</span>    uint8_t wi[tn * bytes_per_pack];</div>
+<div class="line"><a id="l00866" name="l00866"></a><span class="lineno">  866</span>  } vec_w;</div>
+<div class="line"><a id="l00867" name="l00867"></a><span class="lineno">  867</span> </div>
+<div class="line"><a id="l00868" name="l00868"></a><span class="lineno">  868</span>  thread vec_w w_local;</div>
+<div class="line"><a id="l00869" name="l00869"></a><span class="lineno">  869</span>  thread U result[tn * pack_factor] = {0};</div>
+<div class="line"><a id="l00870" name="l00870"></a><span class="lineno">  870</span>  thread U scale = 1;</div>
+<div class="line"><a id="l00871" name="l00871"></a><span class="lineno">  871</span>  thread U bias = 0;</div>
+<div class="line"><a id="l00872" name="l00872"></a><span class="lineno">  872</span>  thread U x_local = 0;</div>
+<div class="line"><a id="l00873" name="l00873"></a><span class="lineno">  873</span> </div>
+<div class="line"><a id="l00874" name="l00874"></a><span class="lineno">  874</span>  <span class="comment">// Adjust positions</span></div>
+<div class="line"><a id="l00875" name="l00875"></a><span class="lineno">  875</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_vec_size_w = out_vec_size * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l00876" name="l00876"></a><span class="lineno">  876</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> out_vec_size_g = out_vec_size / group_size;</div>
+<div class="line"><a id="l00877" name="l00877"></a><span class="lineno">  877</span>  <span class="keywordtype">int</span> out_col = pack_factor * tn * (tid.x * num_simdgroups + simd_gid);</div>
+<div class="line"><a id="l00878" name="l00878"></a><span class="lineno">  878</span>  ws += out_col * bytes_per_pack / pack_factor + simd_lid * out_vec_size_w;</div>
+<div class="line"><a id="l00879" name="l00879"></a><span class="lineno">  879</span>  scales += out_col / group_size + simd_lid * out_vec_size_g;</div>
+<div class="line"><a id="l00880" name="l00880"></a><span class="lineno">  880</span>  biases += out_col / group_size + simd_lid * out_vec_size_g;</div>
+<div class="line"><a id="l00881" name="l00881"></a><span class="lineno">  881</span>  x += tid.y * in_vec_size + simd_lid;</div>
+<div class="line"><a id="l00882" name="l00882"></a><span class="lineno">  882</span>  y += tid.y * out_vec_size + out_col;</div>
+<div class="line"><a id="l00883" name="l00883"></a><span class="lineno">  883</span> </div>
+<div class="line"><a id="l00884" name="l00884"></a><span class="lineno">  884</span>  <span class="keywordflow">if</span> (out_col &gt;= out_vec_size) {</div>
+<div class="line"><a id="l00885" name="l00885"></a><span class="lineno">  885</span>    <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00886" name="l00886"></a><span class="lineno">  886</span>  }</div>
+<div class="line"><a id="l00887" name="l00887"></a><span class="lineno">  887</span> </div>
+<div class="line"><a id="l00888" name="l00888"></a><span class="lineno">  888</span>  <span class="comment">// Loop over in_vec in blocks of block_size</span></div>
+<div class="line"><a id="l00889" name="l00889"></a><span class="lineno">  889</span>  <span class="keywordtype">int</span> remaining = in_vec_size % block_size;</div>
+<div class="line"><a id="l00890" name="l00890"></a><span class="lineno">  890</span>  <span class="keywordflow">if</span> (remaining == 0) {</div>
+<div class="line"><a id="l00891" name="l00891"></a><span class="lineno">  891</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; in_vec_size; i += block_size) {</div>
+<div class="line"><a id="l00892" name="l00892"></a><span class="lineno">  892</span>      x_local = *x;</div>
+<div class="line"><a id="l00893" name="l00893"></a><span class="lineno">  893</span>      scale = *scales;</div>
+<div class="line"><a id="l00894" name="l00894"></a><span class="lineno">  894</span>      bias = *biases;</div>
+<div class="line"><a id="l00895" name="l00895"></a><span class="lineno">  895</span>      w_local = *((device vec_w*)ws);</div>
+<div class="line"><a id="l00896" name="l00896"></a><span class="lineno">  896</span>      <a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter&lt;U, tn * pack_factor, bits&gt;</a>(</div>
+<div class="line"><a id="l00897" name="l00897"></a><span class="lineno">  897</span>          (thread uint8_t*)&amp;w_local, x_local, scale, bias, result);</div>
 <div class="line"><a id="l00898" name="l00898"></a><span class="lineno">  898</span> </div>
-<div class="line"><a id="l00899" name="l00899"></a><span class="lineno">  899</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WM = 2;</div>
-<div class="line"><a id="l00900" name="l00900"></a><span class="lineno">  900</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WN = 2;</div>
-<div class="line"><a id="l00901" name="l00901"></a><span class="lineno">  901</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = 32 / bits;</div>
-<div class="line"><a id="l00902" name="l00902"></a><span class="lineno">  902</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l00903" name="l00903"></a><span class="lineno">  903</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN_padded = (BN + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l00904" name="l00904"></a><span class="lineno">  904</span> </div>
-<div class="line"><a id="l00905" name="l00905"></a><span class="lineno">  905</span>  <span class="comment">// Instantiate the appropriate BlockMMA and Loader</span></div>
-<div class="line"><a id="l00906" name="l00906"></a><span class="lineno">  906</span>  <span class="keyword">using </span>mma_t = mlx::steel::</div>
-<div class="line"><a id="l00907" name="l00907"></a><span class="lineno">  907</span>      BlockMMA&lt;T, T, BM, BN, BK, WM, WN, false, false, BK_padded, BN_padded&gt;;</div>
-<div class="line"><a id="l00908" name="l00908"></a><span class="lineno">  908</span>  <span class="keyword">using </span>loader_x_t = mlx::steel::</div>
-<div class="line"><a id="l00909" name="l00909"></a><span class="lineno">  909</span>      BlockLoader&lt;T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE, 1, 4&gt;;</div>
-<div class="line"><a id="l00910" name="l00910"></a><span class="lineno">  910</span>  <span class="keyword">using </span>loader_w_t = <a class="code hl_struct" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt;</div>
-<div class="line"><a id="l00911" name="l00911"></a><span class="lineno">  911</span>      T,</div>
-<div class="line"><a id="l00912" name="l00912"></a><span class="lineno">  912</span>      BK,</div>
-<div class="line"><a id="l00913" name="l00913"></a><span class="lineno">  913</span>      BN,</div>
-<div class="line"><a id="l00914" name="l00914"></a><span class="lineno">  914</span>      BN_padded,</div>
-<div class="line"><a id="l00915" name="l00915"></a><span class="lineno">  915</span>      0,</div>
-<div class="line"><a id="l00916" name="l00916"></a><span class="lineno">  916</span>      WM * WN * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>,</div>
-<div class="line"><a id="l00917" name="l00917"></a><span class="lineno">  917</span>      group_size,</div>
-<div class="line"><a id="l00918" name="l00918"></a><span class="lineno">  918</span>      bits&gt;;</div>
-<div class="line"><a id="l00919" name="l00919"></a><span class="lineno">  919</span> </div>
-<div class="line"><a id="l00920" name="l00920"></a><span class="lineno">  920</span>  <span class="comment">// Set the block</span></div>
-<div class="line"><a id="l00921" name="l00921"></a><span class="lineno">  921</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_row = tid.y * BM;</div>
-<div class="line"><a id="l00922" name="l00922"></a><span class="lineno">  922</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_col = tid.x * BN;</div>
-<div class="line"><a id="l00923" name="l00923"></a><span class="lineno">  923</span>  x += y_row * K;</div>
-<div class="line"><a id="l00924" name="l00924"></a><span class="lineno">  924</span>  w += y_col / pack_factor;</div>
-<div class="line"><a id="l00925" name="l00925"></a><span class="lineno">  925</span>  scales += y_col / group_size;</div>
-<div class="line"><a id="l00926" name="l00926"></a><span class="lineno">  926</span>  biases += y_col / group_size;</div>
-<div class="line"><a id="l00927" name="l00927"></a><span class="lineno">  927</span>  y += y_row * N + y_col;</div>
-<div class="line"><a id="l00928" name="l00928"></a><span class="lineno">  928</span> </div>
-<div class="line"><a id="l00929" name="l00929"></a><span class="lineno">  929</span>  <span class="comment">// Make the x loader and mma operation</span></div>
-<div class="line"><a id="l00930" name="l00930"></a><span class="lineno">  930</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> num_els = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BM, M - y_row);</div>
-<div class="line"><a id="l00931" name="l00931"></a><span class="lineno">  931</span>  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);</div>
-<div class="line"><a id="l00932" name="l00932"></a><span class="lineno">  932</span>  loader_w_t loader_w(w, scales, biases, N, Ws, simd_gid, simd_lid);</div>
-<div class="line"><a id="l00933" name="l00933"></a><span class="lineno">  933</span>  mma_t mma_op(simd_gid, simd_lid);</div>
-<div class="line"><a id="l00934" name="l00934"></a><span class="lineno">  934</span> </div>
-<div class="line"><a id="l00935" name="l00935"></a><span class="lineno">  935</span>  <span class="keywordflow">if</span> (num_els &lt; BM) {</div>
-<div class="line"><a id="l00936" name="l00936"></a><span class="lineno">  936</span>    <span class="keywordflow">if</span> ((K % BK) != 0) {</div>
-<div class="line"><a id="l00937" name="l00937"></a><span class="lineno">  937</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> k_blocks = K / BK;</div>
-<div class="line"><a id="l00938" name="l00938"></a><span class="lineno">  938</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; k_blocks; k++) {</div>
-<div class="line"><a id="l00939" name="l00939"></a><span class="lineno">  939</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00940" name="l00940"></a><span class="lineno">  940</span>        loader_x.load_safe(short2(BK, num_els));</div>
-<div class="line"><a id="l00941" name="l00941"></a><span class="lineno">  941</span>        loader_w.load_unsafe();</div>
-<div class="line"><a id="l00942" name="l00942"></a><span class="lineno">  942</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00943" name="l00943"></a><span class="lineno">  943</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00944" name="l00944"></a><span class="lineno">  944</span>        loader_x.next();</div>
-<div class="line"><a id="l00945" name="l00945"></a><span class="lineno">  945</span>        loader_w.next();</div>
-<div class="line"><a id="l00946" name="l00946"></a><span class="lineno">  946</span>      }</div>
-<div class="line"><a id="l00947" name="l00947"></a><span class="lineno">  947</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> num_k = K - k_blocks * BK;</div>
-<div class="line"><a id="l00948" name="l00948"></a><span class="lineno">  948</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00949" name="l00949"></a><span class="lineno">  949</span>      loader_x.load_safe(short2(num_k, num_els));</div>
-<div class="line"><a id="l00950" name="l00950"></a><span class="lineno">  950</span>      loader_w.load_safe(short2(BN, num_k));</div>
-<div class="line"><a id="l00951" name="l00951"></a><span class="lineno">  951</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00952" name="l00952"></a><span class="lineno">  952</span>      mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00953" name="l00953"></a><span class="lineno">  953</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00954" name="l00954"></a><span class="lineno">  954</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
-<div class="line"><a id="l00955" name="l00955"></a><span class="lineno">  955</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00956" name="l00956"></a><span class="lineno">  956</span>        loader_x.load_safe(short2(BK, num_els));</div>
-<div class="line"><a id="l00957" name="l00957"></a><span class="lineno">  957</span>        loader_w.load_unsafe();</div>
-<div class="line"><a id="l00958" name="l00958"></a><span class="lineno">  958</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00959" name="l00959"></a><span class="lineno">  959</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00960" name="l00960"></a><span class="lineno">  960</span>        loader_x.next();</div>
-<div class="line"><a id="l00961" name="l00961"></a><span class="lineno">  961</span>        loader_w.next();</div>
-<div class="line"><a id="l00962" name="l00962"></a><span class="lineno">  962</span>      }</div>
-<div class="line"><a id="l00963" name="l00963"></a><span class="lineno">  963</span>    }</div>
-<div class="line"><a id="l00964" name="l00964"></a><span class="lineno">  964</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00965" name="l00965"></a><span class="lineno">  965</span>    <span class="keywordflow">if</span> ((K % BK) != 0) {</div>
-<div class="line"><a id="l00966" name="l00966"></a><span class="lineno">  966</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> k_blocks = K / BK;</div>
-<div class="line"><a id="l00967" name="l00967"></a><span class="lineno">  967</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; k_blocks; k++) {</div>
-<div class="line"><a id="l00968" name="l00968"></a><span class="lineno">  968</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00969" name="l00969"></a><span class="lineno">  969</span>        loader_x.load_unsafe();</div>
-<div class="line"><a id="l00970" name="l00970"></a><span class="lineno">  970</span>        loader_w.load_unsafe();</div>
-<div class="line"><a id="l00971" name="l00971"></a><span class="lineno">  971</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00972" name="l00972"></a><span class="lineno">  972</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00973" name="l00973"></a><span class="lineno">  973</span>        loader_x.next();</div>
-<div class="line"><a id="l00974" name="l00974"></a><span class="lineno">  974</span>        loader_w.next();</div>
-<div class="line"><a id="l00975" name="l00975"></a><span class="lineno">  975</span>      }</div>
-<div class="line"><a id="l00976" name="l00976"></a><span class="lineno">  976</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> num_k = K - k_blocks * BK;</div>
-<div class="line"><a id="l00977" name="l00977"></a><span class="lineno">  977</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00978" name="l00978"></a><span class="lineno">  978</span>      loader_x.load_safe(short2(num_k, BM));</div>
-<div class="line"><a id="l00979" name="l00979"></a><span class="lineno">  979</span>      loader_w.load_safe(short2(BN, num_k));</div>
-<div class="line"><a id="l00980" name="l00980"></a><span class="lineno">  980</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00981" name="l00981"></a><span class="lineno">  981</span>      mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00982" name="l00982"></a><span class="lineno">  982</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00983" name="l00983"></a><span class="lineno">  983</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
-<div class="line"><a id="l00984" name="l00984"></a><span class="lineno">  984</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00985" name="l00985"></a><span class="lineno">  985</span>        loader_x.load_unsafe();</div>
-<div class="line"><a id="l00986" name="l00986"></a><span class="lineno">  986</span>        loader_w.load_unsafe();</div>
-<div class="line"><a id="l00987" name="l00987"></a><span class="lineno">  987</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00988" name="l00988"></a><span class="lineno">  988</span>        mma_op.mma(Xs, Ws);</div>
-<div class="line"><a id="l00989" name="l00989"></a><span class="lineno">  989</span>        loader_x.next();</div>
-<div class="line"><a id="l00990" name="l00990"></a><span class="lineno">  990</span>        loader_w.next();</div>
-<div class="line"><a id="l00991" name="l00991"></a><span class="lineno">  991</span>      }</div>
-<div class="line"><a id="l00992" name="l00992"></a><span class="lineno">  992</span>    }</div>
-<div class="line"><a id="l00993" name="l00993"></a><span class="lineno">  993</span>  }</div>
-<div class="line"><a id="l00994" name="l00994"></a><span class="lineno">  994</span> </div>
-<div class="line"><a id="l00995" name="l00995"></a><span class="lineno">  995</span>  <span class="comment">// Store results to device memory</span></div>
-<div class="line"><a id="l00996" name="l00996"></a><span class="lineno">  996</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00997" name="l00997"></a><span class="lineno">  997</span>  <span class="keywordflow">if</span> (num_els &lt; BM) {</div>
-<div class="line"><a id="l00998" name="l00998"></a><span class="lineno">  998</span>    mma_op.store_result_safe(y, N, short2(BN, num_els));</div>
-<div class="line"><a id="l00999" name="l00999"></a><span class="lineno">  999</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01000" name="l01000"></a><span class="lineno"> 1000</span>    mma_op.store_result(y, N);</div>
-<div class="line"><a id="l01001" name="l01001"></a><span class="lineno"> 1001</span>  }</div>
-<div class="line"><a id="l01002" name="l01002"></a><span class="lineno"> 1002</span>}</div>
+<div class="line"><a id="l00899" name="l00899"></a><span class="lineno">  899</span>      x += block_size;</div>
+<div class="line"><a id="l00900" name="l00900"></a><span class="lineno">  900</span>      scales += block_size * out_vec_size_g;</div>
+<div class="line"><a id="l00901" name="l00901"></a><span class="lineno">  901</span>      biases += block_size * out_vec_size_g;</div>
+<div class="line"><a id="l00902" name="l00902"></a><span class="lineno">  902</span>      ws += block_size * out_vec_size_w;</div>
+<div class="line"><a id="l00903" name="l00903"></a><span class="lineno">  903</span>    }</div>
+<div class="line"><a id="l00904" name="l00904"></a><span class="lineno">  904</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00905" name="l00905"></a><span class="lineno">  905</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = block_size; i &lt; in_vec_size; i += block_size) {</div>
+<div class="line"><a id="l00906" name="l00906"></a><span class="lineno">  906</span>      x_local = *x;</div>
+<div class="line"><a id="l00907" name="l00907"></a><span class="lineno">  907</span>      scale = *scales;</div>
+<div class="line"><a id="l00908" name="l00908"></a><span class="lineno">  908</span>      bias = *biases;</div>
+<div class="line"><a id="l00909" name="l00909"></a><span class="lineno">  909</span>      w_local = *((device vec_w*)ws);</div>
+<div class="line"><a id="l00910" name="l00910"></a><span class="lineno">  910</span> </div>
+<div class="line"><a id="l00911" name="l00911"></a><span class="lineno">  911</span>      <a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter&lt;U, tn * pack_factor, bits&gt;</a>(</div>
+<div class="line"><a id="l00912" name="l00912"></a><span class="lineno">  912</span>          (thread uint8_t*)&amp;w_local, x_local, scale, bias, result);</div>
+<div class="line"><a id="l00913" name="l00913"></a><span class="lineno">  913</span> </div>
+<div class="line"><a id="l00914" name="l00914"></a><span class="lineno">  914</span>      x += block_size;</div>
+<div class="line"><a id="l00915" name="l00915"></a><span class="lineno">  915</span>      scales += block_size * out_vec_size_g;</div>
+<div class="line"><a id="l00916" name="l00916"></a><span class="lineno">  916</span>      biases += block_size * out_vec_size_g;</div>
+<div class="line"><a id="l00917" name="l00917"></a><span class="lineno">  917</span>      ws += block_size * out_vec_size_w;</div>
+<div class="line"><a id="l00918" name="l00918"></a><span class="lineno">  918</span>    }</div>
+<div class="line"><a id="l00919" name="l00919"></a><span class="lineno">  919</span>    <span class="keywordflow">if</span> (<span class="keyword">static_cast&lt;</span><span class="keywordtype">int</span><span class="keyword">&gt;</span>(simd_lid) &lt; remaining) {</div>
+<div class="line"><a id="l00920" name="l00920"></a><span class="lineno">  920</span>      x_local = *x;</div>
+<div class="line"><a id="l00921" name="l00921"></a><span class="lineno">  921</span>      scale = *scales;</div>
+<div class="line"><a id="l00922" name="l00922"></a><span class="lineno">  922</span>      bias = *biases;</div>
+<div class="line"><a id="l00923" name="l00923"></a><span class="lineno">  923</span>      w_local = *((device vec_w*)ws);</div>
+<div class="line"><a id="l00924" name="l00924"></a><span class="lineno">  924</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00925" name="l00925"></a><span class="lineno">  925</span>      x_local = 0;</div>
+<div class="line"><a id="l00926" name="l00926"></a><span class="lineno">  926</span>      scale = 0;</div>
+<div class="line"><a id="l00927" name="l00927"></a><span class="lineno">  927</span>      bias = 0;</div>
+<div class="line"><a id="l00928" name="l00928"></a><span class="lineno">  928</span>    }</div>
+<div class="line"><a id="l00929" name="l00929"></a><span class="lineno">  929</span>    <a class="code hl_function" href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter&lt;U, tn * pack_factor, bits&gt;</a>(</div>
+<div class="line"><a id="l00930" name="l00930"></a><span class="lineno">  930</span>        (thread uint8_t*)&amp;w_local, x_local, scale, bias, result);</div>
+<div class="line"><a id="l00931" name="l00931"></a><span class="lineno">  931</span>  }</div>
+<div class="line"><a id="l00932" name="l00932"></a><span class="lineno">  932</span> </div>
+<div class="line"><a id="l00933" name="l00933"></a><span class="lineno">  933</span><span class="comment">// Accumulate in the simdgroup</span></div>
+<div class="line"><a id="l00934" name="l00934"></a><span class="lineno">  934</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
+<div class="line"><a id="l00935" name="l00935"></a><span class="lineno">  935</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; tn * pack_factor; k++) {</div>
+<div class="line"><a id="l00936" name="l00936"></a><span class="lineno">  936</span>    result[k] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(result[k]);</div>
+<div class="line"><a id="l00937" name="l00937"></a><span class="lineno">  937</span>  }</div>
+<div class="line"><a id="l00938" name="l00938"></a><span class="lineno">  938</span> </div>
+<div class="line"><a id="l00939" name="l00939"></a><span class="lineno">  939</span>  <span class="comment">// Store the result</span></div>
+<div class="line"><a id="l00940" name="l00940"></a><span class="lineno">  940</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00941" name="l00941"></a><span class="lineno">  941</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
+<div class="line"><a id="l00942" name="l00942"></a><span class="lineno">  942</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; tn * pack_factor; k++) {</div>
+<div class="line"><a id="l00943" name="l00943"></a><span class="lineno">  943</span>      y[k] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(result[k]);</div>
+<div class="line"><a id="l00944" name="l00944"></a><span class="lineno">  944</span>    }</div>
+<div class="line"><a id="l00945" name="l00945"></a><span class="lineno">  945</span>  }</div>
+<div class="line"><a id="l00946" name="l00946"></a><span class="lineno">  946</span>}</div>
 </div>
-<div class="line"><a id="l01003" name="l01003"></a><span class="lineno"> 1003</span> </div>
-<div class="line"><a id="l01004" name="l01004"></a><span class="lineno"> 1004</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
-<div class="foldopen" id="foldopen01005" data-start="{" data-end="}">
-<div class="line"><a id="l01005" name="l01005"></a><span class="lineno"><a class="line" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be"> 1005</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets</a>(</div>
-<div class="line"><a id="l01006" name="l01006"></a><span class="lineno"> 1006</span>    <span class="keyword">const</span> device T*&amp; x,</div>
-<div class="line"><a id="l01007" name="l01007"></a><span class="lineno"> 1007</span>    <span class="keyword">const</span> device uint32_t*&amp; w,</div>
-<div class="line"><a id="l01008" name="l01008"></a><span class="lineno"> 1008</span>    <span class="keyword">const</span> device T*&amp; scales,</div>
-<div class="line"><a id="l01009" name="l01009"></a><span class="lineno"> 1009</span>    <span class="keyword">const</span> device T*&amp; biases,</div>
-<div class="line"><a id="l01010" name="l01010"></a><span class="lineno"> 1010</span>    device T*&amp; y,</div>
-<div class="line"><a id="l01011" name="l01011"></a><span class="lineno"> 1011</span>    <span class="keywordtype">int</span> output_stride,</div>
-<div class="line"><a id="l01012" name="l01012"></a><span class="lineno"> 1012</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims,</div>
-<div class="line"><a id="l01013" name="l01013"></a><span class="lineno"> 1013</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape,</div>
-<div class="line"><a id="l01014" name="l01014"></a><span class="lineno"> 1014</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides,</div>
-<div class="line"><a id="l01015" name="l01015"></a><span class="lineno"> 1015</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims,</div>
-<div class="line"><a id="l01016" name="l01016"></a><span class="lineno"> 1016</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape,</div>
-<div class="line"><a id="l01017" name="l01017"></a><span class="lineno"> 1017</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides,</div>
-<div class="line"><a id="l01018" name="l01018"></a><span class="lineno"> 1018</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides,</div>
-<div class="line"><a id="l01019" name="l01019"></a><span class="lineno"> 1019</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides,</div>
-<div class="line"><a id="l01020" name="l01020"></a><span class="lineno"> 1020</span>    uint3 tid [[threadgroup_position_in_grid]]) {</div>
-<div class="line"><a id="l01021" name="l01021"></a><span class="lineno"> 1021</span>  <span class="comment">// Set the input/output matrices</span></div>
-<div class="line"><a id="l01022" name="l01022"></a><span class="lineno"> 1022</span>  uint32_t x_idx = tid.z;</div>
-<div class="line"><a id="l01023" name="l01023"></a><span class="lineno"> 1023</span>  uint32_t w_idx = tid.z;</div>
-<div class="line"><a id="l01024" name="l01024"></a><span class="lineno"> 1024</span>  <span class="keywordflow">if</span> (x_batch_ndims == 1) {</div>
-<div class="line"><a id="l01025" name="l01025"></a><span class="lineno"> 1025</span>    x += x_idx * x_strides[0];</div>
-<div class="line"><a id="l01026" name="l01026"></a><span class="lineno"> 1026</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01027" name="l01027"></a><span class="lineno"> 1027</span>    x += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(x_idx, x_shape, x_strides, x_batch_ndims);</div>
-<div class="line"><a id="l01028" name="l01028"></a><span class="lineno"> 1028</span>  }</div>
-<div class="line"><a id="l01029" name="l01029"></a><span class="lineno"> 1029</span>  <span class="keywordflow">if</span> (w_batch_ndims == 1) {</div>
-<div class="line"><a id="l01030" name="l01030"></a><span class="lineno"> 1030</span>    w += w_idx * w_strides[0];</div>
-<div class="line"><a id="l01031" name="l01031"></a><span class="lineno"> 1031</span>    scales += w_idx * s_strides[0];</div>
-<div class="line"><a id="l01032" name="l01032"></a><span class="lineno"> 1032</span>    biases += w_idx * b_strides[0];</div>
-<div class="line"><a id="l01033" name="l01033"></a><span class="lineno"> 1033</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01034" name="l01034"></a><span class="lineno"> 1034</span>    ulong3 idx = <a class="code hl_function" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a>(</div>
-<div class="line"><a id="l01035" name="l01035"></a><span class="lineno"> 1035</span>        w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);</div>
-<div class="line"><a id="l01036" name="l01036"></a><span class="lineno"> 1036</span>    w += idx.x;</div>
-<div class="line"><a id="l01037" name="l01037"></a><span class="lineno"> 1037</span>    scales += idx.y;</div>
-<div class="line"><a id="l01038" name="l01038"></a><span class="lineno"> 1038</span>    biases += idx.z;</div>
-<div class="line"><a id="l01039" name="l01039"></a><span class="lineno"> 1039</span>  }</div>
-<div class="line"><a id="l01040" name="l01040"></a><span class="lineno"> 1040</span>  y += tid.z * output_stride;</div>
-<div class="line"><a id="l01041" name="l01041"></a><span class="lineno"> 1041</span>}</div>
+<div class="line"><a id="l00947" name="l00947"></a><span class="lineno">  947</span> </div>
+<div class="line"><a id="l00948" name="l00948"></a><span class="lineno">  948</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00949" name="l00949"></a><span class="lineno">  949</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00950" name="l00950"></a><span class="lineno">  950</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
+<div class="line"><a id="l00951" name="l00951"></a><span class="lineno">  951</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
+<div class="line"><a id="l00952" name="l00952"></a><span class="lineno">  952</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> aligned_N,</div>
+<div class="line"><a id="l00953" name="l00953"></a><span class="lineno">  953</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
+<div class="line"><a id="l00954" name="l00954"></a><span class="lineno">  954</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
+<div class="line"><a id="l00955" name="l00955"></a><span class="lineno">  955</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
+<div class="foldopen" id="foldopen00956" data-start="{" data-end="}">
+<div class="line"><a id="l00956" name="l00956"></a><span class="lineno"><a class="line" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">  956</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl</a>(</div>
+<div class="line"><a id="l00957" name="l00957"></a><span class="lineno">  957</span>    <span class="keyword">const</span> device uint32_t* w,</div>
+<div class="line"><a id="l00958" name="l00958"></a><span class="lineno">  958</span>    <span class="keyword">const</span> device T* scales,</div>
+<div class="line"><a id="l00959" name="l00959"></a><span class="lineno">  959</span>    <span class="keyword">const</span> device T* biases,</div>
+<div class="line"><a id="l00960" name="l00960"></a><span class="lineno">  960</span>    <span class="keyword">const</span> device T* x,</div>
+<div class="line"><a id="l00961" name="l00961"></a><span class="lineno">  961</span>    device T* y,</div>
+<div class="line"><a id="l00962" name="l00962"></a><span class="lineno">  962</span>    threadgroup T* Xs,</div>
+<div class="line"><a id="l00963" name="l00963"></a><span class="lineno">  963</span>    threadgroup T* Ws,</div>
+<div class="line"><a id="l00964" name="l00964"></a><span class="lineno">  964</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K,</div>
+<div class="line"><a id="l00965" name="l00965"></a><span class="lineno">  965</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N,</div>
+<div class="line"><a id="l00966" name="l00966"></a><span class="lineno">  966</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M,</div>
+<div class="line"><a id="l00967" name="l00967"></a><span class="lineno">  967</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00968" name="l00968"></a><span class="lineno">  968</span>    uint lid [[thread_index_in_threadgroup]],</div>
+<div class="line"><a id="l00969" name="l00969"></a><span class="lineno">  969</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00970" name="l00970"></a><span class="lineno">  970</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00971" name="l00971"></a><span class="lineno">  971</span>  <span class="keyword">static_assert</span>(BK &gt;= <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>, <span class="stringliteral">&quot;BK should be larger than SIMD_SIZE&quot;</span>);</div>
+<div class="line"><a id="l00972" name="l00972"></a><span class="lineno">  972</span>  <span class="keyword">static_assert</span>(BK % <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a> == 0, <span class="stringliteral">&quot;BK should be divisible by SIMD_SIZE&quot;</span>);</div>
+<div class="line"><a id="l00973" name="l00973"></a><span class="lineno">  973</span> </div>
+<div class="line"><a id="l00974" name="l00974"></a><span class="lineno">  974</span>  (void)lid;</div>
+<div class="line"><a id="l00975" name="l00975"></a><span class="lineno">  975</span> </div>
+<div class="line"><a id="l00976" name="l00976"></a><span class="lineno">  976</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WM = 2;</div>
+<div class="line"><a id="l00977" name="l00977"></a><span class="lineno">  977</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WN = 2;</div>
+<div class="line"><a id="l00978" name="l00978"></a><span class="lineno">  978</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;</div>
+<div class="line"><a id="l00979" name="l00979"></a><span class="lineno">  979</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l00980" name="l00980"></a><span class="lineno">  980</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;</div>
+<div class="line"><a id="l00981" name="l00981"></a><span class="lineno">  981</span> </div>
+<div class="line"><a id="l00982" name="l00982"></a><span class="lineno">  982</span>  <span class="comment">// Instantiate the appropriate BlockMMA and Loader</span></div>
+<div class="line"><a id="l00983" name="l00983"></a><span class="lineno">  983</span>  <span class="keyword">using </span>mma_t = mlx::steel::</div>
+<div class="line"><a id="l00984" name="l00984"></a><span class="lineno">  984</span>      BlockMMA&lt;T, T, BM, BN, BK, WM, WN, false, true, BK_padded, BK_padded&gt;;</div>
+<div class="line"><a id="l00985" name="l00985"></a><span class="lineno">  985</span>  <span class="keyword">using </span>loader_x_t =</div>
+<div class="line"><a id="l00986" name="l00986"></a><span class="lineno">  986</span>      <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt;T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE&gt;</a>;</div>
+<div class="line"><a id="l00987" name="l00987"></a><span class="lineno">  987</span>  <span class="keyword">using </span>loader_w_t = <a class="code hl_struct" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt;</div>
+<div class="line"><a id="l00988" name="l00988"></a><span class="lineno">  988</span>      T,</div>
+<div class="line"><a id="l00989" name="l00989"></a><span class="lineno">  989</span>      BN,</div>
+<div class="line"><a id="l00990" name="l00990"></a><span class="lineno">  990</span>      BK,</div>
+<div class="line"><a id="l00991" name="l00991"></a><span class="lineno">  991</span>      BK_padded,</div>
+<div class="line"><a id="l00992" name="l00992"></a><span class="lineno">  992</span>      1,</div>
+<div class="line"><a id="l00993" name="l00993"></a><span class="lineno">  993</span>      WM * WN * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>,</div>
+<div class="line"><a id="l00994" name="l00994"></a><span class="lineno">  994</span>      group_size,</div>
+<div class="line"><a id="l00995" name="l00995"></a><span class="lineno">  995</span>      bits&gt;;</div>
+<div class="line"><a id="l00996" name="l00996"></a><span class="lineno">  996</span> </div>
+<div class="line"><a id="l00997" name="l00997"></a><span class="lineno">  997</span>  <span class="comment">// Set the block</span></div>
+<div class="line"><a id="l00998" name="l00998"></a><span class="lineno">  998</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> K_w = K * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l00999" name="l00999"></a><span class="lineno">  999</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> K_g = K / group_size;</div>
+<div class="line"><a id="l01000" name="l01000"></a><span class="lineno"> 1000</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_row = tid.y * BM;</div>
+<div class="line"><a id="l01001" name="l01001"></a><span class="lineno"> 1001</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_col = tid.x * BN;</div>
+<div class="line"><a id="l01002" name="l01002"></a><span class="lineno"> 1002</span> </div>
+<div class="line"><a id="l01003" name="l01003"></a><span class="lineno"> 1003</span>  <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)w;</div>
+<div class="line"><a id="l01004" name="l01004"></a><span class="lineno"> 1004</span> </div>
+<div class="line"><a id="l01005" name="l01005"></a><span class="lineno"> 1005</span>  x += y_row * K;</div>
+<div class="line"><a id="l01006" name="l01006"></a><span class="lineno"> 1006</span>  wl += y_col * K_w;</div>
+<div class="line"><a id="l01007" name="l01007"></a><span class="lineno"> 1007</span>  scales += y_col * K_g;</div>
+<div class="line"><a id="l01008" name="l01008"></a><span class="lineno"> 1008</span>  biases += y_col * K_g;</div>
+<div class="line"><a id="l01009" name="l01009"></a><span class="lineno"> 1009</span>  y += y_row * N + y_col;</div>
+<div class="line"><a id="l01010" name="l01010"></a><span class="lineno"> 1010</span> </div>
+<div class="line"><a id="l01011" name="l01011"></a><span class="lineno"> 1011</span>  <span class="comment">// Make the x loader and mma operation</span></div>
+<div class="line"><a id="l01012" name="l01012"></a><span class="lineno"> 1012</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> num_els = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BM, M - y_row);</div>
+<div class="line"><a id="l01013" name="l01013"></a><span class="lineno"> 1013</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> num_outs = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BN, N - y_col);</div>
+<div class="line"><a id="l01014" name="l01014"></a><span class="lineno"> 1014</span>  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01015" name="l01015"></a><span class="lineno"> 1015</span>  loader_w_t loader_w(wl, scales, biases, K, Ws, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01016" name="l01016"></a><span class="lineno"> 1016</span>  mma_t mma_op(simd_gid, simd_lid);</div>
+<div class="line"><a id="l01017" name="l01017"></a><span class="lineno"> 1017</span> </div>
+<div class="line"><a id="l01018" name="l01018"></a><span class="lineno"> 1018</span>  <span class="keywordflow">if</span> (num_els &lt; BM) {</div>
+<div class="line"><a id="l01019" name="l01019"></a><span class="lineno"> 1019</span>    <span class="keywordflow">if</span> (!aligned_N &amp;&amp; num_outs &lt; BN) {</div>
+<div class="line"><a id="l01020" name="l01020"></a><span class="lineno"> 1020</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
+<div class="line"><a id="l01021" name="l01021"></a><span class="lineno"> 1021</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01022" name="l01022"></a><span class="lineno"> 1022</span>        loader_x.load_safe(short2(BK, num_els));</div>
+<div class="line"><a id="l01023" name="l01023"></a><span class="lineno"> 1023</span>        loader_w.load_safe(short2(BK, num_outs));</div>
+<div class="line"><a id="l01024" name="l01024"></a><span class="lineno"> 1024</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01025" name="l01025"></a><span class="lineno"> 1025</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01026" name="l01026"></a><span class="lineno"> 1026</span>        loader_x.next();</div>
+<div class="line"><a id="l01027" name="l01027"></a><span class="lineno"> 1027</span>        loader_w.next();</div>
+<div class="line"><a id="l01028" name="l01028"></a><span class="lineno"> 1028</span>      }</div>
+<div class="line"><a id="l01029" name="l01029"></a><span class="lineno"> 1029</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01030" name="l01030"></a><span class="lineno"> 1030</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
+<div class="line"><a id="l01031" name="l01031"></a><span class="lineno"> 1031</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01032" name="l01032"></a><span class="lineno"> 1032</span>        loader_x.load_safe(short2(BK, num_els));</div>
+<div class="line"><a id="l01033" name="l01033"></a><span class="lineno"> 1033</span>        loader_w.load_unsafe();</div>
+<div class="line"><a id="l01034" name="l01034"></a><span class="lineno"> 1034</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01035" name="l01035"></a><span class="lineno"> 1035</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01036" name="l01036"></a><span class="lineno"> 1036</span>        loader_x.next();</div>
+<div class="line"><a id="l01037" name="l01037"></a><span class="lineno"> 1037</span>        loader_w.next();</div>
+<div class="line"><a id="l01038" name="l01038"></a><span class="lineno"> 1038</span>      }</div>
+<div class="line"><a id="l01039" name="l01039"></a><span class="lineno"> 1039</span>    }</div>
+<div class="line"><a id="l01040" name="l01040"></a><span class="lineno"> 1040</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01041" name="l01041"></a><span class="lineno"> 1041</span>    <span class="keywordflow">if</span> (!aligned_N &amp;&amp; num_outs &lt; BN) {</div>
+<div class="line"><a id="l01042" name="l01042"></a><span class="lineno"> 1042</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
+<div class="line"><a id="l01043" name="l01043"></a><span class="lineno"> 1043</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01044" name="l01044"></a><span class="lineno"> 1044</span>        loader_x.load_unsafe();</div>
+<div class="line"><a id="l01045" name="l01045"></a><span class="lineno"> 1045</span>        loader_w.load_safe(short2(BK, num_outs));</div>
+<div class="line"><a id="l01046" name="l01046"></a><span class="lineno"> 1046</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01047" name="l01047"></a><span class="lineno"> 1047</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01048" name="l01048"></a><span class="lineno"> 1048</span>        loader_x.next();</div>
+<div class="line"><a id="l01049" name="l01049"></a><span class="lineno"> 1049</span>        loader_w.next();</div>
+<div class="line"><a id="l01050" name="l01050"></a><span class="lineno"> 1050</span>      }</div>
+<div class="line"><a id="l01051" name="l01051"></a><span class="lineno"> 1051</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01052" name="l01052"></a><span class="lineno"> 1052</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
+<div class="line"><a id="l01053" name="l01053"></a><span class="lineno"> 1053</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01054" name="l01054"></a><span class="lineno"> 1054</span>        loader_x.load_unsafe();</div>
+<div class="line"><a id="l01055" name="l01055"></a><span class="lineno"> 1055</span>        loader_w.load_unsafe();</div>
+<div class="line"><a id="l01056" name="l01056"></a><span class="lineno"> 1056</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01057" name="l01057"></a><span class="lineno"> 1057</span> </div>
+<div class="line"><a id="l01058" name="l01058"></a><span class="lineno"> 1058</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01059" name="l01059"></a><span class="lineno"> 1059</span>        loader_x.next();</div>
+<div class="line"><a id="l01060" name="l01060"></a><span class="lineno"> 1060</span>        loader_w.next();</div>
+<div class="line"><a id="l01061" name="l01061"></a><span class="lineno"> 1061</span>      }</div>
+<div class="line"><a id="l01062" name="l01062"></a><span class="lineno"> 1062</span>    }</div>
+<div class="line"><a id="l01063" name="l01063"></a><span class="lineno"> 1063</span>  }</div>
+<div class="line"><a id="l01064" name="l01064"></a><span class="lineno"> 1064</span> </div>
+<div class="line"><a id="l01065" name="l01065"></a><span class="lineno"> 1065</span>  <span class="comment">// Store results to device memory</span></div>
+<div class="line"><a id="l01066" name="l01066"></a><span class="lineno"> 1066</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01067" name="l01067"></a><span class="lineno"> 1067</span>  <span class="keywordflow">if</span> (num_els &lt; BM || num_outs &lt; BN) {</div>
+<div class="line"><a id="l01068" name="l01068"></a><span class="lineno"> 1068</span>    mma_op.store_result_safe(y, N, short2(num_outs, num_els));</div>
+<div class="line"><a id="l01069" name="l01069"></a><span class="lineno"> 1069</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01070" name="l01070"></a><span class="lineno"> 1070</span>    mma_op.store_result(y, N);</div>
+<div class="line"><a id="l01071" name="l01071"></a><span class="lineno"> 1071</span>  }</div>
+<div class="line"><a id="l01072" name="l01072"></a><span class="lineno"> 1072</span>}</div>
 </div>
-<div class="line"><a id="l01042" name="l01042"></a><span class="lineno"> 1042</span> </div>
-<div class="line"><a id="l01043" name="l01043"></a><span class="lineno"> 1043</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
-<div class="foldopen" id="foldopen01044" data-start="{" data-end="}">
-<div class="line"><a id="l01044" name="l01044"></a><span class="lineno"><a class="line" href="quantized_8h.html#a3ab400746ad77be89c30d25638e01698"> 1044</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets</a>(</div>
-<div class="line"><a id="l01045" name="l01045"></a><span class="lineno"> 1045</span>    <span class="keyword">const</span> device T*&amp; x,</div>
-<div class="line"><a id="l01046" name="l01046"></a><span class="lineno"> 1046</span>    <span class="keyword">const</span> device uint32_t*&amp; w,</div>
-<div class="line"><a id="l01047" name="l01047"></a><span class="lineno"> 1047</span>    <span class="keyword">const</span> device T*&amp; scales,</div>
-<div class="line"><a id="l01048" name="l01048"></a><span class="lineno"> 1048</span>    <span class="keyword">const</span> device T*&amp; biases,</div>
-<div class="line"><a id="l01049" name="l01049"></a><span class="lineno"> 1049</span>    <span class="keyword">const</span> device uint32_t* lhs_indices,</div>
-<div class="line"><a id="l01050" name="l01050"></a><span class="lineno"> 1050</span>    <span class="keyword">const</span> device uint32_t* rhs_indices,</div>
-<div class="line"><a id="l01051" name="l01051"></a><span class="lineno"> 1051</span>    device T*&amp; y,</div>
-<div class="line"><a id="l01052" name="l01052"></a><span class="lineno"> 1052</span>    <span class="keywordtype">int</span> output_stride,</div>
-<div class="line"><a id="l01053" name="l01053"></a><span class="lineno"> 1053</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims,</div>
-<div class="line"><a id="l01054" name="l01054"></a><span class="lineno"> 1054</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape,</div>
-<div class="line"><a id="l01055" name="l01055"></a><span class="lineno"> 1055</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides,</div>
-<div class="line"><a id="l01056" name="l01056"></a><span class="lineno"> 1056</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides,</div>
-<div class="line"><a id="l01057" name="l01057"></a><span class="lineno"> 1057</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims,</div>
-<div class="line"><a id="l01058" name="l01058"></a><span class="lineno"> 1058</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape,</div>
-<div class="line"><a id="l01059" name="l01059"></a><span class="lineno"> 1059</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides,</div>
-<div class="line"><a id="l01060" name="l01060"></a><span class="lineno"> 1060</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims,</div>
-<div class="line"><a id="l01061" name="l01061"></a><span class="lineno"> 1061</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape,</div>
-<div class="line"><a id="l01062" name="l01062"></a><span class="lineno"> 1062</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides,</div>
-<div class="line"><a id="l01063" name="l01063"></a><span class="lineno"> 1063</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides,</div>
-<div class="line"><a id="l01064" name="l01064"></a><span class="lineno"> 1064</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides,</div>
-<div class="line"><a id="l01065" name="l01065"></a><span class="lineno"> 1065</span>    uint3 tid [[threadgroup_position_in_grid]]) {</div>
-<div class="line"><a id="l01066" name="l01066"></a><span class="lineno"> 1066</span>  <span class="comment">// Set the input/output matrices</span></div>
-<div class="line"><a id="l01067" name="l01067"></a><span class="lineno"> 1067</span>  uint32_t x_idx;</div>
-<div class="line"><a id="l01068" name="l01068"></a><span class="lineno"> 1068</span>  uint32_t w_idx;</div>
-<div class="line"><a id="l01069" name="l01069"></a><span class="lineno"> 1069</span>  <span class="keywordflow">if</span> (batch_ndims == 1) {</div>
-<div class="line"><a id="l01070" name="l01070"></a><span class="lineno"> 1070</span>    x_idx = lhs_indices[tid.z * lhs_strides[0]];</div>
-<div class="line"><a id="l01071" name="l01071"></a><span class="lineno"> 1071</span>    w_idx = rhs_indices[tid.z * rhs_strides[0]];</div>
-<div class="line"><a id="l01072" name="l01072"></a><span class="lineno"> 1072</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01073" name="l01073"></a><span class="lineno"> 1073</span>    ulong2 idx = <a class="code hl_function" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a>(</div>
-<div class="line"><a id="l01074" name="l01074"></a><span class="lineno"> 1074</span>        tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);</div>
-<div class="line"><a id="l01075" name="l01075"></a><span class="lineno"> 1075</span>    x_idx = lhs_indices[idx.x];</div>
-<div class="line"><a id="l01076" name="l01076"></a><span class="lineno"> 1076</span>    w_idx = rhs_indices[idx.y];</div>
-<div class="line"><a id="l01077" name="l01077"></a><span class="lineno"> 1077</span>  }</div>
-<div class="line"><a id="l01078" name="l01078"></a><span class="lineno"> 1078</span>  <span class="keywordflow">if</span> (x_batch_ndims == 1) {</div>
-<div class="line"><a id="l01079" name="l01079"></a><span class="lineno"> 1079</span>    x += x_idx * x_strides[0];</div>
-<div class="line"><a id="l01080" name="l01080"></a><span class="lineno"> 1080</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01081" name="l01081"></a><span class="lineno"> 1081</span>    x += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(x_idx, x_shape, x_strides, x_batch_ndims);</div>
-<div class="line"><a id="l01082" name="l01082"></a><span class="lineno"> 1082</span>  }</div>
-<div class="line"><a id="l01083" name="l01083"></a><span class="lineno"> 1083</span>  <span class="keywordflow">if</span> (w_batch_ndims == 1) {</div>
-<div class="line"><a id="l01084" name="l01084"></a><span class="lineno"> 1084</span>    w += w_idx * w_strides[0];</div>
-<div class="line"><a id="l01085" name="l01085"></a><span class="lineno"> 1085</span>    scales += w_idx * s_strides[0];</div>
-<div class="line"><a id="l01086" name="l01086"></a><span class="lineno"> 1086</span>    biases += w_idx * b_strides[0];</div>
-<div class="line"><a id="l01087" name="l01087"></a><span class="lineno"> 1087</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01088" name="l01088"></a><span class="lineno"> 1088</span>    ulong3 idx = <a class="code hl_function" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a>(</div>
-<div class="line"><a id="l01089" name="l01089"></a><span class="lineno"> 1089</span>        w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);</div>
-<div class="line"><a id="l01090" name="l01090"></a><span class="lineno"> 1090</span>    w += idx.x;</div>
-<div class="line"><a id="l01091" name="l01091"></a><span class="lineno"> 1091</span>    scales += idx.y;</div>
-<div class="line"><a id="l01092" name="l01092"></a><span class="lineno"> 1092</span>    biases += idx.z;</div>
-<div class="line"><a id="l01093" name="l01093"></a><span class="lineno"> 1093</span>  }</div>
-<div class="line"><a id="l01094" name="l01094"></a><span class="lineno"> 1094</span>  y += tid.z * output_stride;</div>
-<div class="line"><a id="l01095" name="l01095"></a><span class="lineno"> 1095</span>}</div>
+<div class="line"><a id="l01073" name="l01073"></a><span class="lineno"> 1073</span> </div>
+<div class="line"><a id="l01074" name="l01074"></a><span class="lineno"> 1074</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l01075" name="l01075"></a><span class="lineno"> 1075</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l01076" name="l01076"></a><span class="lineno"> 1076</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
+<div class="line"><a id="l01077" name="l01077"></a><span class="lineno"> 1077</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
+<div class="line"><a id="l01078" name="l01078"></a><span class="lineno"> 1078</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
+<div class="line"><a id="l01079" name="l01079"></a><span class="lineno"> 1079</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
+<div class="line"><a id="l01080" name="l01080"></a><span class="lineno"> 1080</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
+<div class="foldopen" id="foldopen01081" data-start="{" data-end="}">
+<div class="line"><a id="l01081" name="l01081"></a><span class="lineno"><a class="line" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9"> 1081</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl</a>(</div>
+<div class="line"><a id="l01082" name="l01082"></a><span class="lineno"> 1082</span>    <span class="keyword">const</span> device uint32_t* w,</div>
+<div class="line"><a id="l01083" name="l01083"></a><span class="lineno"> 1083</span>    <span class="keyword">const</span> device T* scales,</div>
+<div class="line"><a id="l01084" name="l01084"></a><span class="lineno"> 1084</span>    <span class="keyword">const</span> device T* biases,</div>
+<div class="line"><a id="l01085" name="l01085"></a><span class="lineno"> 1085</span>    <span class="keyword">const</span> device T* x,</div>
+<div class="line"><a id="l01086" name="l01086"></a><span class="lineno"> 1086</span>    device T* y,</div>
+<div class="line"><a id="l01087" name="l01087"></a><span class="lineno"> 1087</span>    threadgroup T* Xs,</div>
+<div class="line"><a id="l01088" name="l01088"></a><span class="lineno"> 1088</span>    threadgroup T* Ws,</div>
+<div class="line"><a id="l01089" name="l01089"></a><span class="lineno"> 1089</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K,</div>
+<div class="line"><a id="l01090" name="l01090"></a><span class="lineno"> 1090</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N,</div>
+<div class="line"><a id="l01091" name="l01091"></a><span class="lineno"> 1091</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M,</div>
+<div class="line"><a id="l01092" name="l01092"></a><span class="lineno"> 1092</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01093" name="l01093"></a><span class="lineno"> 1093</span>    uint lid [[thread_index_in_threadgroup]],</div>
+<div class="line"><a id="l01094" name="l01094"></a><span class="lineno"> 1094</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01095" name="l01095"></a><span class="lineno"> 1095</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01096" name="l01096"></a><span class="lineno"> 1096</span>  <span class="keyword">static_assert</span>(BK &gt;= <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>, <span class="stringliteral">&quot;BK should be larger than SIMD_SIZE&quot;</span>);</div>
+<div class="line"><a id="l01097" name="l01097"></a><span class="lineno"> 1097</span>  <span class="keyword">static_assert</span>(BK % <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a> == 0, <span class="stringliteral">&quot;BK should be divisible by SIMD_SIZE&quot;</span>);</div>
+<div class="line"><a id="l01098" name="l01098"></a><span class="lineno"> 1098</span> </div>
+<div class="line"><a id="l01099" name="l01099"></a><span class="lineno"> 1099</span>  (void)lid;</div>
+<div class="line"><a id="l01100" name="l01100"></a><span class="lineno"> 1100</span> </div>
+<div class="line"><a id="l01101" name="l01101"></a><span class="lineno"> 1101</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WM = 2;</div>
+<div class="line"><a id="l01102" name="l01102"></a><span class="lineno"> 1102</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> WN = 2;</div>
+<div class="line"><a id="l01103" name="l01103"></a><span class="lineno"> 1103</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;</div>
+<div class="line"><a id="l01104" name="l01104"></a><span class="lineno"> 1104</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01105" name="l01105"></a><span class="lineno"> 1105</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN_padded = (BN + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01106" name="l01106"></a><span class="lineno"> 1106</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> power_of_2_bits = (bits &amp; (bits - 1)) == 0;</div>
+<div class="line"><a id="l01107" name="l01107"></a><span class="lineno"> 1107</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> bytes_per_pack = power_of_2_bits ? 1 : 3;</div>
+<div class="line"><a id="l01108" name="l01108"></a><span class="lineno"> 1108</span> </div>
+<div class="line"><a id="l01109" name="l01109"></a><span class="lineno"> 1109</span>  <span class="comment">// Instantiate the appropriate BlockMMA and Loader</span></div>
+<div class="line"><a id="l01110" name="l01110"></a><span class="lineno"> 1110</span>  <span class="keyword">using </span>mma_t = mlx::steel::</div>
+<div class="line"><a id="l01111" name="l01111"></a><span class="lineno"> 1111</span>      BlockMMA&lt;T, T, BM, BN, BK, WM, WN, false, false, BK_padded, BN_padded&gt;;</div>
+<div class="line"><a id="l01112" name="l01112"></a><span class="lineno"> 1112</span>  <span class="keyword">using </span>loader_x_t = mlx::steel::</div>
+<div class="line"><a id="l01113" name="l01113"></a><span class="lineno"> 1113</span>      BlockLoader&lt;T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE, 1, 4&gt;;</div>
+<div class="line"><a id="l01114" name="l01114"></a><span class="lineno"> 1114</span>  <span class="keyword">using </span>loader_w_t = <a class="code hl_struct" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt;</div>
+<div class="line"><a id="l01115" name="l01115"></a><span class="lineno"> 1115</span>      T,</div>
+<div class="line"><a id="l01116" name="l01116"></a><span class="lineno"> 1116</span>      BK,</div>
+<div class="line"><a id="l01117" name="l01117"></a><span class="lineno"> 1117</span>      BN,</div>
+<div class="line"><a id="l01118" name="l01118"></a><span class="lineno"> 1118</span>      BN_padded,</div>
+<div class="line"><a id="l01119" name="l01119"></a><span class="lineno"> 1119</span>      0,</div>
+<div class="line"><a id="l01120" name="l01120"></a><span class="lineno"> 1120</span>      WM * WN * <a class="code hl_variable" href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a>,</div>
+<div class="line"><a id="l01121" name="l01121"></a><span class="lineno"> 1121</span>      group_size,</div>
+<div class="line"><a id="l01122" name="l01122"></a><span class="lineno"> 1122</span>      bits&gt;;</div>
+<div class="line"><a id="l01123" name="l01123"></a><span class="lineno"> 1123</span> </div>
+<div class="line"><a id="l01124" name="l01124"></a><span class="lineno"> 1124</span>  <span class="keyword">auto</span> wl = (<span class="keyword">const</span> device uint8_t*)w;</div>
+<div class="line"><a id="l01125" name="l01125"></a><span class="lineno"> 1125</span> </div>
+<div class="line"><a id="l01126" name="l01126"></a><span class="lineno"> 1126</span>  <span class="comment">// Set the block</span></div>
+<div class="line"><a id="l01127" name="l01127"></a><span class="lineno"> 1127</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_row = tid.y * BM;</div>
+<div class="line"><a id="l01128" name="l01128"></a><span class="lineno"> 1128</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> y_col = tid.x * BN;</div>
+<div class="line"><a id="l01129" name="l01129"></a><span class="lineno"> 1129</span>  x += y_row * K;</div>
+<div class="line"><a id="l01130" name="l01130"></a><span class="lineno"> 1130</span>  wl += y_col * bytes_per_pack / pack_factor;</div>
+<div class="line"><a id="l01131" name="l01131"></a><span class="lineno"> 1131</span>  scales += y_col / group_size;</div>
+<div class="line"><a id="l01132" name="l01132"></a><span class="lineno"> 1132</span>  biases += y_col / group_size;</div>
+<div class="line"><a id="l01133" name="l01133"></a><span class="lineno"> 1133</span>  y += y_row * N + y_col;</div>
+<div class="line"><a id="l01134" name="l01134"></a><span class="lineno"> 1134</span> </div>
+<div class="line"><a id="l01135" name="l01135"></a><span class="lineno"> 1135</span>  <span class="comment">// Make the x loader and mma operation</span></div>
+<div class="line"><a id="l01136" name="l01136"></a><span class="lineno"> 1136</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> num_els = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BM, M - y_row);</div>
+<div class="line"><a id="l01137" name="l01137"></a><span class="lineno"> 1137</span>  loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01138" name="l01138"></a><span class="lineno"> 1138</span>  loader_w_t loader_w(wl, scales, biases, N, Ws, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01139" name="l01139"></a><span class="lineno"> 1139</span>  mma_t mma_op(simd_gid, simd_lid);</div>
+<div class="line"><a id="l01140" name="l01140"></a><span class="lineno"> 1140</span> </div>
+<div class="line"><a id="l01141" name="l01141"></a><span class="lineno"> 1141</span>  <span class="keywordflow">if</span> (num_els &lt; BM) {</div>
+<div class="line"><a id="l01142" name="l01142"></a><span class="lineno"> 1142</span>    <span class="keywordflow">if</span> ((K % BK) != 0) {</div>
+<div class="line"><a id="l01143" name="l01143"></a><span class="lineno"> 1143</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> k_blocks = K / BK;</div>
+<div class="line"><a id="l01144" name="l01144"></a><span class="lineno"> 1144</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; k_blocks; k++) {</div>
+<div class="line"><a id="l01145" name="l01145"></a><span class="lineno"> 1145</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01146" name="l01146"></a><span class="lineno"> 1146</span>        loader_x.load_safe(short2(BK, num_els));</div>
+<div class="line"><a id="l01147" name="l01147"></a><span class="lineno"> 1147</span>        loader_w.load_unsafe();</div>
+<div class="line"><a id="l01148" name="l01148"></a><span class="lineno"> 1148</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01149" name="l01149"></a><span class="lineno"> 1149</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01150" name="l01150"></a><span class="lineno"> 1150</span>        loader_x.next();</div>
+<div class="line"><a id="l01151" name="l01151"></a><span class="lineno"> 1151</span>        loader_w.next();</div>
+<div class="line"><a id="l01152" name="l01152"></a><span class="lineno"> 1152</span>      }</div>
+<div class="line"><a id="l01153" name="l01153"></a><span class="lineno"> 1153</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> num_k = K - k_blocks * BK;</div>
+<div class="line"><a id="l01154" name="l01154"></a><span class="lineno"> 1154</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01155" name="l01155"></a><span class="lineno"> 1155</span>      loader_x.load_safe(short2(num_k, num_els));</div>
+<div class="line"><a id="l01156" name="l01156"></a><span class="lineno"> 1156</span>      loader_w.load_safe(short2(BN, num_k));</div>
+<div class="line"><a id="l01157" name="l01157"></a><span class="lineno"> 1157</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01158" name="l01158"></a><span class="lineno"> 1158</span>      mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01159" name="l01159"></a><span class="lineno"> 1159</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01160" name="l01160"></a><span class="lineno"> 1160</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
+<div class="line"><a id="l01161" name="l01161"></a><span class="lineno"> 1161</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01162" name="l01162"></a><span class="lineno"> 1162</span>        loader_x.load_safe(short2(BK, num_els));</div>
+<div class="line"><a id="l01163" name="l01163"></a><span class="lineno"> 1163</span>        loader_w.load_unsafe();</div>
+<div class="line"><a id="l01164" name="l01164"></a><span class="lineno"> 1164</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01165" name="l01165"></a><span class="lineno"> 1165</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01166" name="l01166"></a><span class="lineno"> 1166</span>        loader_x.next();</div>
+<div class="line"><a id="l01167" name="l01167"></a><span class="lineno"> 1167</span>        loader_w.next();</div>
+<div class="line"><a id="l01168" name="l01168"></a><span class="lineno"> 1168</span>      }</div>
+<div class="line"><a id="l01169" name="l01169"></a><span class="lineno"> 1169</span>    }</div>
+<div class="line"><a id="l01170" name="l01170"></a><span class="lineno"> 1170</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01171" name="l01171"></a><span class="lineno"> 1171</span>    <span class="keywordflow">if</span> ((K % BK) != 0) {</div>
+<div class="line"><a id="l01172" name="l01172"></a><span class="lineno"> 1172</span>      <span class="keyword">const</span> <span class="keywordtype">int</span> k_blocks = K / BK;</div>
+<div class="line"><a id="l01173" name="l01173"></a><span class="lineno"> 1173</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; k_blocks; k++) {</div>
+<div class="line"><a id="l01174" name="l01174"></a><span class="lineno"> 1174</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01175" name="l01175"></a><span class="lineno"> 1175</span>        loader_x.load_unsafe();</div>
+<div class="line"><a id="l01176" name="l01176"></a><span class="lineno"> 1176</span>        loader_w.load_unsafe();</div>
+<div class="line"><a id="l01177" name="l01177"></a><span class="lineno"> 1177</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01178" name="l01178"></a><span class="lineno"> 1178</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01179" name="l01179"></a><span class="lineno"> 1179</span>        loader_x.next();</div>
+<div class="line"><a id="l01180" name="l01180"></a><span class="lineno"> 1180</span>        loader_w.next();</div>
+<div class="line"><a id="l01181" name="l01181"></a><span class="lineno"> 1181</span>      }</div>
+<div class="line"><a id="l01182" name="l01182"></a><span class="lineno"> 1182</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> num_k = K - k_blocks * BK;</div>
+<div class="line"><a id="l01183" name="l01183"></a><span class="lineno"> 1183</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01184" name="l01184"></a><span class="lineno"> 1184</span>      loader_x.load_safe(short2(num_k, BM));</div>
+<div class="line"><a id="l01185" name="l01185"></a><span class="lineno"> 1185</span>      loader_w.load_safe(short2(BN, num_k));</div>
+<div class="line"><a id="l01186" name="l01186"></a><span class="lineno"> 1186</span>      threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01187" name="l01187"></a><span class="lineno"> 1187</span>      mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01188" name="l01188"></a><span class="lineno"> 1188</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01189" name="l01189"></a><span class="lineno"> 1189</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> k = 0; k &lt; K; k += BK) {</div>
+<div class="line"><a id="l01190" name="l01190"></a><span class="lineno"> 1190</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01191" name="l01191"></a><span class="lineno"> 1191</span>        loader_x.load_unsafe();</div>
+<div class="line"><a id="l01192" name="l01192"></a><span class="lineno"> 1192</span>        loader_w.load_unsafe();</div>
+<div class="line"><a id="l01193" name="l01193"></a><span class="lineno"> 1193</span>        threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01194" name="l01194"></a><span class="lineno"> 1194</span>        mma_op.mma(Xs, Ws);</div>
+<div class="line"><a id="l01195" name="l01195"></a><span class="lineno"> 1195</span>        loader_x.next();</div>
+<div class="line"><a id="l01196" name="l01196"></a><span class="lineno"> 1196</span>        loader_w.next();</div>
+<div class="line"><a id="l01197" name="l01197"></a><span class="lineno"> 1197</span>      }</div>
+<div class="line"><a id="l01198" name="l01198"></a><span class="lineno"> 1198</span>    }</div>
+<div class="line"><a id="l01199" name="l01199"></a><span class="lineno"> 1199</span>  }</div>
+<div class="line"><a id="l01200" name="l01200"></a><span class="lineno"> 1200</span> </div>
+<div class="line"><a id="l01201" name="l01201"></a><span class="lineno"> 1201</span>  <span class="comment">// Store results to device memory</span></div>
+<div class="line"><a id="l01202" name="l01202"></a><span class="lineno"> 1202</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l01203" name="l01203"></a><span class="lineno"> 1203</span>  <span class="keywordflow">if</span> (num_els &lt; BM) {</div>
+<div class="line"><a id="l01204" name="l01204"></a><span class="lineno"> 1204</span>    mma_op.store_result_safe(y, N, short2(BN, num_els));</div>
+<div class="line"><a id="l01205" name="l01205"></a><span class="lineno"> 1205</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01206" name="l01206"></a><span class="lineno"> 1206</span>    mma_op.store_result(y, N);</div>
+<div class="line"><a id="l01207" name="l01207"></a><span class="lineno"> 1207</span>  }</div>
+<div class="line"><a id="l01208" name="l01208"></a><span class="lineno"> 1208</span>}</div>
 </div>
-<div class="line"><a id="l01096" name="l01096"></a><span class="lineno"> 1096</span> </div>
-<div class="line"><a id="l01097" name="l01097"></a><span class="lineno"> 1097</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">int</span> D, <span class="keywordtype">bool</span> batched&gt;</div>
-<div class="foldopen" id="foldopen01098" data-start="{" data-end="}">
-<div class="line"><a id="l01098" name="l01098"></a><span class="lineno"><a class="line" href="quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad"> 1098</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad">qmv_quad</a>(</div>
-<div class="line"><a id="l01099" name="l01099"></a><span class="lineno"> 1099</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01100" name="l01100"></a><span class="lineno"> 1100</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01101" name="l01101"></a><span class="lineno"> 1101</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01102" name="l01102"></a><span class="lineno"> 1102</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01103" name="l01103"></a><span class="lineno"> 1103</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01104" name="l01104"></a><span class="lineno"> 1104</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01105" name="l01105"></a><span class="lineno"> 1105</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01106" name="l01106"></a><span class="lineno"> 1106</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01107" name="l01107"></a><span class="lineno"> 1107</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01108" name="l01108"></a><span class="lineno"> 1108</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01109" name="l01109"></a><span class="lineno"> 1109</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01110" name="l01110"></a><span class="lineno"> 1110</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01111" name="l01111"></a><span class="lineno"> 1111</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01112" name="l01112"></a><span class="lineno"> 1112</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01113" name="l01113"></a><span class="lineno"> 1113</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01114" name="l01114"></a><span class="lineno"> 1114</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01115" name="l01115"></a><span class="lineno"> 1115</span>    uint quad_gid [[quadgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01116" name="l01116"></a><span class="lineno"> 1116</span>    uint quad_lid [[thread_index_in_quadgroup]]) {</div>
-<div class="line"><a id="l01117" name="l01117"></a><span class="lineno"> 1117</span>  <span class="keywordflow">if</span> (batched) {</div>
-<div class="line"><a id="l01118" name="l01118"></a><span class="lineno"> 1118</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01119" name="l01119"></a><span class="lineno"> 1119</span>        x,</div>
-<div class="line"><a id="l01120" name="l01120"></a><span class="lineno"> 1120</span>        w,</div>
-<div class="line"><a id="l01121" name="l01121"></a><span class="lineno"> 1121</span>        scales,</div>
-<div class="line"><a id="l01122" name="l01122"></a><span class="lineno"> 1122</span>        biases,</div>
-<div class="line"><a id="l01123" name="l01123"></a><span class="lineno"> 1123</span>        y,</div>
-<div class="line"><a id="l01124" name="l01124"></a><span class="lineno"> 1124</span>        out_vec_size,</div>
-<div class="line"><a id="l01125" name="l01125"></a><span class="lineno"> 1125</span>        x_batch_ndims,</div>
-<div class="line"><a id="l01126" name="l01126"></a><span class="lineno"> 1126</span>        x_shape,</div>
-<div class="line"><a id="l01127" name="l01127"></a><span class="lineno"> 1127</span>        x_strides,</div>
-<div class="line"><a id="l01128" name="l01128"></a><span class="lineno"> 1128</span>        w_batch_ndims,</div>
-<div class="line"><a id="l01129" name="l01129"></a><span class="lineno"> 1129</span>        w_shape,</div>
-<div class="line"><a id="l01130" name="l01130"></a><span class="lineno"> 1130</span>        w_strides,</div>
-<div class="line"><a id="l01131" name="l01131"></a><span class="lineno"> 1131</span>        s_strides,</div>
-<div class="line"><a id="l01132" name="l01132"></a><span class="lineno"> 1132</span>        b_strides,</div>
-<div class="line"><a id="l01133" name="l01133"></a><span class="lineno"> 1133</span>        tid);</div>
-<div class="line"><a id="l01134" name="l01134"></a><span class="lineno"> 1134</span>  }</div>
-<div class="line"><a id="l01135" name="l01135"></a><span class="lineno"> 1135</span>  <a class="code hl_function" href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">qmv_quad_impl&lt;T, group_size, bits, D&gt;</a>(</div>
-<div class="line"><a id="l01136" name="l01136"></a><span class="lineno"> 1136</span>      w,</div>
-<div class="line"><a id="l01137" name="l01137"></a><span class="lineno"> 1137</span>      scales,</div>
-<div class="line"><a id="l01138" name="l01138"></a><span class="lineno"> 1138</span>      biases,</div>
-<div class="line"><a id="l01139" name="l01139"></a><span class="lineno"> 1139</span>      x,</div>
-<div class="line"><a id="l01140" name="l01140"></a><span class="lineno"> 1140</span>      y,</div>
-<div class="line"><a id="l01141" name="l01141"></a><span class="lineno"> 1141</span>      in_vec_size,</div>
-<div class="line"><a id="l01142" name="l01142"></a><span class="lineno"> 1142</span>      out_vec_size,</div>
-<div class="line"><a id="l01143" name="l01143"></a><span class="lineno"> 1143</span>      tid,</div>
-<div class="line"><a id="l01144" name="l01144"></a><span class="lineno"> 1144</span>      quad_gid,</div>
-<div class="line"><a id="l01145" name="l01145"></a><span class="lineno"> 1145</span>      quad_lid);</div>
-<div class="line"><a id="l01146" name="l01146"></a><span class="lineno"> 1146</span>}</div>
+<div class="line"><a id="l01209" name="l01209"></a><span class="lineno"> 1209</span> </div>
+<div class="line"><a id="l01210" name="l01210"></a><span class="lineno"> 1210</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen01211" data-start="{" data-end="}">
+<div class="line"><a id="l01211" name="l01211"></a><span class="lineno"><a class="line" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be"> 1211</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets</a>(</div>
+<div class="line"><a id="l01212" name="l01212"></a><span class="lineno"> 1212</span>    <span class="keyword">const</span> device T*&amp; x,</div>
+<div class="line"><a id="l01213" name="l01213"></a><span class="lineno"> 1213</span>    <span class="keyword">const</span> device uint32_t*&amp; w,</div>
+<div class="line"><a id="l01214" name="l01214"></a><span class="lineno"> 1214</span>    <span class="keyword">const</span> device T*&amp; scales,</div>
+<div class="line"><a id="l01215" name="l01215"></a><span class="lineno"> 1215</span>    <span class="keyword">const</span> device T*&amp; biases,</div>
+<div class="line"><a id="l01216" name="l01216"></a><span class="lineno"> 1216</span>    device T*&amp; y,</div>
+<div class="line"><a id="l01217" name="l01217"></a><span class="lineno"> 1217</span>    <span class="keywordtype">int</span> output_stride,</div>
+<div class="line"><a id="l01218" name="l01218"></a><span class="lineno"> 1218</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims,</div>
+<div class="line"><a id="l01219" name="l01219"></a><span class="lineno"> 1219</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape,</div>
+<div class="line"><a id="l01220" name="l01220"></a><span class="lineno"> 1220</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides,</div>
+<div class="line"><a id="l01221" name="l01221"></a><span class="lineno"> 1221</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims,</div>
+<div class="line"><a id="l01222" name="l01222"></a><span class="lineno"> 1222</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape,</div>
+<div class="line"><a id="l01223" name="l01223"></a><span class="lineno"> 1223</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides,</div>
+<div class="line"><a id="l01224" name="l01224"></a><span class="lineno"> 1224</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides,</div>
+<div class="line"><a id="l01225" name="l01225"></a><span class="lineno"> 1225</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides,</div>
+<div class="line"><a id="l01226" name="l01226"></a><span class="lineno"> 1226</span>    uint3 tid [[threadgroup_position_in_grid]]) {</div>
+<div class="line"><a id="l01227" name="l01227"></a><span class="lineno"> 1227</span>  <span class="comment">// Set the input/output matrices</span></div>
+<div class="line"><a id="l01228" name="l01228"></a><span class="lineno"> 1228</span>  uint32_t x_idx = tid.z;</div>
+<div class="line"><a id="l01229" name="l01229"></a><span class="lineno"> 1229</span>  uint32_t w_idx = tid.z;</div>
+<div class="line"><a id="l01230" name="l01230"></a><span class="lineno"> 1230</span>  <span class="keywordflow">if</span> (x_batch_ndims == 1) {</div>
+<div class="line"><a id="l01231" name="l01231"></a><span class="lineno"> 1231</span>    x += x_idx * x_strides[0];</div>
+<div class="line"><a id="l01232" name="l01232"></a><span class="lineno"> 1232</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01233" name="l01233"></a><span class="lineno"> 1233</span>    x += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(x_idx, x_shape, x_strides, x_batch_ndims);</div>
+<div class="line"><a id="l01234" name="l01234"></a><span class="lineno"> 1234</span>  }</div>
+<div class="line"><a id="l01235" name="l01235"></a><span class="lineno"> 1235</span>  <span class="keywordflow">if</span> (w_batch_ndims == 1) {</div>
+<div class="line"><a id="l01236" name="l01236"></a><span class="lineno"> 1236</span>    w += w_idx * w_strides[0];</div>
+<div class="line"><a id="l01237" name="l01237"></a><span class="lineno"> 1237</span>    scales += w_idx * s_strides[0];</div>
+<div class="line"><a id="l01238" name="l01238"></a><span class="lineno"> 1238</span>    biases += w_idx * b_strides[0];</div>
+<div class="line"><a id="l01239" name="l01239"></a><span class="lineno"> 1239</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01240" name="l01240"></a><span class="lineno"> 1240</span>    ulong3 idx = <a class="code hl_function" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a>(</div>
+<div class="line"><a id="l01241" name="l01241"></a><span class="lineno"> 1241</span>        w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);</div>
+<div class="line"><a id="l01242" name="l01242"></a><span class="lineno"> 1242</span>    w += idx.x;</div>
+<div class="line"><a id="l01243" name="l01243"></a><span class="lineno"> 1243</span>    scales += idx.y;</div>
+<div class="line"><a id="l01244" name="l01244"></a><span class="lineno"> 1244</span>    biases += idx.z;</div>
+<div class="line"><a id="l01245" name="l01245"></a><span class="lineno"> 1245</span>  }</div>
+<div class="line"><a id="l01246" name="l01246"></a><span class="lineno"> 1246</span>  y += tid.z * output_stride;</div>
+<div class="line"><a id="l01247" name="l01247"></a><span class="lineno"> 1247</span>}</div>
 </div>
-<div class="line"><a id="l01147" name="l01147"></a><span class="lineno"> 1147</span> </div>
-<div class="line"><a id="l01148" name="l01148"></a><span class="lineno"> 1148</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> batched&gt;</div>
-<div class="foldopen" id="foldopen01149" data-start="{" data-end="}">
-<div class="line"><a id="l01149" name="l01149"></a><span class="lineno"><a class="line" href="quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f"> 1149</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f">qmv_fast</a>(</div>
-<div class="line"><a id="l01150" name="l01150"></a><span class="lineno"> 1150</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01151" name="l01151"></a><span class="lineno"> 1151</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01152" name="l01152"></a><span class="lineno"> 1152</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01153" name="l01153"></a><span class="lineno"> 1153</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01154" name="l01154"></a><span class="lineno"> 1154</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01155" name="l01155"></a><span class="lineno"> 1155</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01156" name="l01156"></a><span class="lineno"> 1156</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01157" name="l01157"></a><span class="lineno"> 1157</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01158" name="l01158"></a><span class="lineno"> 1158</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01159" name="l01159"></a><span class="lineno"> 1159</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01160" name="l01160"></a><span class="lineno"> 1160</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01161" name="l01161"></a><span class="lineno"> 1161</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01162" name="l01162"></a><span class="lineno"> 1162</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01163" name="l01163"></a><span class="lineno"> 1163</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01164" name="l01164"></a><span class="lineno"> 1164</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01165" name="l01165"></a><span class="lineno"> 1165</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01166" name="l01166"></a><span class="lineno"> 1166</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01167" name="l01167"></a><span class="lineno"> 1167</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01168" name="l01168"></a><span class="lineno"> 1168</span>  <span class="keywordflow">if</span> (batched) {</div>
-<div class="line"><a id="l01169" name="l01169"></a><span class="lineno"> 1169</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01170" name="l01170"></a><span class="lineno"> 1170</span>        x,</div>
-<div class="line"><a id="l01171" name="l01171"></a><span class="lineno"> 1171</span>        w,</div>
-<div class="line"><a id="l01172" name="l01172"></a><span class="lineno"> 1172</span>        scales,</div>
-<div class="line"><a id="l01173" name="l01173"></a><span class="lineno"> 1173</span>        biases,</div>
-<div class="line"><a id="l01174" name="l01174"></a><span class="lineno"> 1174</span>        y,</div>
-<div class="line"><a id="l01175" name="l01175"></a><span class="lineno"> 1175</span>        out_vec_size,</div>
-<div class="line"><a id="l01176" name="l01176"></a><span class="lineno"> 1176</span>        x_batch_ndims,</div>
-<div class="line"><a id="l01177" name="l01177"></a><span class="lineno"> 1177</span>        x_shape,</div>
-<div class="line"><a id="l01178" name="l01178"></a><span class="lineno"> 1178</span>        x_strides,</div>
-<div class="line"><a id="l01179" name="l01179"></a><span class="lineno"> 1179</span>        w_batch_ndims,</div>
-<div class="line"><a id="l01180" name="l01180"></a><span class="lineno"> 1180</span>        w_shape,</div>
-<div class="line"><a id="l01181" name="l01181"></a><span class="lineno"> 1181</span>        w_strides,</div>
-<div class="line"><a id="l01182" name="l01182"></a><span class="lineno"> 1182</span>        s_strides,</div>
-<div class="line"><a id="l01183" name="l01183"></a><span class="lineno"> 1183</span>        b_strides,</div>
-<div class="line"><a id="l01184" name="l01184"></a><span class="lineno"> 1184</span>        tid);</div>
-<div class="line"><a id="l01185" name="l01185"></a><span class="lineno"> 1185</span>  }</div>
-<div class="line"><a id="l01186" name="l01186"></a><span class="lineno"> 1186</span>  <a class="code hl_function" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl&lt;T, group_size, bits&gt;</a>(</div>
-<div class="line"><a id="l01187" name="l01187"></a><span class="lineno"> 1187</span>      w,</div>
-<div class="line"><a id="l01188" name="l01188"></a><span class="lineno"> 1188</span>      scales,</div>
-<div class="line"><a id="l01189" name="l01189"></a><span class="lineno"> 1189</span>      biases,</div>
-<div class="line"><a id="l01190" name="l01190"></a><span class="lineno"> 1190</span>      x,</div>
-<div class="line"><a id="l01191" name="l01191"></a><span class="lineno"> 1191</span>      y,</div>
-<div class="line"><a id="l01192" name="l01192"></a><span class="lineno"> 1192</span>      in_vec_size,</div>
-<div class="line"><a id="l01193" name="l01193"></a><span class="lineno"> 1193</span>      out_vec_size,</div>
-<div class="line"><a id="l01194" name="l01194"></a><span class="lineno"> 1194</span>      tid,</div>
-<div class="line"><a id="l01195" name="l01195"></a><span class="lineno"> 1195</span>      simd_gid,</div>
-<div class="line"><a id="l01196" name="l01196"></a><span class="lineno"> 1196</span>      simd_lid);</div>
-<div class="line"><a id="l01197" name="l01197"></a><span class="lineno"> 1197</span>}</div>
+<div class="line"><a id="l01248" name="l01248"></a><span class="lineno"> 1248</span> </div>
+<div class="line"><a id="l01249" name="l01249"></a><span class="lineno"> 1249</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen01250" data-start="{" data-end="}">
+<div class="line"><a id="l01250" name="l01250"></a><span class="lineno"><a class="line" href="quantized_8h.html#a3ab400746ad77be89c30d25638e01698"> 1250</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets</a>(</div>
+<div class="line"><a id="l01251" name="l01251"></a><span class="lineno"> 1251</span>    <span class="keyword">const</span> device T*&amp; x,</div>
+<div class="line"><a id="l01252" name="l01252"></a><span class="lineno"> 1252</span>    <span class="keyword">const</span> device uint32_t*&amp; w,</div>
+<div class="line"><a id="l01253" name="l01253"></a><span class="lineno"> 1253</span>    <span class="keyword">const</span> device T*&amp; scales,</div>
+<div class="line"><a id="l01254" name="l01254"></a><span class="lineno"> 1254</span>    <span class="keyword">const</span> device T*&amp; biases,</div>
+<div class="line"><a id="l01255" name="l01255"></a><span class="lineno"> 1255</span>    <span class="keyword">const</span> device uint32_t* lhs_indices,</div>
+<div class="line"><a id="l01256" name="l01256"></a><span class="lineno"> 1256</span>    <span class="keyword">const</span> device uint32_t* rhs_indices,</div>
+<div class="line"><a id="l01257" name="l01257"></a><span class="lineno"> 1257</span>    device T*&amp; y,</div>
+<div class="line"><a id="l01258" name="l01258"></a><span class="lineno"> 1258</span>    <span class="keywordtype">int</span> output_stride,</div>
+<div class="line"><a id="l01259" name="l01259"></a><span class="lineno"> 1259</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims,</div>
+<div class="line"><a id="l01260" name="l01260"></a><span class="lineno"> 1260</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape,</div>
+<div class="line"><a id="l01261" name="l01261"></a><span class="lineno"> 1261</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides,</div>
+<div class="line"><a id="l01262" name="l01262"></a><span class="lineno"> 1262</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides,</div>
+<div class="line"><a id="l01263" name="l01263"></a><span class="lineno"> 1263</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims,</div>
+<div class="line"><a id="l01264" name="l01264"></a><span class="lineno"> 1264</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape,</div>
+<div class="line"><a id="l01265" name="l01265"></a><span class="lineno"> 1265</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides,</div>
+<div class="line"><a id="l01266" name="l01266"></a><span class="lineno"> 1266</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims,</div>
+<div class="line"><a id="l01267" name="l01267"></a><span class="lineno"> 1267</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape,</div>
+<div class="line"><a id="l01268" name="l01268"></a><span class="lineno"> 1268</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides,</div>
+<div class="line"><a id="l01269" name="l01269"></a><span class="lineno"> 1269</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides,</div>
+<div class="line"><a id="l01270" name="l01270"></a><span class="lineno"> 1270</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides,</div>
+<div class="line"><a id="l01271" name="l01271"></a><span class="lineno"> 1271</span>    uint3 tid [[threadgroup_position_in_grid]]) {</div>
+<div class="line"><a id="l01272" name="l01272"></a><span class="lineno"> 1272</span>  <span class="comment">// Set the input/output matrices</span></div>
+<div class="line"><a id="l01273" name="l01273"></a><span class="lineno"> 1273</span>  uint32_t x_idx;</div>
+<div class="line"><a id="l01274" name="l01274"></a><span class="lineno"> 1274</span>  uint32_t w_idx;</div>
+<div class="line"><a id="l01275" name="l01275"></a><span class="lineno"> 1275</span>  <span class="keywordflow">if</span> (batch_ndims == 1) {</div>
+<div class="line"><a id="l01276" name="l01276"></a><span class="lineno"> 1276</span>    x_idx = lhs_indices[tid.z * lhs_strides[0]];</div>
+<div class="line"><a id="l01277" name="l01277"></a><span class="lineno"> 1277</span>    w_idx = rhs_indices[tid.z * rhs_strides[0]];</div>
+<div class="line"><a id="l01278" name="l01278"></a><span class="lineno"> 1278</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01279" name="l01279"></a><span class="lineno"> 1279</span>    ulong2 idx = <a class="code hl_function" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a>(</div>
+<div class="line"><a id="l01280" name="l01280"></a><span class="lineno"> 1280</span>        tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);</div>
+<div class="line"><a id="l01281" name="l01281"></a><span class="lineno"> 1281</span>    x_idx = lhs_indices[idx.x];</div>
+<div class="line"><a id="l01282" name="l01282"></a><span class="lineno"> 1282</span>    w_idx = rhs_indices[idx.y];</div>
+<div class="line"><a id="l01283" name="l01283"></a><span class="lineno"> 1283</span>  }</div>
+<div class="line"><a id="l01284" name="l01284"></a><span class="lineno"> 1284</span>  <span class="keywordflow">if</span> (x_batch_ndims == 1) {</div>
+<div class="line"><a id="l01285" name="l01285"></a><span class="lineno"> 1285</span>    x += x_idx * x_strides[0];</div>
+<div class="line"><a id="l01286" name="l01286"></a><span class="lineno"> 1286</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01287" name="l01287"></a><span class="lineno"> 1287</span>    x += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(x_idx, x_shape, x_strides, x_batch_ndims);</div>
+<div class="line"><a id="l01288" name="l01288"></a><span class="lineno"> 1288</span>  }</div>
+<div class="line"><a id="l01289" name="l01289"></a><span class="lineno"> 1289</span>  <span class="keywordflow">if</span> (w_batch_ndims == 1) {</div>
+<div class="line"><a id="l01290" name="l01290"></a><span class="lineno"> 1290</span>    w += w_idx * w_strides[0];</div>
+<div class="line"><a id="l01291" name="l01291"></a><span class="lineno"> 1291</span>    scales += w_idx * s_strides[0];</div>
+<div class="line"><a id="l01292" name="l01292"></a><span class="lineno"> 1292</span>    biases += w_idx * b_strides[0];</div>
+<div class="line"><a id="l01293" name="l01293"></a><span class="lineno"> 1293</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l01294" name="l01294"></a><span class="lineno"> 1294</span>    ulong3 idx = <a class="code hl_function" href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a>(</div>
+<div class="line"><a id="l01295" name="l01295"></a><span class="lineno"> 1295</span>        w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);</div>
+<div class="line"><a id="l01296" name="l01296"></a><span class="lineno"> 1296</span>    w += idx.x;</div>
+<div class="line"><a id="l01297" name="l01297"></a><span class="lineno"> 1297</span>    scales += idx.y;</div>
+<div class="line"><a id="l01298" name="l01298"></a><span class="lineno"> 1298</span>    biases += idx.z;</div>
+<div class="line"><a id="l01299" name="l01299"></a><span class="lineno"> 1299</span>  }</div>
+<div class="line"><a id="l01300" name="l01300"></a><span class="lineno"> 1300</span>  y += tid.z * output_stride;</div>
+<div class="line"><a id="l01301" name="l01301"></a><span class="lineno"> 1301</span>}</div>
 </div>
-<div class="line"><a id="l01198" name="l01198"></a><span class="lineno"> 1198</span> </div>
-<div class="line"><a id="l01199" name="l01199"></a><span class="lineno"> 1199</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> batched&gt;</div>
-<div class="foldopen" id="foldopen01200" data-start="{" data-end="}">
-<div class="line"><a id="l01200" name="l01200"></a><span class="lineno"><a class="line" href="quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd"> 1200</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd">qmv</a>(</div>
-<div class="line"><a id="l01201" name="l01201"></a><span class="lineno"> 1201</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01202" name="l01202"></a><span class="lineno"> 1202</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01203" name="l01203"></a><span class="lineno"> 1203</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01204" name="l01204"></a><span class="lineno"> 1204</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01205" name="l01205"></a><span class="lineno"> 1205</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01206" name="l01206"></a><span class="lineno"> 1206</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01207" name="l01207"></a><span class="lineno"> 1207</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01208" name="l01208"></a><span class="lineno"> 1208</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01209" name="l01209"></a><span class="lineno"> 1209</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01210" name="l01210"></a><span class="lineno"> 1210</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01211" name="l01211"></a><span class="lineno"> 1211</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01212" name="l01212"></a><span class="lineno"> 1212</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01213" name="l01213"></a><span class="lineno"> 1213</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01214" name="l01214"></a><span class="lineno"> 1214</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01215" name="l01215"></a><span class="lineno"> 1215</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01216" name="l01216"></a><span class="lineno"> 1216</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01217" name="l01217"></a><span class="lineno"> 1217</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01218" name="l01218"></a><span class="lineno"> 1218</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01219" name="l01219"></a><span class="lineno"> 1219</span>  <span class="keywordflow">if</span> (batched) {</div>
-<div class="line"><a id="l01220" name="l01220"></a><span class="lineno"> 1220</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01221" name="l01221"></a><span class="lineno"> 1221</span>        x,</div>
-<div class="line"><a id="l01222" name="l01222"></a><span class="lineno"> 1222</span>        w,</div>
-<div class="line"><a id="l01223" name="l01223"></a><span class="lineno"> 1223</span>        scales,</div>
-<div class="line"><a id="l01224" name="l01224"></a><span class="lineno"> 1224</span>        biases,</div>
-<div class="line"><a id="l01225" name="l01225"></a><span class="lineno"> 1225</span>        y,</div>
-<div class="line"><a id="l01226" name="l01226"></a><span class="lineno"> 1226</span>        out_vec_size,</div>
-<div class="line"><a id="l01227" name="l01227"></a><span class="lineno"> 1227</span>        x_batch_ndims,</div>
-<div class="line"><a id="l01228" name="l01228"></a><span class="lineno"> 1228</span>        x_shape,</div>
-<div class="line"><a id="l01229" name="l01229"></a><span class="lineno"> 1229</span>        x_strides,</div>
-<div class="line"><a id="l01230" name="l01230"></a><span class="lineno"> 1230</span>        w_batch_ndims,</div>
-<div class="line"><a id="l01231" name="l01231"></a><span class="lineno"> 1231</span>        w_shape,</div>
-<div class="line"><a id="l01232" name="l01232"></a><span class="lineno"> 1232</span>        w_strides,</div>
-<div class="line"><a id="l01233" name="l01233"></a><span class="lineno"> 1233</span>        s_strides,</div>
-<div class="line"><a id="l01234" name="l01234"></a><span class="lineno"> 1234</span>        b_strides,</div>
-<div class="line"><a id="l01235" name="l01235"></a><span class="lineno"> 1235</span>        tid);</div>
-<div class="line"><a id="l01236" name="l01236"></a><span class="lineno"> 1236</span>  }</div>
-<div class="line"><a id="l01237" name="l01237"></a><span class="lineno"> 1237</span>  <a class="code hl_function" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl&lt;T, group_size, bits&gt;</a>(</div>
-<div class="line"><a id="l01238" name="l01238"></a><span class="lineno"> 1238</span>      w,</div>
-<div class="line"><a id="l01239" name="l01239"></a><span class="lineno"> 1239</span>      scales,</div>
-<div class="line"><a id="l01240" name="l01240"></a><span class="lineno"> 1240</span>      biases,</div>
-<div class="line"><a id="l01241" name="l01241"></a><span class="lineno"> 1241</span>      x,</div>
-<div class="line"><a id="l01242" name="l01242"></a><span class="lineno"> 1242</span>      y,</div>
-<div class="line"><a id="l01243" name="l01243"></a><span class="lineno"> 1243</span>      in_vec_size,</div>
-<div class="line"><a id="l01244" name="l01244"></a><span class="lineno"> 1244</span>      out_vec_size,</div>
-<div class="line"><a id="l01245" name="l01245"></a><span class="lineno"> 1245</span>      tid,</div>
-<div class="line"><a id="l01246" name="l01246"></a><span class="lineno"> 1246</span>      simd_gid,</div>
-<div class="line"><a id="l01247" name="l01247"></a><span class="lineno"> 1247</span>      simd_lid);</div>
-<div class="line"><a id="l01248" name="l01248"></a><span class="lineno"> 1248</span>}</div>
+<div class="line"><a id="l01302" name="l01302"></a><span class="lineno"> 1302</span> </div>
+<div class="line"><a id="l01303" name="l01303"></a><span class="lineno"> 1303</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">int</span> D, <span class="keywordtype">bool</span> batched&gt;</div>
+<div class="foldopen" id="foldopen01304" data-start="{" data-end="}">
+<div class="line"><a id="l01304" name="l01304"></a><span class="lineno"><a class="line" href="quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad"> 1304</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad">qmv_quad</a>(</div>
+<div class="line"><a id="l01305" name="l01305"></a><span class="lineno"> 1305</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01306" name="l01306"></a><span class="lineno"> 1306</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01307" name="l01307"></a><span class="lineno"> 1307</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01308" name="l01308"></a><span class="lineno"> 1308</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01309" name="l01309"></a><span class="lineno"> 1309</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01310" name="l01310"></a><span class="lineno"> 1310</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01311" name="l01311"></a><span class="lineno"> 1311</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01312" name="l01312"></a><span class="lineno"> 1312</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01313" name="l01313"></a><span class="lineno"> 1313</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01314" name="l01314"></a><span class="lineno"> 1314</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01315" name="l01315"></a><span class="lineno"> 1315</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01316" name="l01316"></a><span class="lineno"> 1316</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01317" name="l01317"></a><span class="lineno"> 1317</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01318" name="l01318"></a><span class="lineno"> 1318</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01319" name="l01319"></a><span class="lineno"> 1319</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01320" name="l01320"></a><span class="lineno"> 1320</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01321" name="l01321"></a><span class="lineno"> 1321</span>    uint quad_gid [[quadgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01322" name="l01322"></a><span class="lineno"> 1322</span>    uint quad_lid [[thread_index_in_quadgroup]]) {</div>
+<div class="line"><a id="l01323" name="l01323"></a><span class="lineno"> 1323</span>  <span class="keywordflow">if</span> (batched) {</div>
+<div class="line"><a id="l01324" name="l01324"></a><span class="lineno"> 1324</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01325" name="l01325"></a><span class="lineno"> 1325</span>        x,</div>
+<div class="line"><a id="l01326" name="l01326"></a><span class="lineno"> 1326</span>        w,</div>
+<div class="line"><a id="l01327" name="l01327"></a><span class="lineno"> 1327</span>        scales,</div>
+<div class="line"><a id="l01328" name="l01328"></a><span class="lineno"> 1328</span>        biases,</div>
+<div class="line"><a id="l01329" name="l01329"></a><span class="lineno"> 1329</span>        y,</div>
+<div class="line"><a id="l01330" name="l01330"></a><span class="lineno"> 1330</span>        out_vec_size,</div>
+<div class="line"><a id="l01331" name="l01331"></a><span class="lineno"> 1331</span>        x_batch_ndims,</div>
+<div class="line"><a id="l01332" name="l01332"></a><span class="lineno"> 1332</span>        x_shape,</div>
+<div class="line"><a id="l01333" name="l01333"></a><span class="lineno"> 1333</span>        x_strides,</div>
+<div class="line"><a id="l01334" name="l01334"></a><span class="lineno"> 1334</span>        w_batch_ndims,</div>
+<div class="line"><a id="l01335" name="l01335"></a><span class="lineno"> 1335</span>        w_shape,</div>
+<div class="line"><a id="l01336" name="l01336"></a><span class="lineno"> 1336</span>        w_strides,</div>
+<div class="line"><a id="l01337" name="l01337"></a><span class="lineno"> 1337</span>        s_strides,</div>
+<div class="line"><a id="l01338" name="l01338"></a><span class="lineno"> 1338</span>        b_strides,</div>
+<div class="line"><a id="l01339" name="l01339"></a><span class="lineno"> 1339</span>        tid);</div>
+<div class="line"><a id="l01340" name="l01340"></a><span class="lineno"> 1340</span>  }</div>
+<div class="line"><a id="l01341" name="l01341"></a><span class="lineno"> 1341</span>  <a class="code hl_function" href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">qmv_quad_impl&lt;T, group_size, bits, D&gt;</a>(</div>
+<div class="line"><a id="l01342" name="l01342"></a><span class="lineno"> 1342</span>      w,</div>
+<div class="line"><a id="l01343" name="l01343"></a><span class="lineno"> 1343</span>      scales,</div>
+<div class="line"><a id="l01344" name="l01344"></a><span class="lineno"> 1344</span>      biases,</div>
+<div class="line"><a id="l01345" name="l01345"></a><span class="lineno"> 1345</span>      x,</div>
+<div class="line"><a id="l01346" name="l01346"></a><span class="lineno"> 1346</span>      y,</div>
+<div class="line"><a id="l01347" name="l01347"></a><span class="lineno"> 1347</span>      in_vec_size,</div>
+<div class="line"><a id="l01348" name="l01348"></a><span class="lineno"> 1348</span>      out_vec_size,</div>
+<div class="line"><a id="l01349" name="l01349"></a><span class="lineno"> 1349</span>      tid,</div>
+<div class="line"><a id="l01350" name="l01350"></a><span class="lineno"> 1350</span>      quad_gid,</div>
+<div class="line"><a id="l01351" name="l01351"></a><span class="lineno"> 1351</span>      quad_lid);</div>
+<div class="line"><a id="l01352" name="l01352"></a><span class="lineno"> 1352</span>}</div>
 </div>
-<div class="line"><a id="l01249" name="l01249"></a><span class="lineno"> 1249</span> </div>
-<div class="line"><a id="l01250" name="l01250"></a><span class="lineno"> 1250</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> batched&gt;</div>
-<div class="foldopen" id="foldopen01251" data-start="{" data-end="}">
-<div class="line"><a id="l01251" name="l01251"></a><span class="lineno"><a class="line" href="quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5"> 1251</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5">qvm</a>(</div>
-<div class="line"><a id="l01252" name="l01252"></a><span class="lineno"> 1252</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01253" name="l01253"></a><span class="lineno"> 1253</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01254" name="l01254"></a><span class="lineno"> 1254</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01255" name="l01255"></a><span class="lineno"> 1255</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01256" name="l01256"></a><span class="lineno"> 1256</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01257" name="l01257"></a><span class="lineno"> 1257</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01258" name="l01258"></a><span class="lineno"> 1258</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01259" name="l01259"></a><span class="lineno"> 1259</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01260" name="l01260"></a><span class="lineno"> 1260</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01261" name="l01261"></a><span class="lineno"> 1261</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01262" name="l01262"></a><span class="lineno"> 1262</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01263" name="l01263"></a><span class="lineno"> 1263</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01264" name="l01264"></a><span class="lineno"> 1264</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01265" name="l01265"></a><span class="lineno"> 1265</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01266" name="l01266"></a><span class="lineno"> 1266</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01267" name="l01267"></a><span class="lineno"> 1267</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01268" name="l01268"></a><span class="lineno"> 1268</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01269" name="l01269"></a><span class="lineno"> 1269</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01270" name="l01270"></a><span class="lineno"> 1270</span>  <span class="keywordflow">if</span> (batched) {</div>
-<div class="line"><a id="l01271" name="l01271"></a><span class="lineno"> 1271</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01272" name="l01272"></a><span class="lineno"> 1272</span>        x,</div>
-<div class="line"><a id="l01273" name="l01273"></a><span class="lineno"> 1273</span>        w,</div>
-<div class="line"><a id="l01274" name="l01274"></a><span class="lineno"> 1274</span>        scales,</div>
-<div class="line"><a id="l01275" name="l01275"></a><span class="lineno"> 1275</span>        biases,</div>
-<div class="line"><a id="l01276" name="l01276"></a><span class="lineno"> 1276</span>        y,</div>
-<div class="line"><a id="l01277" name="l01277"></a><span class="lineno"> 1277</span>        out_vec_size,</div>
-<div class="line"><a id="l01278" name="l01278"></a><span class="lineno"> 1278</span>        x_batch_ndims,</div>
-<div class="line"><a id="l01279" name="l01279"></a><span class="lineno"> 1279</span>        x_shape,</div>
-<div class="line"><a id="l01280" name="l01280"></a><span class="lineno"> 1280</span>        x_strides,</div>
-<div class="line"><a id="l01281" name="l01281"></a><span class="lineno"> 1281</span>        w_batch_ndims,</div>
-<div class="line"><a id="l01282" name="l01282"></a><span class="lineno"> 1282</span>        w_shape,</div>
-<div class="line"><a id="l01283" name="l01283"></a><span class="lineno"> 1283</span>        w_strides,</div>
-<div class="line"><a id="l01284" name="l01284"></a><span class="lineno"> 1284</span>        s_strides,</div>
-<div class="line"><a id="l01285" name="l01285"></a><span class="lineno"> 1285</span>        b_strides,</div>
-<div class="line"><a id="l01286" name="l01286"></a><span class="lineno"> 1286</span>        tid);</div>
-<div class="line"><a id="l01287" name="l01287"></a><span class="lineno"> 1287</span>  }</div>
-<div class="line"><a id="l01288" name="l01288"></a><span class="lineno"> 1288</span>  <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl&lt;T, group_size, bits&gt;</a>(</div>
-<div class="line"><a id="l01289" name="l01289"></a><span class="lineno"> 1289</span>      w,</div>
-<div class="line"><a id="l01290" name="l01290"></a><span class="lineno"> 1290</span>      scales,</div>
-<div class="line"><a id="l01291" name="l01291"></a><span class="lineno"> 1291</span>      biases,</div>
-<div class="line"><a id="l01292" name="l01292"></a><span class="lineno"> 1292</span>      x,</div>
-<div class="line"><a id="l01293" name="l01293"></a><span class="lineno"> 1293</span>      y,</div>
-<div class="line"><a id="l01294" name="l01294"></a><span class="lineno"> 1294</span>      in_vec_size,</div>
-<div class="line"><a id="l01295" name="l01295"></a><span class="lineno"> 1295</span>      out_vec_size,</div>
-<div class="line"><a id="l01296" name="l01296"></a><span class="lineno"> 1296</span>      tid,</div>
-<div class="line"><a id="l01297" name="l01297"></a><span class="lineno"> 1297</span>      simd_gid,</div>
-<div class="line"><a id="l01298" name="l01298"></a><span class="lineno"> 1298</span>      simd_lid);</div>
-<div class="line"><a id="l01299" name="l01299"></a><span class="lineno"> 1299</span>}</div>
+<div class="line"><a id="l01353" name="l01353"></a><span class="lineno"> 1353</span> </div>
+<div class="line"><a id="l01354" name="l01354"></a><span class="lineno"> 1354</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> batched&gt;</div>
+<div class="foldopen" id="foldopen01355" data-start="{" data-end="}">
+<div class="line"><a id="l01355" name="l01355"></a><span class="lineno"><a class="line" href="quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f"> 1355</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f">qmv_fast</a>(</div>
+<div class="line"><a id="l01356" name="l01356"></a><span class="lineno"> 1356</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01357" name="l01357"></a><span class="lineno"> 1357</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01358" name="l01358"></a><span class="lineno"> 1358</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01359" name="l01359"></a><span class="lineno"> 1359</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01360" name="l01360"></a><span class="lineno"> 1360</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01361" name="l01361"></a><span class="lineno"> 1361</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01362" name="l01362"></a><span class="lineno"> 1362</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01363" name="l01363"></a><span class="lineno"> 1363</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01364" name="l01364"></a><span class="lineno"> 1364</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01365" name="l01365"></a><span class="lineno"> 1365</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01366" name="l01366"></a><span class="lineno"> 1366</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01367" name="l01367"></a><span class="lineno"> 1367</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01368" name="l01368"></a><span class="lineno"> 1368</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01369" name="l01369"></a><span class="lineno"> 1369</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01370" name="l01370"></a><span class="lineno"> 1370</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01371" name="l01371"></a><span class="lineno"> 1371</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01372" name="l01372"></a><span class="lineno"> 1372</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01373" name="l01373"></a><span class="lineno"> 1373</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01374" name="l01374"></a><span class="lineno"> 1374</span>  <span class="keywordflow">if</span> (batched) {</div>
+<div class="line"><a id="l01375" name="l01375"></a><span class="lineno"> 1375</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01376" name="l01376"></a><span class="lineno"> 1376</span>        x,</div>
+<div class="line"><a id="l01377" name="l01377"></a><span class="lineno"> 1377</span>        w,</div>
+<div class="line"><a id="l01378" name="l01378"></a><span class="lineno"> 1378</span>        scales,</div>
+<div class="line"><a id="l01379" name="l01379"></a><span class="lineno"> 1379</span>        biases,</div>
+<div class="line"><a id="l01380" name="l01380"></a><span class="lineno"> 1380</span>        y,</div>
+<div class="line"><a id="l01381" name="l01381"></a><span class="lineno"> 1381</span>        out_vec_size,</div>
+<div class="line"><a id="l01382" name="l01382"></a><span class="lineno"> 1382</span>        x_batch_ndims,</div>
+<div class="line"><a id="l01383" name="l01383"></a><span class="lineno"> 1383</span>        x_shape,</div>
+<div class="line"><a id="l01384" name="l01384"></a><span class="lineno"> 1384</span>        x_strides,</div>
+<div class="line"><a id="l01385" name="l01385"></a><span class="lineno"> 1385</span>        w_batch_ndims,</div>
+<div class="line"><a id="l01386" name="l01386"></a><span class="lineno"> 1386</span>        w_shape,</div>
+<div class="line"><a id="l01387" name="l01387"></a><span class="lineno"> 1387</span>        w_strides,</div>
+<div class="line"><a id="l01388" name="l01388"></a><span class="lineno"> 1388</span>        s_strides,</div>
+<div class="line"><a id="l01389" name="l01389"></a><span class="lineno"> 1389</span>        b_strides,</div>
+<div class="line"><a id="l01390" name="l01390"></a><span class="lineno"> 1390</span>        tid);</div>
+<div class="line"><a id="l01391" name="l01391"></a><span class="lineno"> 1391</span>  }</div>
+<div class="line"><a id="l01392" name="l01392"></a><span class="lineno"> 1392</span>  <a class="code hl_function" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl&lt;T, group_size, bits&gt;</a>(</div>
+<div class="line"><a id="l01393" name="l01393"></a><span class="lineno"> 1393</span>      w,</div>
+<div class="line"><a id="l01394" name="l01394"></a><span class="lineno"> 1394</span>      scales,</div>
+<div class="line"><a id="l01395" name="l01395"></a><span class="lineno"> 1395</span>      biases,</div>
+<div class="line"><a id="l01396" name="l01396"></a><span class="lineno"> 1396</span>      x,</div>
+<div class="line"><a id="l01397" name="l01397"></a><span class="lineno"> 1397</span>      y,</div>
+<div class="line"><a id="l01398" name="l01398"></a><span class="lineno"> 1398</span>      in_vec_size,</div>
+<div class="line"><a id="l01399" name="l01399"></a><span class="lineno"> 1399</span>      out_vec_size,</div>
+<div class="line"><a id="l01400" name="l01400"></a><span class="lineno"> 1400</span>      tid,</div>
+<div class="line"><a id="l01401" name="l01401"></a><span class="lineno"> 1401</span>      simd_gid,</div>
+<div class="line"><a id="l01402" name="l01402"></a><span class="lineno"> 1402</span>      simd_lid);</div>
+<div class="line"><a id="l01403" name="l01403"></a><span class="lineno"> 1403</span>}</div>
 </div>
-<div class="line"><a id="l01300" name="l01300"></a><span class="lineno"> 1300</span> </div>
-<div class="line"><a id="l01301" name="l01301"></a><span class="lineno"> 1301</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits, <span class="keywordtype">int</span> split_k = 32&gt;</div>
-<div class="foldopen" id="foldopen01302" data-start="{" data-end="}">
-<div class="line"><a id="l01302" name="l01302"></a><span class="lineno"><a class="line" href="quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8"> 1302</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8">qvm_split_k</a>(</div>
-<div class="line"><a id="l01303" name="l01303"></a><span class="lineno"> 1303</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01304" name="l01304"></a><span class="lineno"> 1304</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01305" name="l01305"></a><span class="lineno"> 1305</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01306" name="l01306"></a><span class="lineno"> 1306</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01307" name="l01307"></a><span class="lineno"> 1307</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01308" name="l01308"></a><span class="lineno"> 1308</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01309" name="l01309"></a><span class="lineno"> 1309</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01310" name="l01310"></a><span class="lineno"> 1310</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01311" name="l01311"></a><span class="lineno"> 1311</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01312" name="l01312"></a><span class="lineno"> 1312</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01313" name="l01313"></a><span class="lineno"> 1313</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01314" name="l01314"></a><span class="lineno"> 1314</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01315" name="l01315"></a><span class="lineno"> 1315</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01316" name="l01316"></a><span class="lineno"> 1316</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01317" name="l01317"></a><span class="lineno"> 1317</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01318" name="l01318"></a><span class="lineno"> 1318</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; final_block_size [[buffer(15)]],</div>
-<div class="line"><a id="l01319" name="l01319"></a><span class="lineno"> 1319</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01320" name="l01320"></a><span class="lineno"> 1320</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01321" name="l01321"></a><span class="lineno"> 1321</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01322" name="l01322"></a><span class="lineno"> 1322</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01323" name="l01323"></a><span class="lineno"> 1323</span>      x,</div>
-<div class="line"><a id="l01324" name="l01324"></a><span class="lineno"> 1324</span>      w,</div>
-<div class="line"><a id="l01325" name="l01325"></a><span class="lineno"> 1325</span>      scales,</div>
-<div class="line"><a id="l01326" name="l01326"></a><span class="lineno"> 1326</span>      biases,</div>
-<div class="line"><a id="l01327" name="l01327"></a><span class="lineno"> 1327</span>      y,</div>
-<div class="line"><a id="l01328" name="l01328"></a><span class="lineno"> 1328</span>      out_vec_size,</div>
-<div class="line"><a id="l01329" name="l01329"></a><span class="lineno"> 1329</span>      x_batch_ndims,</div>
-<div class="line"><a id="l01330" name="l01330"></a><span class="lineno"> 1330</span>      x_shape,</div>
-<div class="line"><a id="l01331" name="l01331"></a><span class="lineno"> 1331</span>      x_strides,</div>
-<div class="line"><a id="l01332" name="l01332"></a><span class="lineno"> 1332</span>      w_batch_ndims,</div>
-<div class="line"><a id="l01333" name="l01333"></a><span class="lineno"> 1333</span>      w_shape,</div>
-<div class="line"><a id="l01334" name="l01334"></a><span class="lineno"> 1334</span>      w_strides,</div>
-<div class="line"><a id="l01335" name="l01335"></a><span class="lineno"> 1335</span>      s_strides,</div>
-<div class="line"><a id="l01336" name="l01336"></a><span class="lineno"> 1336</span>      b_strides,</div>
-<div class="line"><a id="l01337" name="l01337"></a><span class="lineno"> 1337</span>      tid);</div>
-<div class="line"><a id="l01338" name="l01338"></a><span class="lineno"> 1338</span> </div>
-<div class="line"><a id="l01339" name="l01339"></a><span class="lineno"> 1339</span>  <span class="comment">// When (in_vec_size % split_k != 0) the final block needs to be smaller</span></div>
-<div class="line"><a id="l01340" name="l01340"></a><span class="lineno"> 1340</span>  <span class="keywordtype">int</span> in_vec_size_adj =</div>
-<div class="line"><a id="l01341" name="l01341"></a><span class="lineno"> 1341</span>      tid.z % split_k == split_k - 1 ? final_block_size : in_vec_size;</div>
-<div class="line"><a id="l01342" name="l01342"></a><span class="lineno"> 1342</span> </div>
-<div class="line"><a id="l01343" name="l01343"></a><span class="lineno"> 1343</span>  <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl&lt;T, group_size, bits&gt;</a>(</div>
-<div class="line"><a id="l01344" name="l01344"></a><span class="lineno"> 1344</span>      w,</div>
-<div class="line"><a id="l01345" name="l01345"></a><span class="lineno"> 1345</span>      scales,</div>
-<div class="line"><a id="l01346" name="l01346"></a><span class="lineno"> 1346</span>      biases,</div>
-<div class="line"><a id="l01347" name="l01347"></a><span class="lineno"> 1347</span>      x,</div>
-<div class="line"><a id="l01348" name="l01348"></a><span class="lineno"> 1348</span>      y,</div>
-<div class="line"><a id="l01349" name="l01349"></a><span class="lineno"> 1349</span>      in_vec_size_adj,</div>
-<div class="line"><a id="l01350" name="l01350"></a><span class="lineno"> 1350</span>      out_vec_size,</div>
-<div class="line"><a id="l01351" name="l01351"></a><span class="lineno"> 1351</span>      tid,</div>
-<div class="line"><a id="l01352" name="l01352"></a><span class="lineno"> 1352</span>      simd_gid,</div>
-<div class="line"><a id="l01353" name="l01353"></a><span class="lineno"> 1353</span>      simd_lid);</div>
-<div class="line"><a id="l01354" name="l01354"></a><span class="lineno"> 1354</span>}</div>
+<div class="line"><a id="l01404" name="l01404"></a><span class="lineno"> 1404</span> </div>
+<div class="line"><a id="l01405" name="l01405"></a><span class="lineno"> 1405</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> batched&gt;</div>
+<div class="foldopen" id="foldopen01406" data-start="{" data-end="}">
+<div class="line"><a id="l01406" name="l01406"></a><span class="lineno"><a class="line" href="quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd"> 1406</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd">qmv</a>(</div>
+<div class="line"><a id="l01407" name="l01407"></a><span class="lineno"> 1407</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01408" name="l01408"></a><span class="lineno"> 1408</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01409" name="l01409"></a><span class="lineno"> 1409</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01410" name="l01410"></a><span class="lineno"> 1410</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01411" name="l01411"></a><span class="lineno"> 1411</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01412" name="l01412"></a><span class="lineno"> 1412</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01413" name="l01413"></a><span class="lineno"> 1413</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01414" name="l01414"></a><span class="lineno"> 1414</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01415" name="l01415"></a><span class="lineno"> 1415</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01416" name="l01416"></a><span class="lineno"> 1416</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01417" name="l01417"></a><span class="lineno"> 1417</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01418" name="l01418"></a><span class="lineno"> 1418</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01419" name="l01419"></a><span class="lineno"> 1419</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01420" name="l01420"></a><span class="lineno"> 1420</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01421" name="l01421"></a><span class="lineno"> 1421</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01422" name="l01422"></a><span class="lineno"> 1422</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01423" name="l01423"></a><span class="lineno"> 1423</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01424" name="l01424"></a><span class="lineno"> 1424</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01425" name="l01425"></a><span class="lineno"> 1425</span>  <span class="keywordflow">if</span> (batched) {</div>
+<div class="line"><a id="l01426" name="l01426"></a><span class="lineno"> 1426</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01427" name="l01427"></a><span class="lineno"> 1427</span>        x,</div>
+<div class="line"><a id="l01428" name="l01428"></a><span class="lineno"> 1428</span>        w,</div>
+<div class="line"><a id="l01429" name="l01429"></a><span class="lineno"> 1429</span>        scales,</div>
+<div class="line"><a id="l01430" name="l01430"></a><span class="lineno"> 1430</span>        biases,</div>
+<div class="line"><a id="l01431" name="l01431"></a><span class="lineno"> 1431</span>        y,</div>
+<div class="line"><a id="l01432" name="l01432"></a><span class="lineno"> 1432</span>        out_vec_size,</div>
+<div class="line"><a id="l01433" name="l01433"></a><span class="lineno"> 1433</span>        x_batch_ndims,</div>
+<div class="line"><a id="l01434" name="l01434"></a><span class="lineno"> 1434</span>        x_shape,</div>
+<div class="line"><a id="l01435" name="l01435"></a><span class="lineno"> 1435</span>        x_strides,</div>
+<div class="line"><a id="l01436" name="l01436"></a><span class="lineno"> 1436</span>        w_batch_ndims,</div>
+<div class="line"><a id="l01437" name="l01437"></a><span class="lineno"> 1437</span>        w_shape,</div>
+<div class="line"><a id="l01438" name="l01438"></a><span class="lineno"> 1438</span>        w_strides,</div>
+<div class="line"><a id="l01439" name="l01439"></a><span class="lineno"> 1439</span>        s_strides,</div>
+<div class="line"><a id="l01440" name="l01440"></a><span class="lineno"> 1440</span>        b_strides,</div>
+<div class="line"><a id="l01441" name="l01441"></a><span class="lineno"> 1441</span>        tid);</div>
+<div class="line"><a id="l01442" name="l01442"></a><span class="lineno"> 1442</span>  }</div>
+<div class="line"><a id="l01443" name="l01443"></a><span class="lineno"> 1443</span>  <a class="code hl_function" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl&lt;T, group_size, bits&gt;</a>(</div>
+<div class="line"><a id="l01444" name="l01444"></a><span class="lineno"> 1444</span>      w,</div>
+<div class="line"><a id="l01445" name="l01445"></a><span class="lineno"> 1445</span>      scales,</div>
+<div class="line"><a id="l01446" name="l01446"></a><span class="lineno"> 1446</span>      biases,</div>
+<div class="line"><a id="l01447" name="l01447"></a><span class="lineno"> 1447</span>      x,</div>
+<div class="line"><a id="l01448" name="l01448"></a><span class="lineno"> 1448</span>      y,</div>
+<div class="line"><a id="l01449" name="l01449"></a><span class="lineno"> 1449</span>      in_vec_size,</div>
+<div class="line"><a id="l01450" name="l01450"></a><span class="lineno"> 1450</span>      out_vec_size,</div>
+<div class="line"><a id="l01451" name="l01451"></a><span class="lineno"> 1451</span>      tid,</div>
+<div class="line"><a id="l01452" name="l01452"></a><span class="lineno"> 1452</span>      simd_gid,</div>
+<div class="line"><a id="l01453" name="l01453"></a><span class="lineno"> 1453</span>      simd_lid);</div>
+<div class="line"><a id="l01454" name="l01454"></a><span class="lineno"> 1454</span>}</div>
 </div>
-<div class="line"><a id="l01355" name="l01355"></a><span class="lineno"> 1355</span> </div>
-<div class="line"><a id="l01356" name="l01356"></a><span class="lineno"> 1356</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l01357" name="l01357"></a><span class="lineno"> 1357</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l01358" name="l01358"></a><span class="lineno"> 1358</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
-<div class="line"><a id="l01359" name="l01359"></a><span class="lineno"> 1359</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
-<div class="line"><a id="l01360" name="l01360"></a><span class="lineno"> 1360</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> aligned_N,</div>
-<div class="line"><a id="l01361" name="l01361"></a><span class="lineno"> 1361</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> batched,</div>
-<div class="line"><a id="l01362" name="l01362"></a><span class="lineno"> 1362</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
-<div class="line"><a id="l01363" name="l01363"></a><span class="lineno"> 1363</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
-<div class="line"><a id="l01364" name="l01364"></a><span class="lineno"> 1364</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
-<div class="foldopen" id="foldopen01365" data-start="{" data-end="}">
-<div class="line"><a id="l01365" name="l01365"></a><span class="lineno"><a class="line" href="quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10"> 1365</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10">qmm_t</a>(</div>
-<div class="line"><a id="l01366" name="l01366"></a><span class="lineno"> 1366</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01367" name="l01367"></a><span class="lineno"> 1367</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01368" name="l01368"></a><span class="lineno"> 1368</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01369" name="l01369"></a><span class="lineno"> 1369</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01370" name="l01370"></a><span class="lineno"> 1370</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01371" name="l01371"></a><span class="lineno"> 1371</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
-<div class="line"><a id="l01372" name="l01372"></a><span class="lineno"> 1372</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
-<div class="line"><a id="l01373" name="l01373"></a><span class="lineno"> 1373</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
-<div class="line"><a id="l01374" name="l01374"></a><span class="lineno"> 1374</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
-<div class="line"><a id="l01375" name="l01375"></a><span class="lineno"> 1375</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
-<div class="line"><a id="l01376" name="l01376"></a><span class="lineno"> 1376</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
-<div class="line"><a id="l01377" name="l01377"></a><span class="lineno"> 1377</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
-<div class="line"><a id="l01378" name="l01378"></a><span class="lineno"> 1378</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
-<div class="line"><a id="l01379" name="l01379"></a><span class="lineno"> 1379</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01380" name="l01380"></a><span class="lineno"> 1380</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01381" name="l01381"></a><span class="lineno"> 1381</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
-<div class="line"><a id="l01382" name="l01382"></a><span class="lineno"> 1382</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01383" name="l01383"></a><span class="lineno"> 1383</span>    uint lid [[thread_index_in_threadgroup]],</div>
-<div class="line"><a id="l01384" name="l01384"></a><span class="lineno"> 1384</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01385" name="l01385"></a><span class="lineno"> 1385</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01386" name="l01386"></a><span class="lineno"> 1386</span>  (void)lid;</div>
-<div class="line"><a id="l01387" name="l01387"></a><span class="lineno"> 1387</span> </div>
-<div class="line"><a id="l01388" name="l01388"></a><span class="lineno"> 1388</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l01389" name="l01389"></a><span class="lineno"> 1389</span> </div>
-<div class="line"><a id="l01390" name="l01390"></a><span class="lineno"> 1390</span>  threadgroup T Xs[BM * BK_padded];</div>
-<div class="line"><a id="l01391" name="l01391"></a><span class="lineno"> 1391</span>  threadgroup T Ws[BN * BK_padded];</div>
-<div class="line"><a id="l01392" name="l01392"></a><span class="lineno"> 1392</span> </div>
-<div class="line"><a id="l01393" name="l01393"></a><span class="lineno"> 1393</span>  <span class="keywordflow">if</span> (batched) {</div>
-<div class="line"><a id="l01394" name="l01394"></a><span class="lineno"> 1394</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01395" name="l01395"></a><span class="lineno"> 1395</span>        x,</div>
-<div class="line"><a id="l01396" name="l01396"></a><span class="lineno"> 1396</span>        w,</div>
-<div class="line"><a id="l01397" name="l01397"></a><span class="lineno"> 1397</span>        scales,</div>
-<div class="line"><a id="l01398" name="l01398"></a><span class="lineno"> 1398</span>        biases,</div>
-<div class="line"><a id="l01399" name="l01399"></a><span class="lineno"> 1399</span>        y,</div>
-<div class="line"><a id="l01400" name="l01400"></a><span class="lineno"> 1400</span>        M * N,</div>
-<div class="line"><a id="l01401" name="l01401"></a><span class="lineno"> 1401</span>        x_batch_ndims,</div>
-<div class="line"><a id="l01402" name="l01402"></a><span class="lineno"> 1402</span>        x_shape,</div>
-<div class="line"><a id="l01403" name="l01403"></a><span class="lineno"> 1403</span>        x_strides,</div>
-<div class="line"><a id="l01404" name="l01404"></a><span class="lineno"> 1404</span>        w_batch_ndims,</div>
-<div class="line"><a id="l01405" name="l01405"></a><span class="lineno"> 1405</span>        w_shape,</div>
-<div class="line"><a id="l01406" name="l01406"></a><span class="lineno"> 1406</span>        w_strides,</div>
-<div class="line"><a id="l01407" name="l01407"></a><span class="lineno"> 1407</span>        s_strides,</div>
-<div class="line"><a id="l01408" name="l01408"></a><span class="lineno"> 1408</span>        b_strides,</div>
-<div class="line"><a id="l01409" name="l01409"></a><span class="lineno"> 1409</span>        tid);</div>
-<div class="line"><a id="l01410" name="l01410"></a><span class="lineno"> 1410</span>  }</div>
-<div class="line"><a id="l01411" name="l01411"></a><span class="lineno"> 1411</span>  <a class="code hl_function" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl&lt;T, group_size, bits, aligned_N, BM, BK, BN&gt;</a>(</div>
-<div class="line"><a id="l01412" name="l01412"></a><span class="lineno"> 1412</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
-<div class="line"><a id="l01413" name="l01413"></a><span class="lineno"> 1413</span>}</div>
+<div class="line"><a id="l01455" name="l01455"></a><span class="lineno"> 1455</span> </div>
+<div class="line"><a id="l01456" name="l01456"></a><span class="lineno"> 1456</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits, <span class="keywordtype">bool</span> batched&gt;</div>
+<div class="foldopen" id="foldopen01457" data-start="{" data-end="}">
+<div class="line"><a id="l01457" name="l01457"></a><span class="lineno"><a class="line" href="quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5"> 1457</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5">qvm</a>(</div>
+<div class="line"><a id="l01458" name="l01458"></a><span class="lineno"> 1458</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01459" name="l01459"></a><span class="lineno"> 1459</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01460" name="l01460"></a><span class="lineno"> 1460</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01461" name="l01461"></a><span class="lineno"> 1461</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01462" name="l01462"></a><span class="lineno"> 1462</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01463" name="l01463"></a><span class="lineno"> 1463</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01464" name="l01464"></a><span class="lineno"> 1464</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01465" name="l01465"></a><span class="lineno"> 1465</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01466" name="l01466"></a><span class="lineno"> 1466</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01467" name="l01467"></a><span class="lineno"> 1467</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01468" name="l01468"></a><span class="lineno"> 1468</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01469" name="l01469"></a><span class="lineno"> 1469</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01470" name="l01470"></a><span class="lineno"> 1470</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01471" name="l01471"></a><span class="lineno"> 1471</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01472" name="l01472"></a><span class="lineno"> 1472</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01473" name="l01473"></a><span class="lineno"> 1473</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01474" name="l01474"></a><span class="lineno"> 1474</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01475" name="l01475"></a><span class="lineno"> 1475</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01476" name="l01476"></a><span class="lineno"> 1476</span>  <span class="keywordflow">if</span> (batched) {</div>
+<div class="line"><a id="l01477" name="l01477"></a><span class="lineno"> 1477</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01478" name="l01478"></a><span class="lineno"> 1478</span>        x,</div>
+<div class="line"><a id="l01479" name="l01479"></a><span class="lineno"> 1479</span>        w,</div>
+<div class="line"><a id="l01480" name="l01480"></a><span class="lineno"> 1480</span>        scales,</div>
+<div class="line"><a id="l01481" name="l01481"></a><span class="lineno"> 1481</span>        biases,</div>
+<div class="line"><a id="l01482" name="l01482"></a><span class="lineno"> 1482</span>        y,</div>
+<div class="line"><a id="l01483" name="l01483"></a><span class="lineno"> 1483</span>        out_vec_size,</div>
+<div class="line"><a id="l01484" name="l01484"></a><span class="lineno"> 1484</span>        x_batch_ndims,</div>
+<div class="line"><a id="l01485" name="l01485"></a><span class="lineno"> 1485</span>        x_shape,</div>
+<div class="line"><a id="l01486" name="l01486"></a><span class="lineno"> 1486</span>        x_strides,</div>
+<div class="line"><a id="l01487" name="l01487"></a><span class="lineno"> 1487</span>        w_batch_ndims,</div>
+<div class="line"><a id="l01488" name="l01488"></a><span class="lineno"> 1488</span>        w_shape,</div>
+<div class="line"><a id="l01489" name="l01489"></a><span class="lineno"> 1489</span>        w_strides,</div>
+<div class="line"><a id="l01490" name="l01490"></a><span class="lineno"> 1490</span>        s_strides,</div>
+<div class="line"><a id="l01491" name="l01491"></a><span class="lineno"> 1491</span>        b_strides,</div>
+<div class="line"><a id="l01492" name="l01492"></a><span class="lineno"> 1492</span>        tid);</div>
+<div class="line"><a id="l01493" name="l01493"></a><span class="lineno"> 1493</span>  }</div>
+<div class="line"><a id="l01494" name="l01494"></a><span class="lineno"> 1494</span>  <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl&lt;T, group_size, bits&gt;</a>(</div>
+<div class="line"><a id="l01495" name="l01495"></a><span class="lineno"> 1495</span>      w,</div>
+<div class="line"><a id="l01496" name="l01496"></a><span class="lineno"> 1496</span>      scales,</div>
+<div class="line"><a id="l01497" name="l01497"></a><span class="lineno"> 1497</span>      biases,</div>
+<div class="line"><a id="l01498" name="l01498"></a><span class="lineno"> 1498</span>      x,</div>
+<div class="line"><a id="l01499" name="l01499"></a><span class="lineno"> 1499</span>      y,</div>
+<div class="line"><a id="l01500" name="l01500"></a><span class="lineno"> 1500</span>      in_vec_size,</div>
+<div class="line"><a id="l01501" name="l01501"></a><span class="lineno"> 1501</span>      out_vec_size,</div>
+<div class="line"><a id="l01502" name="l01502"></a><span class="lineno"> 1502</span>      tid,</div>
+<div class="line"><a id="l01503" name="l01503"></a><span class="lineno"> 1503</span>      simd_gid,</div>
+<div class="line"><a id="l01504" name="l01504"></a><span class="lineno"> 1504</span>      simd_lid);</div>
+<div class="line"><a id="l01505" name="l01505"></a><span class="lineno"> 1505</span>}</div>
 </div>
-<div class="line"><a id="l01414" name="l01414"></a><span class="lineno"> 1414</span> </div>
-<div class="line"><a id="l01415" name="l01415"></a><span class="lineno"> 1415</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l01416" name="l01416"></a><span class="lineno"> 1416</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l01417" name="l01417"></a><span class="lineno"> 1417</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
-<div class="line"><a id="l01418" name="l01418"></a><span class="lineno"> 1418</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
-<div class="line"><a id="l01419" name="l01419"></a><span class="lineno"> 1419</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> batched,</div>
-<div class="line"><a id="l01420" name="l01420"></a><span class="lineno"> 1420</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
-<div class="line"><a id="l01421" name="l01421"></a><span class="lineno"> 1421</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
-<div class="line"><a id="l01422" name="l01422"></a><span class="lineno"> 1422</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
-<div class="foldopen" id="foldopen01423" data-start="{" data-end="}">
-<div class="line"><a id="l01423" name="l01423"></a><span class="lineno"><a class="line" href="quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7"> 1423</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7">qmm_n</a>(</div>
-<div class="line"><a id="l01424" name="l01424"></a><span class="lineno"> 1424</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01425" name="l01425"></a><span class="lineno"> 1425</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01426" name="l01426"></a><span class="lineno"> 1426</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01427" name="l01427"></a><span class="lineno"> 1427</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01428" name="l01428"></a><span class="lineno"> 1428</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01429" name="l01429"></a><span class="lineno"> 1429</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
-<div class="line"><a id="l01430" name="l01430"></a><span class="lineno"> 1430</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
-<div class="line"><a id="l01431" name="l01431"></a><span class="lineno"> 1431</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
-<div class="line"><a id="l01432" name="l01432"></a><span class="lineno"> 1432</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
-<div class="line"><a id="l01433" name="l01433"></a><span class="lineno"> 1433</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
-<div class="line"><a id="l01434" name="l01434"></a><span class="lineno"> 1434</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
-<div class="line"><a id="l01435" name="l01435"></a><span class="lineno"> 1435</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
-<div class="line"><a id="l01436" name="l01436"></a><span class="lineno"> 1436</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
-<div class="line"><a id="l01437" name="l01437"></a><span class="lineno"> 1437</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01438" name="l01438"></a><span class="lineno"> 1438</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01439" name="l01439"></a><span class="lineno"> 1439</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
-<div class="line"><a id="l01440" name="l01440"></a><span class="lineno"> 1440</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01441" name="l01441"></a><span class="lineno"> 1441</span>    uint lid [[thread_index_in_threadgroup]],</div>
-<div class="line"><a id="l01442" name="l01442"></a><span class="lineno"> 1442</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01443" name="l01443"></a><span class="lineno"> 1443</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01444" name="l01444"></a><span class="lineno"> 1444</span>  (void)lid;</div>
-<div class="line"><a id="l01445" name="l01445"></a><span class="lineno"> 1445</span> </div>
-<div class="line"><a id="l01446" name="l01446"></a><span class="lineno"> 1446</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l01447" name="l01447"></a><span class="lineno"> 1447</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN_padded = (BN + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l01448" name="l01448"></a><span class="lineno"> 1448</span> </div>
-<div class="line"><a id="l01449" name="l01449"></a><span class="lineno"> 1449</span>  threadgroup T Xs[BM * BK_padded];</div>
-<div class="line"><a id="l01450" name="l01450"></a><span class="lineno"> 1450</span>  threadgroup T Ws[BK * BN_padded];</div>
-<div class="line"><a id="l01451" name="l01451"></a><span class="lineno"> 1451</span> </div>
-<div class="line"><a id="l01452" name="l01452"></a><span class="lineno"> 1452</span>  <span class="keywordflow">if</span> (batched) {</div>
-<div class="line"><a id="l01453" name="l01453"></a><span class="lineno"> 1453</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01454" name="l01454"></a><span class="lineno"> 1454</span>        x,</div>
-<div class="line"><a id="l01455" name="l01455"></a><span class="lineno"> 1455</span>        w,</div>
-<div class="line"><a id="l01456" name="l01456"></a><span class="lineno"> 1456</span>        scales,</div>
-<div class="line"><a id="l01457" name="l01457"></a><span class="lineno"> 1457</span>        biases,</div>
-<div class="line"><a id="l01458" name="l01458"></a><span class="lineno"> 1458</span>        y,</div>
-<div class="line"><a id="l01459" name="l01459"></a><span class="lineno"> 1459</span>        M * N,</div>
-<div class="line"><a id="l01460" name="l01460"></a><span class="lineno"> 1460</span>        x_batch_ndims,</div>
-<div class="line"><a id="l01461" name="l01461"></a><span class="lineno"> 1461</span>        x_shape,</div>
-<div class="line"><a id="l01462" name="l01462"></a><span class="lineno"> 1462</span>        x_strides,</div>
-<div class="line"><a id="l01463" name="l01463"></a><span class="lineno"> 1463</span>        w_batch_ndims,</div>
-<div class="line"><a id="l01464" name="l01464"></a><span class="lineno"> 1464</span>        w_shape,</div>
-<div class="line"><a id="l01465" name="l01465"></a><span class="lineno"> 1465</span>        w_strides,</div>
-<div class="line"><a id="l01466" name="l01466"></a><span class="lineno"> 1466</span>        s_strides,</div>
-<div class="line"><a id="l01467" name="l01467"></a><span class="lineno"> 1467</span>        b_strides,</div>
-<div class="line"><a id="l01468" name="l01468"></a><span class="lineno"> 1468</span>        tid);</div>
-<div class="line"><a id="l01469" name="l01469"></a><span class="lineno"> 1469</span>  }</div>
-<div class="line"><a id="l01470" name="l01470"></a><span class="lineno"> 1470</span> </div>
-<div class="line"><a id="l01471" name="l01471"></a><span class="lineno"> 1471</span>  <a class="code hl_function" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl&lt;T, group_size, bits, BM, BK, BN&gt;</a>(</div>
-<div class="line"><a id="l01472" name="l01472"></a><span class="lineno"> 1472</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
-<div class="line"><a id="l01473" name="l01473"></a><span class="lineno"> 1473</span>}</div>
+<div class="line"><a id="l01506" name="l01506"></a><span class="lineno"> 1506</span> </div>
+<div class="line"><a id="l01507" name="l01507"></a><span class="lineno"> 1507</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits, <span class="keywordtype">int</span> split_k = 32&gt;</div>
+<div class="foldopen" id="foldopen01508" data-start="{" data-end="}">
+<div class="line"><a id="l01508" name="l01508"></a><span class="lineno"><a class="line" href="quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8"> 1508</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8">qvm_split_k</a>(</div>
+<div class="line"><a id="l01509" name="l01509"></a><span class="lineno"> 1509</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01510" name="l01510"></a><span class="lineno"> 1510</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01511" name="l01511"></a><span class="lineno"> 1511</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01512" name="l01512"></a><span class="lineno"> 1512</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01513" name="l01513"></a><span class="lineno"> 1513</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01514" name="l01514"></a><span class="lineno"> 1514</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01515" name="l01515"></a><span class="lineno"> 1515</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01516" name="l01516"></a><span class="lineno"> 1516</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01517" name="l01517"></a><span class="lineno"> 1517</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01518" name="l01518"></a><span class="lineno"> 1518</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01519" name="l01519"></a><span class="lineno"> 1519</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01520" name="l01520"></a><span class="lineno"> 1520</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01521" name="l01521"></a><span class="lineno"> 1521</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01522" name="l01522"></a><span class="lineno"> 1522</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01523" name="l01523"></a><span class="lineno"> 1523</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01524" name="l01524"></a><span class="lineno"> 1524</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; final_block_size [[buffer(15)]],</div>
+<div class="line"><a id="l01525" name="l01525"></a><span class="lineno"> 1525</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01526" name="l01526"></a><span class="lineno"> 1526</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01527" name="l01527"></a><span class="lineno"> 1527</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01528" name="l01528"></a><span class="lineno"> 1528</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01529" name="l01529"></a><span class="lineno"> 1529</span>      x,</div>
+<div class="line"><a id="l01530" name="l01530"></a><span class="lineno"> 1530</span>      w,</div>
+<div class="line"><a id="l01531" name="l01531"></a><span class="lineno"> 1531</span>      scales,</div>
+<div class="line"><a id="l01532" name="l01532"></a><span class="lineno"> 1532</span>      biases,</div>
+<div class="line"><a id="l01533" name="l01533"></a><span class="lineno"> 1533</span>      y,</div>
+<div class="line"><a id="l01534" name="l01534"></a><span class="lineno"> 1534</span>      out_vec_size,</div>
+<div class="line"><a id="l01535" name="l01535"></a><span class="lineno"> 1535</span>      x_batch_ndims,</div>
+<div class="line"><a id="l01536" name="l01536"></a><span class="lineno"> 1536</span>      x_shape,</div>
+<div class="line"><a id="l01537" name="l01537"></a><span class="lineno"> 1537</span>      x_strides,</div>
+<div class="line"><a id="l01538" name="l01538"></a><span class="lineno"> 1538</span>      w_batch_ndims,</div>
+<div class="line"><a id="l01539" name="l01539"></a><span class="lineno"> 1539</span>      w_shape,</div>
+<div class="line"><a id="l01540" name="l01540"></a><span class="lineno"> 1540</span>      w_strides,</div>
+<div class="line"><a id="l01541" name="l01541"></a><span class="lineno"> 1541</span>      s_strides,</div>
+<div class="line"><a id="l01542" name="l01542"></a><span class="lineno"> 1542</span>      b_strides,</div>
+<div class="line"><a id="l01543" name="l01543"></a><span class="lineno"> 1543</span>      tid);</div>
+<div class="line"><a id="l01544" name="l01544"></a><span class="lineno"> 1544</span> </div>
+<div class="line"><a id="l01545" name="l01545"></a><span class="lineno"> 1545</span>  <span class="comment">// When (in_vec_size % split_k != 0) the final block needs to be smaller</span></div>
+<div class="line"><a id="l01546" name="l01546"></a><span class="lineno"> 1546</span>  <span class="keywordtype">int</span> in_vec_size_adj =</div>
+<div class="line"><a id="l01547" name="l01547"></a><span class="lineno"> 1547</span>      tid.z % split_k == split_k - 1 ? final_block_size : in_vec_size;</div>
+<div class="line"><a id="l01548" name="l01548"></a><span class="lineno"> 1548</span> </div>
+<div class="line"><a id="l01549" name="l01549"></a><span class="lineno"> 1549</span>  <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl&lt;T, group_size, bits&gt;</a>(</div>
+<div class="line"><a id="l01550" name="l01550"></a><span class="lineno"> 1550</span>      w,</div>
+<div class="line"><a id="l01551" name="l01551"></a><span class="lineno"> 1551</span>      scales,</div>
+<div class="line"><a id="l01552" name="l01552"></a><span class="lineno"> 1552</span>      biases,</div>
+<div class="line"><a id="l01553" name="l01553"></a><span class="lineno"> 1553</span>      x,</div>
+<div class="line"><a id="l01554" name="l01554"></a><span class="lineno"> 1554</span>      y,</div>
+<div class="line"><a id="l01555" name="l01555"></a><span class="lineno"> 1555</span>      in_vec_size_adj,</div>
+<div class="line"><a id="l01556" name="l01556"></a><span class="lineno"> 1556</span>      out_vec_size,</div>
+<div class="line"><a id="l01557" name="l01557"></a><span class="lineno"> 1557</span>      tid,</div>
+<div class="line"><a id="l01558" name="l01558"></a><span class="lineno"> 1558</span>      simd_gid,</div>
+<div class="line"><a id="l01559" name="l01559"></a><span class="lineno"> 1559</span>      simd_lid);</div>
+<div class="line"><a id="l01560" name="l01560"></a><span class="lineno"> 1560</span>}</div>
 </div>
-<div class="line"><a id="l01474" name="l01474"></a><span class="lineno"> 1474</span> </div>
-<div class="line"><a id="l01475" name="l01475"></a><span class="lineno"> 1475</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen01476" data-start="{" data-end="}">
-<div class="line"><a id="l01476" name="l01476"></a><span class="lineno"><a class="line" href="quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7"> 1476</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7">bs_qmv_fast</a>(</div>
-<div class="line"><a id="l01477" name="l01477"></a><span class="lineno"> 1477</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01478" name="l01478"></a><span class="lineno"> 1478</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01479" name="l01479"></a><span class="lineno"> 1479</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01480" name="l01480"></a><span class="lineno"> 1480</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01481" name="l01481"></a><span class="lineno"> 1481</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01482" name="l01482"></a><span class="lineno"> 1482</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01483" name="l01483"></a><span class="lineno"> 1483</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01484" name="l01484"></a><span class="lineno"> 1484</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01485" name="l01485"></a><span class="lineno"> 1485</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01486" name="l01486"></a><span class="lineno"> 1486</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01487" name="l01487"></a><span class="lineno"> 1487</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01488" name="l01488"></a><span class="lineno"> 1488</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01489" name="l01489"></a><span class="lineno"> 1489</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01490" name="l01490"></a><span class="lineno"> 1490</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01491" name="l01491"></a><span class="lineno"> 1491</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01492" name="l01492"></a><span class="lineno"> 1492</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(15)]],</div>
-<div class="line"><a id="l01493" name="l01493"></a><span class="lineno"> 1493</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(16)]],</div>
-<div class="line"><a id="l01494" name="l01494"></a><span class="lineno"> 1494</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(17)]],</div>
-<div class="line"><a id="l01495" name="l01495"></a><span class="lineno"> 1495</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(18)]],</div>
-<div class="line"><a id="l01496" name="l01496"></a><span class="lineno"> 1496</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(19)]],</div>
-<div class="line"><a id="l01497" name="l01497"></a><span class="lineno"> 1497</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(20)]],</div>
-<div class="line"><a id="l01498" name="l01498"></a><span class="lineno"> 1498</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01499" name="l01499"></a><span class="lineno"> 1499</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01500" name="l01500"></a><span class="lineno"> 1500</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01501" name="l01501"></a><span class="lineno"> 1501</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01502" name="l01502"></a><span class="lineno"> 1502</span>      x,</div>
-<div class="line"><a id="l01503" name="l01503"></a><span class="lineno"> 1503</span>      w,</div>
-<div class="line"><a id="l01504" name="l01504"></a><span class="lineno"> 1504</span>      scales,</div>
-<div class="line"><a id="l01505" name="l01505"></a><span class="lineno"> 1505</span>      biases,</div>
-<div class="line"><a id="l01506" name="l01506"></a><span class="lineno"> 1506</span>      lhs_indices,</div>
-<div class="line"><a id="l01507" name="l01507"></a><span class="lineno"> 1507</span>      rhs_indices,</div>
-<div class="line"><a id="l01508" name="l01508"></a><span class="lineno"> 1508</span>      y,</div>
-<div class="line"><a id="l01509" name="l01509"></a><span class="lineno"> 1509</span>      out_vec_size,</div>
-<div class="line"><a id="l01510" name="l01510"></a><span class="lineno"> 1510</span>      batch_ndims,</div>
-<div class="line"><a id="l01511" name="l01511"></a><span class="lineno"> 1511</span>      batch_shape,</div>
-<div class="line"><a id="l01512" name="l01512"></a><span class="lineno"> 1512</span>      lhs_strides,</div>
-<div class="line"><a id="l01513" name="l01513"></a><span class="lineno"> 1513</span>      rhs_strides,</div>
-<div class="line"><a id="l01514" name="l01514"></a><span class="lineno"> 1514</span>      x_batch_ndims,</div>
-<div class="line"><a id="l01515" name="l01515"></a><span class="lineno"> 1515</span>      x_shape,</div>
-<div class="line"><a id="l01516" name="l01516"></a><span class="lineno"> 1516</span>      x_strides,</div>
-<div class="line"><a id="l01517" name="l01517"></a><span class="lineno"> 1517</span>      w_batch_ndims,</div>
-<div class="line"><a id="l01518" name="l01518"></a><span class="lineno"> 1518</span>      w_shape,</div>
-<div class="line"><a id="l01519" name="l01519"></a><span class="lineno"> 1519</span>      w_strides,</div>
-<div class="line"><a id="l01520" name="l01520"></a><span class="lineno"> 1520</span>      s_strides,</div>
-<div class="line"><a id="l01521" name="l01521"></a><span class="lineno"> 1521</span>      b_strides,</div>
-<div class="line"><a id="l01522" name="l01522"></a><span class="lineno"> 1522</span>      tid);</div>
-<div class="line"><a id="l01523" name="l01523"></a><span class="lineno"> 1523</span>  <a class="code hl_function" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl&lt;T, group_size, bits&gt;</a>(</div>
-<div class="line"><a id="l01524" name="l01524"></a><span class="lineno"> 1524</span>      w,</div>
-<div class="line"><a id="l01525" name="l01525"></a><span class="lineno"> 1525</span>      scales,</div>
-<div class="line"><a id="l01526" name="l01526"></a><span class="lineno"> 1526</span>      biases,</div>
-<div class="line"><a id="l01527" name="l01527"></a><span class="lineno"> 1527</span>      x,</div>
-<div class="line"><a id="l01528" name="l01528"></a><span class="lineno"> 1528</span>      y,</div>
-<div class="line"><a id="l01529" name="l01529"></a><span class="lineno"> 1529</span>      in_vec_size,</div>
-<div class="line"><a id="l01530" name="l01530"></a><span class="lineno"> 1530</span>      out_vec_size,</div>
-<div class="line"><a id="l01531" name="l01531"></a><span class="lineno"> 1531</span>      tid,</div>
-<div class="line"><a id="l01532" name="l01532"></a><span class="lineno"> 1532</span>      simd_gid,</div>
-<div class="line"><a id="l01533" name="l01533"></a><span class="lineno"> 1533</span>      simd_lid);</div>
-<div class="line"><a id="l01534" name="l01534"></a><span class="lineno"> 1534</span>}</div>
-</div>
-<div class="line"><a id="l01535" name="l01535"></a><span class="lineno"> 1535</span> </div>
-<div class="line"><a id="l01536" name="l01536"></a><span class="lineno"> 1536</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen01537" data-start="{" data-end="}">
-<div class="line"><a id="l01537" name="l01537"></a><span class="lineno"><a class="line" href="quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed"> 1537</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed">bs_qmv</a>(</div>
-<div class="line"><a id="l01538" name="l01538"></a><span class="lineno"> 1538</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01539" name="l01539"></a><span class="lineno"> 1539</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01540" name="l01540"></a><span class="lineno"> 1540</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01541" name="l01541"></a><span class="lineno"> 1541</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01542" name="l01542"></a><span class="lineno"> 1542</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01543" name="l01543"></a><span class="lineno"> 1543</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01544" name="l01544"></a><span class="lineno"> 1544</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01545" name="l01545"></a><span class="lineno"> 1545</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01546" name="l01546"></a><span class="lineno"> 1546</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01547" name="l01547"></a><span class="lineno"> 1547</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01548" name="l01548"></a><span class="lineno"> 1548</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01549" name="l01549"></a><span class="lineno"> 1549</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01550" name="l01550"></a><span class="lineno"> 1550</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01551" name="l01551"></a><span class="lineno"> 1551</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01552" name="l01552"></a><span class="lineno"> 1552</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01553" name="l01553"></a><span class="lineno"> 1553</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(15)]],</div>
-<div class="line"><a id="l01554" name="l01554"></a><span class="lineno"> 1554</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(16)]],</div>
-<div class="line"><a id="l01555" name="l01555"></a><span class="lineno"> 1555</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(17)]],</div>
-<div class="line"><a id="l01556" name="l01556"></a><span class="lineno"> 1556</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(18)]],</div>
-<div class="line"><a id="l01557" name="l01557"></a><span class="lineno"> 1557</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(19)]],</div>
-<div class="line"><a id="l01558" name="l01558"></a><span class="lineno"> 1558</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(20)]],</div>
-<div class="line"><a id="l01559" name="l01559"></a><span class="lineno"> 1559</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01560" name="l01560"></a><span class="lineno"> 1560</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01561" name="l01561"></a><span class="lineno"> 1561</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01562" name="l01562"></a><span class="lineno"> 1562</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01563" name="l01563"></a><span class="lineno"> 1563</span>      x,</div>
-<div class="line"><a id="l01564" name="l01564"></a><span class="lineno"> 1564</span>      w,</div>
-<div class="line"><a id="l01565" name="l01565"></a><span class="lineno"> 1565</span>      scales,</div>
-<div class="line"><a id="l01566" name="l01566"></a><span class="lineno"> 1566</span>      biases,</div>
-<div class="line"><a id="l01567" name="l01567"></a><span class="lineno"> 1567</span>      lhs_indices,</div>
-<div class="line"><a id="l01568" name="l01568"></a><span class="lineno"> 1568</span>      rhs_indices,</div>
-<div class="line"><a id="l01569" name="l01569"></a><span class="lineno"> 1569</span>      y,</div>
-<div class="line"><a id="l01570" name="l01570"></a><span class="lineno"> 1570</span>      out_vec_size,</div>
-<div class="line"><a id="l01571" name="l01571"></a><span class="lineno"> 1571</span>      batch_ndims,</div>
-<div class="line"><a id="l01572" name="l01572"></a><span class="lineno"> 1572</span>      batch_shape,</div>
-<div class="line"><a id="l01573" name="l01573"></a><span class="lineno"> 1573</span>      lhs_strides,</div>
-<div class="line"><a id="l01574" name="l01574"></a><span class="lineno"> 1574</span>      rhs_strides,</div>
-<div class="line"><a id="l01575" name="l01575"></a><span class="lineno"> 1575</span>      x_batch_ndims,</div>
-<div class="line"><a id="l01576" name="l01576"></a><span class="lineno"> 1576</span>      x_shape,</div>
-<div class="line"><a id="l01577" name="l01577"></a><span class="lineno"> 1577</span>      x_strides,</div>
-<div class="line"><a id="l01578" name="l01578"></a><span class="lineno"> 1578</span>      w_batch_ndims,</div>
-<div class="line"><a id="l01579" name="l01579"></a><span class="lineno"> 1579</span>      w_shape,</div>
-<div class="line"><a id="l01580" name="l01580"></a><span class="lineno"> 1580</span>      w_strides,</div>
-<div class="line"><a id="l01581" name="l01581"></a><span class="lineno"> 1581</span>      s_strides,</div>
-<div class="line"><a id="l01582" name="l01582"></a><span class="lineno"> 1582</span>      b_strides,</div>
-<div class="line"><a id="l01583" name="l01583"></a><span class="lineno"> 1583</span>      tid);</div>
-<div class="line"><a id="l01584" name="l01584"></a><span class="lineno"> 1584</span>  <a class="code hl_function" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl&lt;T, group_size, bits&gt;</a>(</div>
-<div class="line"><a id="l01585" name="l01585"></a><span class="lineno"> 1585</span>      w,</div>
-<div class="line"><a id="l01586" name="l01586"></a><span class="lineno"> 1586</span>      scales,</div>
-<div class="line"><a id="l01587" name="l01587"></a><span class="lineno"> 1587</span>      biases,</div>
-<div class="line"><a id="l01588" name="l01588"></a><span class="lineno"> 1588</span>      x,</div>
-<div class="line"><a id="l01589" name="l01589"></a><span class="lineno"> 1589</span>      y,</div>
-<div class="line"><a id="l01590" name="l01590"></a><span class="lineno"> 1590</span>      in_vec_size,</div>
-<div class="line"><a id="l01591" name="l01591"></a><span class="lineno"> 1591</span>      out_vec_size,</div>
-<div class="line"><a id="l01592" name="l01592"></a><span class="lineno"> 1592</span>      tid,</div>
-<div class="line"><a id="l01593" name="l01593"></a><span class="lineno"> 1593</span>      simd_gid,</div>
-<div class="line"><a id="l01594" name="l01594"></a><span class="lineno"> 1594</span>      simd_lid);</div>
-<div class="line"><a id="l01595" name="l01595"></a><span class="lineno"> 1595</span>}</div>
-</div>
-<div class="line"><a id="l01596" name="l01596"></a><span class="lineno"> 1596</span> </div>
-<div class="line"><a id="l01597" name="l01597"></a><span class="lineno"> 1597</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen01598" data-start="{" data-end="}">
-<div class="line"><a id="l01598" name="l01598"></a><span class="lineno"><a class="line" href="quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494"> 1598</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494">bs_qvm</a>(</div>
-<div class="line"><a id="l01599" name="l01599"></a><span class="lineno"> 1599</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01600" name="l01600"></a><span class="lineno"> 1600</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01601" name="l01601"></a><span class="lineno"> 1601</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01602" name="l01602"></a><span class="lineno"> 1602</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01603" name="l01603"></a><span class="lineno"> 1603</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01604" name="l01604"></a><span class="lineno"> 1604</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
-<div class="line"><a id="l01605" name="l01605"></a><span class="lineno"> 1605</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
-<div class="line"><a id="l01606" name="l01606"></a><span class="lineno"> 1606</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
-<div class="line"><a id="l01607" name="l01607"></a><span class="lineno"> 1607</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
-<div class="line"><a id="l01608" name="l01608"></a><span class="lineno"> 1608</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
-<div class="line"><a id="l01609" name="l01609"></a><span class="lineno"> 1609</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
-<div class="line"><a id="l01610" name="l01610"></a><span class="lineno"> 1610</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
-<div class="line"><a id="l01611" name="l01611"></a><span class="lineno"> 1611</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
-<div class="line"><a id="l01612" name="l01612"></a><span class="lineno"> 1612</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01613" name="l01613"></a><span class="lineno"> 1613</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01614" name="l01614"></a><span class="lineno"> 1614</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(15)]],</div>
-<div class="line"><a id="l01615" name="l01615"></a><span class="lineno"> 1615</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(16)]],</div>
-<div class="line"><a id="l01616" name="l01616"></a><span class="lineno"> 1616</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(17)]],</div>
-<div class="line"><a id="l01617" name="l01617"></a><span class="lineno"> 1617</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(18)]],</div>
-<div class="line"><a id="l01618" name="l01618"></a><span class="lineno"> 1618</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(19)]],</div>
-<div class="line"><a id="l01619" name="l01619"></a><span class="lineno"> 1619</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(20)]],</div>
-<div class="line"><a id="l01620" name="l01620"></a><span class="lineno"> 1620</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01621" name="l01621"></a><span class="lineno"> 1621</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01622" name="l01622"></a><span class="lineno"> 1622</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01623" name="l01623"></a><span class="lineno"> 1623</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01624" name="l01624"></a><span class="lineno"> 1624</span>      x,</div>
-<div class="line"><a id="l01625" name="l01625"></a><span class="lineno"> 1625</span>      w,</div>
-<div class="line"><a id="l01626" name="l01626"></a><span class="lineno"> 1626</span>      scales,</div>
-<div class="line"><a id="l01627" name="l01627"></a><span class="lineno"> 1627</span>      biases,</div>
-<div class="line"><a id="l01628" name="l01628"></a><span class="lineno"> 1628</span>      lhs_indices,</div>
-<div class="line"><a id="l01629" name="l01629"></a><span class="lineno"> 1629</span>      rhs_indices,</div>
-<div class="line"><a id="l01630" name="l01630"></a><span class="lineno"> 1630</span>      y,</div>
-<div class="line"><a id="l01631" name="l01631"></a><span class="lineno"> 1631</span>      out_vec_size,</div>
-<div class="line"><a id="l01632" name="l01632"></a><span class="lineno"> 1632</span>      batch_ndims,</div>
-<div class="line"><a id="l01633" name="l01633"></a><span class="lineno"> 1633</span>      batch_shape,</div>
-<div class="line"><a id="l01634" name="l01634"></a><span class="lineno"> 1634</span>      lhs_strides,</div>
-<div class="line"><a id="l01635" name="l01635"></a><span class="lineno"> 1635</span>      rhs_strides,</div>
-<div class="line"><a id="l01636" name="l01636"></a><span class="lineno"> 1636</span>      x_batch_ndims,</div>
-<div class="line"><a id="l01637" name="l01637"></a><span class="lineno"> 1637</span>      x_shape,</div>
-<div class="line"><a id="l01638" name="l01638"></a><span class="lineno"> 1638</span>      x_strides,</div>
-<div class="line"><a id="l01639" name="l01639"></a><span class="lineno"> 1639</span>      w_batch_ndims,</div>
-<div class="line"><a id="l01640" name="l01640"></a><span class="lineno"> 1640</span>      w_shape,</div>
-<div class="line"><a id="l01641" name="l01641"></a><span class="lineno"> 1641</span>      w_strides,</div>
-<div class="line"><a id="l01642" name="l01642"></a><span class="lineno"> 1642</span>      s_strides,</div>
-<div class="line"><a id="l01643" name="l01643"></a><span class="lineno"> 1643</span>      b_strides,</div>
-<div class="line"><a id="l01644" name="l01644"></a><span class="lineno"> 1644</span>      tid);</div>
-<div class="line"><a id="l01645" name="l01645"></a><span class="lineno"> 1645</span>  <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl&lt;T, group_size, bits&gt;</a>(</div>
-<div class="line"><a id="l01646" name="l01646"></a><span class="lineno"> 1646</span>      w,</div>
-<div class="line"><a id="l01647" name="l01647"></a><span class="lineno"> 1647</span>      scales,</div>
-<div class="line"><a id="l01648" name="l01648"></a><span class="lineno"> 1648</span>      biases,</div>
-<div class="line"><a id="l01649" name="l01649"></a><span class="lineno"> 1649</span>      x,</div>
-<div class="line"><a id="l01650" name="l01650"></a><span class="lineno"> 1650</span>      y,</div>
-<div class="line"><a id="l01651" name="l01651"></a><span class="lineno"> 1651</span>      in_vec_size,</div>
-<div class="line"><a id="l01652" name="l01652"></a><span class="lineno"> 1652</span>      out_vec_size,</div>
-<div class="line"><a id="l01653" name="l01653"></a><span class="lineno"> 1653</span>      tid,</div>
-<div class="line"><a id="l01654" name="l01654"></a><span class="lineno"> 1654</span>      simd_gid,</div>
-<div class="line"><a id="l01655" name="l01655"></a><span class="lineno"> 1655</span>      simd_lid);</div>
-<div class="line"><a id="l01656" name="l01656"></a><span class="lineno"> 1656</span>}</div>
+<div class="line"><a id="l01561" name="l01561"></a><span class="lineno"> 1561</span> </div>
+<div class="line"><a id="l01562" name="l01562"></a><span class="lineno"> 1562</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l01563" name="l01563"></a><span class="lineno"> 1563</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l01564" name="l01564"></a><span class="lineno"> 1564</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
+<div class="line"><a id="l01565" name="l01565"></a><span class="lineno"> 1565</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
+<div class="line"><a id="l01566" name="l01566"></a><span class="lineno"> 1566</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> aligned_N,</div>
+<div class="line"><a id="l01567" name="l01567"></a><span class="lineno"> 1567</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> batched,</div>
+<div class="line"><a id="l01568" name="l01568"></a><span class="lineno"> 1568</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
+<div class="line"><a id="l01569" name="l01569"></a><span class="lineno"> 1569</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
+<div class="line"><a id="l01570" name="l01570"></a><span class="lineno"> 1570</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
+<div class="foldopen" id="foldopen01571" data-start="{" data-end="}">
+<div class="line"><a id="l01571" name="l01571"></a><span class="lineno"><a class="line" href="quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10"> 1571</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10">qmm_t</a>(</div>
+<div class="line"><a id="l01572" name="l01572"></a><span class="lineno"> 1572</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01573" name="l01573"></a><span class="lineno"> 1573</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01574" name="l01574"></a><span class="lineno"> 1574</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01575" name="l01575"></a><span class="lineno"> 1575</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01576" name="l01576"></a><span class="lineno"> 1576</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01577" name="l01577"></a><span class="lineno"> 1577</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
+<div class="line"><a id="l01578" name="l01578"></a><span class="lineno"> 1578</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
+<div class="line"><a id="l01579" name="l01579"></a><span class="lineno"> 1579</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
+<div class="line"><a id="l01580" name="l01580"></a><span class="lineno"> 1580</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
+<div class="line"><a id="l01581" name="l01581"></a><span class="lineno"> 1581</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
+<div class="line"><a id="l01582" name="l01582"></a><span class="lineno"> 1582</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
+<div class="line"><a id="l01583" name="l01583"></a><span class="lineno"> 1583</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
+<div class="line"><a id="l01584" name="l01584"></a><span class="lineno"> 1584</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
+<div class="line"><a id="l01585" name="l01585"></a><span class="lineno"> 1585</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01586" name="l01586"></a><span class="lineno"> 1586</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01587" name="l01587"></a><span class="lineno"> 1587</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
+<div class="line"><a id="l01588" name="l01588"></a><span class="lineno"> 1588</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01589" name="l01589"></a><span class="lineno"> 1589</span>    uint lid [[thread_index_in_threadgroup]],</div>
+<div class="line"><a id="l01590" name="l01590"></a><span class="lineno"> 1590</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01591" name="l01591"></a><span class="lineno"> 1591</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01592" name="l01592"></a><span class="lineno"> 1592</span>  (void)lid;</div>
+<div class="line"><a id="l01593" name="l01593"></a><span class="lineno"> 1593</span> </div>
+<div class="line"><a id="l01594" name="l01594"></a><span class="lineno"> 1594</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01595" name="l01595"></a><span class="lineno"> 1595</span> </div>
+<div class="line"><a id="l01596" name="l01596"></a><span class="lineno"> 1596</span>  threadgroup T Xs[BM * BK_padded];</div>
+<div class="line"><a id="l01597" name="l01597"></a><span class="lineno"> 1597</span>  threadgroup T Ws[BN * BK_padded];</div>
+<div class="line"><a id="l01598" name="l01598"></a><span class="lineno"> 1598</span> </div>
+<div class="line"><a id="l01599" name="l01599"></a><span class="lineno"> 1599</span>  <span class="keywordflow">if</span> (batched) {</div>
+<div class="line"><a id="l01600" name="l01600"></a><span class="lineno"> 1600</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01601" name="l01601"></a><span class="lineno"> 1601</span>        x,</div>
+<div class="line"><a id="l01602" name="l01602"></a><span class="lineno"> 1602</span>        w,</div>
+<div class="line"><a id="l01603" name="l01603"></a><span class="lineno"> 1603</span>        scales,</div>
+<div class="line"><a id="l01604" name="l01604"></a><span class="lineno"> 1604</span>        biases,</div>
+<div class="line"><a id="l01605" name="l01605"></a><span class="lineno"> 1605</span>        y,</div>
+<div class="line"><a id="l01606" name="l01606"></a><span class="lineno"> 1606</span>        M * N,</div>
+<div class="line"><a id="l01607" name="l01607"></a><span class="lineno"> 1607</span>        x_batch_ndims,</div>
+<div class="line"><a id="l01608" name="l01608"></a><span class="lineno"> 1608</span>        x_shape,</div>
+<div class="line"><a id="l01609" name="l01609"></a><span class="lineno"> 1609</span>        x_strides,</div>
+<div class="line"><a id="l01610" name="l01610"></a><span class="lineno"> 1610</span>        w_batch_ndims,</div>
+<div class="line"><a id="l01611" name="l01611"></a><span class="lineno"> 1611</span>        w_shape,</div>
+<div class="line"><a id="l01612" name="l01612"></a><span class="lineno"> 1612</span>        w_strides,</div>
+<div class="line"><a id="l01613" name="l01613"></a><span class="lineno"> 1613</span>        s_strides,</div>
+<div class="line"><a id="l01614" name="l01614"></a><span class="lineno"> 1614</span>        b_strides,</div>
+<div class="line"><a id="l01615" name="l01615"></a><span class="lineno"> 1615</span>        tid);</div>
+<div class="line"><a id="l01616" name="l01616"></a><span class="lineno"> 1616</span>  }</div>
+<div class="line"><a id="l01617" name="l01617"></a><span class="lineno"> 1617</span>  <a class="code hl_function" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl&lt;T, group_size, bits, aligned_N, BM, BK, BN&gt;</a>(</div>
+<div class="line"><a id="l01618" name="l01618"></a><span class="lineno"> 1618</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01619" name="l01619"></a><span class="lineno"> 1619</span>}</div>
 </div>
+<div class="line"><a id="l01620" name="l01620"></a><span class="lineno"> 1620</span> </div>
+<div class="line"><a id="l01621" name="l01621"></a><span class="lineno"> 1621</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l01622" name="l01622"></a><span class="lineno"> 1622</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l01623" name="l01623"></a><span class="lineno"> 1623</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
+<div class="line"><a id="l01624" name="l01624"></a><span class="lineno"> 1624</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
+<div class="line"><a id="l01625" name="l01625"></a><span class="lineno"> 1625</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> batched,</div>
+<div class="line"><a id="l01626" name="l01626"></a><span class="lineno"> 1626</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
+<div class="line"><a id="l01627" name="l01627"></a><span class="lineno"> 1627</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
+<div class="line"><a id="l01628" name="l01628"></a><span class="lineno"> 1628</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
+<div class="foldopen" id="foldopen01629" data-start="{" data-end="}">
+<div class="line"><a id="l01629" name="l01629"></a><span class="lineno"><a class="line" href="quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7"> 1629</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7">qmm_n</a>(</div>
+<div class="line"><a id="l01630" name="l01630"></a><span class="lineno"> 1630</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01631" name="l01631"></a><span class="lineno"> 1631</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01632" name="l01632"></a><span class="lineno"> 1632</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01633" name="l01633"></a><span class="lineno"> 1633</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01634" name="l01634"></a><span class="lineno"> 1634</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01635" name="l01635"></a><span class="lineno"> 1635</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
+<div class="line"><a id="l01636" name="l01636"></a><span class="lineno"> 1636</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
+<div class="line"><a id="l01637" name="l01637"></a><span class="lineno"> 1637</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
+<div class="line"><a id="l01638" name="l01638"></a><span class="lineno"> 1638</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
+<div class="line"><a id="l01639" name="l01639"></a><span class="lineno"> 1639</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
+<div class="line"><a id="l01640" name="l01640"></a><span class="lineno"> 1640</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
+<div class="line"><a id="l01641" name="l01641"></a><span class="lineno"> 1641</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
+<div class="line"><a id="l01642" name="l01642"></a><span class="lineno"> 1642</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
+<div class="line"><a id="l01643" name="l01643"></a><span class="lineno"> 1643</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01644" name="l01644"></a><span class="lineno"> 1644</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01645" name="l01645"></a><span class="lineno"> 1645</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
+<div class="line"><a id="l01646" name="l01646"></a><span class="lineno"> 1646</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01647" name="l01647"></a><span class="lineno"> 1647</span>    uint lid [[thread_index_in_threadgroup]],</div>
+<div class="line"><a id="l01648" name="l01648"></a><span class="lineno"> 1648</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01649" name="l01649"></a><span class="lineno"> 1649</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01650" name="l01650"></a><span class="lineno"> 1650</span>  (void)lid;</div>
+<div class="line"><a id="l01651" name="l01651"></a><span class="lineno"> 1651</span> </div>
+<div class="line"><a id="l01652" name="l01652"></a><span class="lineno"> 1652</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01653" name="l01653"></a><span class="lineno"> 1653</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN_padded = (BN + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01654" name="l01654"></a><span class="lineno"> 1654</span> </div>
+<div class="line"><a id="l01655" name="l01655"></a><span class="lineno"> 1655</span>  threadgroup T Xs[BM * BK_padded];</div>
+<div class="line"><a id="l01656" name="l01656"></a><span class="lineno"> 1656</span>  threadgroup T Ws[BK * BN_padded];</div>
 <div class="line"><a id="l01657" name="l01657"></a><span class="lineno"> 1657</span> </div>
-<div class="line"><a id="l01658" name="l01658"></a><span class="lineno"> 1658</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l01659" name="l01659"></a><span class="lineno"> 1659</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l01660" name="l01660"></a><span class="lineno"> 1660</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
-<div class="line"><a id="l01661" name="l01661"></a><span class="lineno"> 1661</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
-<div class="line"><a id="l01662" name="l01662"></a><span class="lineno"> 1662</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> aligned_N,</div>
-<div class="line"><a id="l01663" name="l01663"></a><span class="lineno"> 1663</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
-<div class="line"><a id="l01664" name="l01664"></a><span class="lineno"> 1664</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
-<div class="line"><a id="l01665" name="l01665"></a><span class="lineno"> 1665</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
-<div class="foldopen" id="foldopen01666" data-start="{" data-end="}">
-<div class="line"><a id="l01666" name="l01666"></a><span class="lineno"><a class="line" href="quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84"> 1666</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84">bs_qmm_t</a>(</div>
-<div class="line"><a id="l01667" name="l01667"></a><span class="lineno"> 1667</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01668" name="l01668"></a><span class="lineno"> 1668</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01669" name="l01669"></a><span class="lineno"> 1669</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01670" name="l01670"></a><span class="lineno"> 1670</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01671" name="l01671"></a><span class="lineno"> 1671</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01672" name="l01672"></a><span class="lineno"> 1672</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
-<div class="line"><a id="l01673" name="l01673"></a><span class="lineno"> 1673</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
-<div class="line"><a id="l01674" name="l01674"></a><span class="lineno"> 1674</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
-<div class="line"><a id="l01675" name="l01675"></a><span class="lineno"> 1675</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
-<div class="line"><a id="l01676" name="l01676"></a><span class="lineno"> 1676</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
-<div class="line"><a id="l01677" name="l01677"></a><span class="lineno"> 1677</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
-<div class="line"><a id="l01678" name="l01678"></a><span class="lineno"> 1678</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
-<div class="line"><a id="l01679" name="l01679"></a><span class="lineno"> 1679</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
-<div class="line"><a id="l01680" name="l01680"></a><span class="lineno"> 1680</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01681" name="l01681"></a><span class="lineno"> 1681</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01682" name="l01682"></a><span class="lineno"> 1682</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
-<div class="line"><a id="l01683" name="l01683"></a><span class="lineno"> 1683</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(16)]],</div>
-<div class="line"><a id="l01684" name="l01684"></a><span class="lineno"> 1684</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(17)]],</div>
-<div class="line"><a id="l01685" name="l01685"></a><span class="lineno"> 1685</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(18)]],</div>
-<div class="line"><a id="l01686" name="l01686"></a><span class="lineno"> 1686</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(19)]],</div>
-<div class="line"><a id="l01687" name="l01687"></a><span class="lineno"> 1687</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(20)]],</div>
-<div class="line"><a id="l01688" name="l01688"></a><span class="lineno"> 1688</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(21)]],</div>
-<div class="line"><a id="l01689" name="l01689"></a><span class="lineno"> 1689</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01690" name="l01690"></a><span class="lineno"> 1690</span>    uint lid [[thread_index_in_threadgroup]],</div>
-<div class="line"><a id="l01691" name="l01691"></a><span class="lineno"> 1691</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01692" name="l01692"></a><span class="lineno"> 1692</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01693" name="l01693"></a><span class="lineno"> 1693</span>  (void)lid;</div>
-<div class="line"><a id="l01694" name="l01694"></a><span class="lineno"> 1694</span> </div>
-<div class="line"><a id="l01695" name="l01695"></a><span class="lineno"> 1695</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l01696" name="l01696"></a><span class="lineno"> 1696</span> </div>
-<div class="line"><a id="l01697" name="l01697"></a><span class="lineno"> 1697</span>  threadgroup T Xs[BM * BK_padded];</div>
-<div class="line"><a id="l01698" name="l01698"></a><span class="lineno"> 1698</span>  threadgroup T Ws[BN * BK_padded];</div>
-<div class="line"><a id="l01699" name="l01699"></a><span class="lineno"> 1699</span> </div>
-<div class="line"><a id="l01700" name="l01700"></a><span class="lineno"> 1700</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
-<div class="line"><a id="l01701" name="l01701"></a><span class="lineno"> 1701</span>      x,</div>
-<div class="line"><a id="l01702" name="l01702"></a><span class="lineno"> 1702</span>      w,</div>
-<div class="line"><a id="l01703" name="l01703"></a><span class="lineno"> 1703</span>      scales,</div>
-<div class="line"><a id="l01704" name="l01704"></a><span class="lineno"> 1704</span>      biases,</div>
-<div class="line"><a id="l01705" name="l01705"></a><span class="lineno"> 1705</span>      lhs_indices,</div>
-<div class="line"><a id="l01706" name="l01706"></a><span class="lineno"> 1706</span>      rhs_indices,</div>
-<div class="line"><a id="l01707" name="l01707"></a><span class="lineno"> 1707</span>      y,</div>
-<div class="line"><a id="l01708" name="l01708"></a><span class="lineno"> 1708</span>      M * N,</div>
-<div class="line"><a id="l01709" name="l01709"></a><span class="lineno"> 1709</span>      batch_ndims,</div>
-<div class="line"><a id="l01710" name="l01710"></a><span class="lineno"> 1710</span>      batch_shape,</div>
-<div class="line"><a id="l01711" name="l01711"></a><span class="lineno"> 1711</span>      lhs_strides,</div>
-<div class="line"><a id="l01712" name="l01712"></a><span class="lineno"> 1712</span>      rhs_strides,</div>
-<div class="line"><a id="l01713" name="l01713"></a><span class="lineno"> 1713</span>      x_batch_ndims,</div>
-<div class="line"><a id="l01714" name="l01714"></a><span class="lineno"> 1714</span>      x_shape,</div>
-<div class="line"><a id="l01715" name="l01715"></a><span class="lineno"> 1715</span>      x_strides,</div>
-<div class="line"><a id="l01716" name="l01716"></a><span class="lineno"> 1716</span>      w_batch_ndims,</div>
-<div class="line"><a id="l01717" name="l01717"></a><span class="lineno"> 1717</span>      w_shape,</div>
-<div class="line"><a id="l01718" name="l01718"></a><span class="lineno"> 1718</span>      w_strides,</div>
-<div class="line"><a id="l01719" name="l01719"></a><span class="lineno"> 1719</span>      s_strides,</div>
-<div class="line"><a id="l01720" name="l01720"></a><span class="lineno"> 1720</span>      b_strides,</div>
-<div class="line"><a id="l01721" name="l01721"></a><span class="lineno"> 1721</span>      tid);</div>
-<div class="line"><a id="l01722" name="l01722"></a><span class="lineno"> 1722</span>  <a class="code hl_function" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl&lt;T, group_size, bits, aligned_N, BM, BK, BN&gt;</a>(</div>
-<div class="line"><a id="l01723" name="l01723"></a><span class="lineno"> 1723</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
-<div class="line"><a id="l01724" name="l01724"></a><span class="lineno"> 1724</span>}</div>
+<div class="line"><a id="l01658" name="l01658"></a><span class="lineno"> 1658</span>  <span class="keywordflow">if</span> (batched) {</div>
+<div class="line"><a id="l01659" name="l01659"></a><span class="lineno"> 1659</span>    <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01660" name="l01660"></a><span class="lineno"> 1660</span>        x,</div>
+<div class="line"><a id="l01661" name="l01661"></a><span class="lineno"> 1661</span>        w,</div>
+<div class="line"><a id="l01662" name="l01662"></a><span class="lineno"> 1662</span>        scales,</div>
+<div class="line"><a id="l01663" name="l01663"></a><span class="lineno"> 1663</span>        biases,</div>
+<div class="line"><a id="l01664" name="l01664"></a><span class="lineno"> 1664</span>        y,</div>
+<div class="line"><a id="l01665" name="l01665"></a><span class="lineno"> 1665</span>        M * N,</div>
+<div class="line"><a id="l01666" name="l01666"></a><span class="lineno"> 1666</span>        x_batch_ndims,</div>
+<div class="line"><a id="l01667" name="l01667"></a><span class="lineno"> 1667</span>        x_shape,</div>
+<div class="line"><a id="l01668" name="l01668"></a><span class="lineno"> 1668</span>        x_strides,</div>
+<div class="line"><a id="l01669" name="l01669"></a><span class="lineno"> 1669</span>        w_batch_ndims,</div>
+<div class="line"><a id="l01670" name="l01670"></a><span class="lineno"> 1670</span>        w_shape,</div>
+<div class="line"><a id="l01671" name="l01671"></a><span class="lineno"> 1671</span>        w_strides,</div>
+<div class="line"><a id="l01672" name="l01672"></a><span class="lineno"> 1672</span>        s_strides,</div>
+<div class="line"><a id="l01673" name="l01673"></a><span class="lineno"> 1673</span>        b_strides,</div>
+<div class="line"><a id="l01674" name="l01674"></a><span class="lineno"> 1674</span>        tid);</div>
+<div class="line"><a id="l01675" name="l01675"></a><span class="lineno"> 1675</span>  }</div>
+<div class="line"><a id="l01676" name="l01676"></a><span class="lineno"> 1676</span> </div>
+<div class="line"><a id="l01677" name="l01677"></a><span class="lineno"> 1677</span>  <a class="code hl_function" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl&lt;T, group_size, bits, BM, BK, BN&gt;</a>(</div>
+<div class="line"><a id="l01678" name="l01678"></a><span class="lineno"> 1678</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01679" name="l01679"></a><span class="lineno"> 1679</span>}</div>
 </div>
-<div class="line"><a id="l01725" name="l01725"></a><span class="lineno"> 1725</span> </div>
-<div class="line"><a id="l01726" name="l01726"></a><span class="lineno"> 1726</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l01727" name="l01727"></a><span class="lineno"> 1727</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l01728" name="l01728"></a><span class="lineno"> 1728</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
-<div class="line"><a id="l01729" name="l01729"></a><span class="lineno"> 1729</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
-<div class="line"><a id="l01730" name="l01730"></a><span class="lineno"> 1730</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
-<div class="line"><a id="l01731" name="l01731"></a><span class="lineno"> 1731</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
-<div class="line"><a id="l01732" name="l01732"></a><span class="lineno"> 1732</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
-<div class="foldopen" id="foldopen01733" data-start="{" data-end="}">
-<div class="line"><a id="l01733" name="l01733"></a><span class="lineno"><a class="line" href="quantized_8h.html#a1a66b061c46383952a0f067c3848971f"> 1733</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a1a66b061c46383952a0f067c3848971f">bs_qmm_n</a>(</div>
-<div class="line"><a id="l01734" name="l01734"></a><span class="lineno"> 1734</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01735" name="l01735"></a><span class="lineno"> 1735</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01736" name="l01736"></a><span class="lineno"> 1736</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01737" name="l01737"></a><span class="lineno"> 1737</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
-<div class="line"><a id="l01738" name="l01738"></a><span class="lineno"> 1738</span>    device T* y [[buffer(4)]],</div>
-<div class="line"><a id="l01739" name="l01739"></a><span class="lineno"> 1739</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
-<div class="line"><a id="l01740" name="l01740"></a><span class="lineno"> 1740</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
-<div class="line"><a id="l01741" name="l01741"></a><span class="lineno"> 1741</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
-<div class="line"><a id="l01742" name="l01742"></a><span class="lineno"> 1742</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
-<div class="line"><a id="l01743" name="l01743"></a><span class="lineno"> 1743</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
-<div class="line"><a id="l01744" name="l01744"></a><span class="lineno"> 1744</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
-<div class="line"><a id="l01745" name="l01745"></a><span class="lineno"> 1745</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
-<div class="line"><a id="l01746" name="l01746"></a><span class="lineno"> 1746</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
-<div class="line"><a id="l01747" name="l01747"></a><span class="lineno"> 1747</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
-<div class="line"><a id="l01748" name="l01748"></a><span class="lineno"> 1748</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
-<div class="line"><a id="l01749" name="l01749"></a><span class="lineno"> 1749</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
-<div class="line"><a id="l01750" name="l01750"></a><span class="lineno"> 1750</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(16)]],</div>
-<div class="line"><a id="l01751" name="l01751"></a><span class="lineno"> 1751</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(17)]],</div>
-<div class="line"><a id="l01752" name="l01752"></a><span class="lineno"> 1752</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(18)]],</div>
-<div class="line"><a id="l01753" name="l01753"></a><span class="lineno"> 1753</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(19)]],</div>
-<div class="line"><a id="l01754" name="l01754"></a><span class="lineno"> 1754</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(20)]],</div>
-<div class="line"><a id="l01755" name="l01755"></a><span class="lineno"> 1755</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(21)]],</div>
-<div class="line"><a id="l01756" name="l01756"></a><span class="lineno"> 1756</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l01757" name="l01757"></a><span class="lineno"> 1757</span>    uint lid [[thread_index_in_threadgroup]],</div>
-<div class="line"><a id="l01758" name="l01758"></a><span class="lineno"> 1758</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
-<div class="line"><a id="l01759" name="l01759"></a><span class="lineno"> 1759</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
-<div class="line"><a id="l01760" name="l01760"></a><span class="lineno"> 1760</span>  (void)lid;</div>
-<div class="line"><a id="l01761" name="l01761"></a><span class="lineno"> 1761</span> </div>
-<div class="line"><a id="l01762" name="l01762"></a><span class="lineno"> 1762</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l01763" name="l01763"></a><span class="lineno"> 1763</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN_padded = (BN + 16 / <span class="keyword">sizeof</span>(T));</div>
-<div class="line"><a id="l01764" name="l01764"></a><span class="lineno"> 1764</span> </div>
-<div class="line"><a id="l01765" name="l01765"></a><span class="lineno"> 1765</span>  threadgroup T Xs[BM * BK_padded];</div>
-<div class="line"><a id="l01766" name="l01766"></a><span class="lineno"> 1766</span>  threadgroup T Ws[BK * BN_padded];</div>
-<div class="line"><a id="l01767" name="l01767"></a><span class="lineno"> 1767</span> </div>
+<div class="line"><a id="l01680" name="l01680"></a><span class="lineno"> 1680</span> </div>
+<div class="line"><a id="l01681" name="l01681"></a><span class="lineno"> 1681</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen01682" data-start="{" data-end="}">
+<div class="line"><a id="l01682" name="l01682"></a><span class="lineno"><a class="line" href="quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7"> 1682</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7">bs_qmv_fast</a>(</div>
+<div class="line"><a id="l01683" name="l01683"></a><span class="lineno"> 1683</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01684" name="l01684"></a><span class="lineno"> 1684</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01685" name="l01685"></a><span class="lineno"> 1685</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01686" name="l01686"></a><span class="lineno"> 1686</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01687" name="l01687"></a><span class="lineno"> 1687</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01688" name="l01688"></a><span class="lineno"> 1688</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01689" name="l01689"></a><span class="lineno"> 1689</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01690" name="l01690"></a><span class="lineno"> 1690</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01691" name="l01691"></a><span class="lineno"> 1691</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01692" name="l01692"></a><span class="lineno"> 1692</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01693" name="l01693"></a><span class="lineno"> 1693</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01694" name="l01694"></a><span class="lineno"> 1694</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01695" name="l01695"></a><span class="lineno"> 1695</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01696" name="l01696"></a><span class="lineno"> 1696</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01697" name="l01697"></a><span class="lineno"> 1697</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01698" name="l01698"></a><span class="lineno"> 1698</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(15)]],</div>
+<div class="line"><a id="l01699" name="l01699"></a><span class="lineno"> 1699</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(16)]],</div>
+<div class="line"><a id="l01700" name="l01700"></a><span class="lineno"> 1700</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(17)]],</div>
+<div class="line"><a id="l01701" name="l01701"></a><span class="lineno"> 1701</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(18)]],</div>
+<div class="line"><a id="l01702" name="l01702"></a><span class="lineno"> 1702</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(19)]],</div>
+<div class="line"><a id="l01703" name="l01703"></a><span class="lineno"> 1703</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(20)]],</div>
+<div class="line"><a id="l01704" name="l01704"></a><span class="lineno"> 1704</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01705" name="l01705"></a><span class="lineno"> 1705</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01706" name="l01706"></a><span class="lineno"> 1706</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01707" name="l01707"></a><span class="lineno"> 1707</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01708" name="l01708"></a><span class="lineno"> 1708</span>      x,</div>
+<div class="line"><a id="l01709" name="l01709"></a><span class="lineno"> 1709</span>      w,</div>
+<div class="line"><a id="l01710" name="l01710"></a><span class="lineno"> 1710</span>      scales,</div>
+<div class="line"><a id="l01711" name="l01711"></a><span class="lineno"> 1711</span>      biases,</div>
+<div class="line"><a id="l01712" name="l01712"></a><span class="lineno"> 1712</span>      lhs_indices,</div>
+<div class="line"><a id="l01713" name="l01713"></a><span class="lineno"> 1713</span>      rhs_indices,</div>
+<div class="line"><a id="l01714" name="l01714"></a><span class="lineno"> 1714</span>      y,</div>
+<div class="line"><a id="l01715" name="l01715"></a><span class="lineno"> 1715</span>      out_vec_size,</div>
+<div class="line"><a id="l01716" name="l01716"></a><span class="lineno"> 1716</span>      batch_ndims,</div>
+<div class="line"><a id="l01717" name="l01717"></a><span class="lineno"> 1717</span>      batch_shape,</div>
+<div class="line"><a id="l01718" name="l01718"></a><span class="lineno"> 1718</span>      lhs_strides,</div>
+<div class="line"><a id="l01719" name="l01719"></a><span class="lineno"> 1719</span>      rhs_strides,</div>
+<div class="line"><a id="l01720" name="l01720"></a><span class="lineno"> 1720</span>      x_batch_ndims,</div>
+<div class="line"><a id="l01721" name="l01721"></a><span class="lineno"> 1721</span>      x_shape,</div>
+<div class="line"><a id="l01722" name="l01722"></a><span class="lineno"> 1722</span>      x_strides,</div>
+<div class="line"><a id="l01723" name="l01723"></a><span class="lineno"> 1723</span>      w_batch_ndims,</div>
+<div class="line"><a id="l01724" name="l01724"></a><span class="lineno"> 1724</span>      w_shape,</div>
+<div class="line"><a id="l01725" name="l01725"></a><span class="lineno"> 1725</span>      w_strides,</div>
+<div class="line"><a id="l01726" name="l01726"></a><span class="lineno"> 1726</span>      s_strides,</div>
+<div class="line"><a id="l01727" name="l01727"></a><span class="lineno"> 1727</span>      b_strides,</div>
+<div class="line"><a id="l01728" name="l01728"></a><span class="lineno"> 1728</span>      tid);</div>
+<div class="line"><a id="l01729" name="l01729"></a><span class="lineno"> 1729</span>  <a class="code hl_function" href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl&lt;T, group_size, bits&gt;</a>(</div>
+<div class="line"><a id="l01730" name="l01730"></a><span class="lineno"> 1730</span>      w,</div>
+<div class="line"><a id="l01731" name="l01731"></a><span class="lineno"> 1731</span>      scales,</div>
+<div class="line"><a id="l01732" name="l01732"></a><span class="lineno"> 1732</span>      biases,</div>
+<div class="line"><a id="l01733" name="l01733"></a><span class="lineno"> 1733</span>      x,</div>
+<div class="line"><a id="l01734" name="l01734"></a><span class="lineno"> 1734</span>      y,</div>
+<div class="line"><a id="l01735" name="l01735"></a><span class="lineno"> 1735</span>      in_vec_size,</div>
+<div class="line"><a id="l01736" name="l01736"></a><span class="lineno"> 1736</span>      out_vec_size,</div>
+<div class="line"><a id="l01737" name="l01737"></a><span class="lineno"> 1737</span>      tid,</div>
+<div class="line"><a id="l01738" name="l01738"></a><span class="lineno"> 1738</span>      simd_gid,</div>
+<div class="line"><a id="l01739" name="l01739"></a><span class="lineno"> 1739</span>      simd_lid);</div>
+<div class="line"><a id="l01740" name="l01740"></a><span class="lineno"> 1740</span>}</div>
+</div>
+<div class="line"><a id="l01741" name="l01741"></a><span class="lineno"> 1741</span> </div>
+<div class="line"><a id="l01742" name="l01742"></a><span class="lineno"> 1742</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen01743" data-start="{" data-end="}">
+<div class="line"><a id="l01743" name="l01743"></a><span class="lineno"><a class="line" href="quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed"> 1743</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed">bs_qmv</a>(</div>
+<div class="line"><a id="l01744" name="l01744"></a><span class="lineno"> 1744</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01745" name="l01745"></a><span class="lineno"> 1745</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01746" name="l01746"></a><span class="lineno"> 1746</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01747" name="l01747"></a><span class="lineno"> 1747</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01748" name="l01748"></a><span class="lineno"> 1748</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01749" name="l01749"></a><span class="lineno"> 1749</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01750" name="l01750"></a><span class="lineno"> 1750</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01751" name="l01751"></a><span class="lineno"> 1751</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01752" name="l01752"></a><span class="lineno"> 1752</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01753" name="l01753"></a><span class="lineno"> 1753</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01754" name="l01754"></a><span class="lineno"> 1754</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01755" name="l01755"></a><span class="lineno"> 1755</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01756" name="l01756"></a><span class="lineno"> 1756</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01757" name="l01757"></a><span class="lineno"> 1757</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01758" name="l01758"></a><span class="lineno"> 1758</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01759" name="l01759"></a><span class="lineno"> 1759</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(15)]],</div>
+<div class="line"><a id="l01760" name="l01760"></a><span class="lineno"> 1760</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(16)]],</div>
+<div class="line"><a id="l01761" name="l01761"></a><span class="lineno"> 1761</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(17)]],</div>
+<div class="line"><a id="l01762" name="l01762"></a><span class="lineno"> 1762</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(18)]],</div>
+<div class="line"><a id="l01763" name="l01763"></a><span class="lineno"> 1763</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(19)]],</div>
+<div class="line"><a id="l01764" name="l01764"></a><span class="lineno"> 1764</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(20)]],</div>
+<div class="line"><a id="l01765" name="l01765"></a><span class="lineno"> 1765</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01766" name="l01766"></a><span class="lineno"> 1766</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01767" name="l01767"></a><span class="lineno"> 1767</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
 <div class="line"><a id="l01768" name="l01768"></a><span class="lineno"> 1768</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
 <div class="line"><a id="l01769" name="l01769"></a><span class="lineno"> 1769</span>      x,</div>
 <div class="line"><a id="l01770" name="l01770"></a><span class="lineno"> 1770</span>      w,</div>
@@ -1927,7 +1921,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l01773" name="l01773"></a><span class="lineno"> 1773</span>      lhs_indices,</div>
 <div class="line"><a id="l01774" name="l01774"></a><span class="lineno"> 1774</span>      rhs_indices,</div>
 <div class="line"><a id="l01775" name="l01775"></a><span class="lineno"> 1775</span>      y,</div>
-<div class="line"><a id="l01776" name="l01776"></a><span class="lineno"> 1776</span>      M * N,</div>
+<div class="line"><a id="l01776" name="l01776"></a><span class="lineno"> 1776</span>      out_vec_size,</div>
 <div class="line"><a id="l01777" name="l01777"></a><span class="lineno"> 1777</span>      batch_ndims,</div>
 <div class="line"><a id="l01778" name="l01778"></a><span class="lineno"> 1778</span>      batch_shape,</div>
 <div class="line"><a id="l01779" name="l01779"></a><span class="lineno"> 1779</span>      lhs_strides,</div>
@@ -1941,229 +1935,442 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l01787" name="l01787"></a><span class="lineno"> 1787</span>      s_strides,</div>
 <div class="line"><a id="l01788" name="l01788"></a><span class="lineno"> 1788</span>      b_strides,</div>
 <div class="line"><a id="l01789" name="l01789"></a><span class="lineno"> 1789</span>      tid);</div>
-<div class="line"><a id="l01790" name="l01790"></a><span class="lineno"> 1790</span>  <a class="code hl_function" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl&lt;T, group_size, bits, BM, BK, BN&gt;</a>(</div>
-<div class="line"><a id="l01791" name="l01791"></a><span class="lineno"> 1791</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
-<div class="line"><a id="l01792" name="l01792"></a><span class="lineno"> 1792</span>}</div>
+<div class="line"><a id="l01790" name="l01790"></a><span class="lineno"> 1790</span>  <a class="code hl_function" href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl&lt;T, group_size, bits&gt;</a>(</div>
+<div class="line"><a id="l01791" name="l01791"></a><span class="lineno"> 1791</span>      w,</div>
+<div class="line"><a id="l01792" name="l01792"></a><span class="lineno"> 1792</span>      scales,</div>
+<div class="line"><a id="l01793" name="l01793"></a><span class="lineno"> 1793</span>      biases,</div>
+<div class="line"><a id="l01794" name="l01794"></a><span class="lineno"> 1794</span>      x,</div>
+<div class="line"><a id="l01795" name="l01795"></a><span class="lineno"> 1795</span>      y,</div>
+<div class="line"><a id="l01796" name="l01796"></a><span class="lineno"> 1796</span>      in_vec_size,</div>
+<div class="line"><a id="l01797" name="l01797"></a><span class="lineno"> 1797</span>      out_vec_size,</div>
+<div class="line"><a id="l01798" name="l01798"></a><span class="lineno"> 1798</span>      tid,</div>
+<div class="line"><a id="l01799" name="l01799"></a><span class="lineno"> 1799</span>      simd_gid,</div>
+<div class="line"><a id="l01800" name="l01800"></a><span class="lineno"> 1800</span>      simd_lid);</div>
+<div class="line"><a id="l01801" name="l01801"></a><span class="lineno"> 1801</span>}</div>
 </div>
-<div class="line"><a id="l01793" name="l01793"></a><span class="lineno"> 1793</span> </div>
-<div class="line"><a id="l01794" name="l01794"></a><span class="lineno"> 1794</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen01795" data-start="{" data-end="}">
-<div class="line"><a id="l01795" name="l01795"></a><span class="lineno"><a class="line" href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59"> 1795</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59">affine_quantize</a>(</div>
-<div class="line"><a id="l01796" name="l01796"></a><span class="lineno"> 1796</span>    <span class="keyword">const</span> device T* w [[buffer(0)]],</div>
-<div class="line"><a id="l01797" name="l01797"></a><span class="lineno"> 1797</span>    device uint8_t* out [[buffer(1)]],</div>
-<div class="line"><a id="l01798" name="l01798"></a><span class="lineno"> 1798</span>    device T* scales [[buffer(2)]],</div>
-<div class="line"><a id="l01799" name="l01799"></a><span class="lineno"> 1799</span>    device T* biases [[buffer(3)]],</div>
-<div class="line"><a id="l01800" name="l01800"></a><span class="lineno"> 1800</span>    uint2 index [[thread_position_in_grid]],</div>
-<div class="line"><a id="l01801" name="l01801"></a><span class="lineno"> 1801</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l01802" name="l01802"></a><span class="lineno"> 1802</span>  <span class="keyword">constexpr</span> T eps = T(1e-7);</div>
-<div class="line"><a id="l01803" name="l01803"></a><span class="lineno"> 1803</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> = 32;</div>
-<div class="line"><a id="l01804" name="l01804"></a><span class="lineno"> 1804</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> uint8_bits = 8;</div>
-<div class="line"><a id="l01805" name="l01805"></a><span class="lineno"> 1805</span>  <span class="keyword">constexpr</span> T n_bins = (1 &lt;&lt; bits) - 1;</div>
-<div class="line"><a id="l01806" name="l01806"></a><span class="lineno"> 1806</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_int = uint8_bits / bits;</div>
-<div class="line"><a id="l01807" name="l01807"></a><span class="lineno"> 1807</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_reduce = group_size / <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>;</div>
-<div class="line"><a id="l01808" name="l01808"></a><span class="lineno"> 1808</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> writes_per_reduce = packs_per_int / values_per_reduce;</div>
-<div class="line"><a id="l01809" name="l01809"></a><span class="lineno"> 1809</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> writes_per_pack =</div>
-<div class="line"><a id="l01810" name="l01810"></a><span class="lineno"> 1810</span>      writes_per_reduce &gt; 1 ? 1 : values_per_reduce / packs_per_int;</div>
-<div class="line"><a id="l01811" name="l01811"></a><span class="lineno"> 1811</span> </div>
-<div class="line"><a id="l01812" name="l01812"></a><span class="lineno"> 1812</span>  <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l01813" name="l01813"></a><span class="lineno"> 1813</span>      group_size % <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> == 0,</div>
-<div class="line"><a id="l01814" name="l01814"></a><span class="lineno"> 1814</span>      <span class="stringliteral">&quot;Group size must be divisible by simd size.&quot;</span>);</div>
-<div class="line"><a id="l01815" name="l01815"></a><span class="lineno"> 1815</span> </div>
-<div class="line"><a id="l01816" name="l01816"></a><span class="lineno"> 1816</span>  <span class="keywordtype">size_t</span> offset = index.x + grid_dim.x * size_t(index.y);</div>
-<div class="line"><a id="l01817" name="l01817"></a><span class="lineno"> 1817</span>  <span class="keywordtype">size_t</span> in_index = offset * values_per_reduce;</div>
-<div class="line"><a id="l01818" name="l01818"></a><span class="lineno"> 1818</span>  <span class="keywordtype">size_t</span> out_index = offset * writes_per_pack;</div>
-<div class="line"><a id="l01819" name="l01819"></a><span class="lineno"> 1819</span> </div>
-<div class="line"><a id="l01820" name="l01820"></a><span class="lineno"> 1820</span>  T w_thread[values_per_reduce];</div>
-<div class="line"><a id="l01821" name="l01821"></a><span class="lineno"> 1821</span>  T w_min = <a class="code hl_struct" href="struct_limits.html">Limits&lt;T&gt;::max</a>;</div>
-<div class="line"><a id="l01822" name="l01822"></a><span class="lineno"> 1822</span>  T w_max = 0;</div>
-<div class="line"><a id="l01823" name="l01823"></a><span class="lineno"> 1823</span> </div>
-<div class="line"><a id="l01824" name="l01824"></a><span class="lineno"> 1824</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
-<div class="line"><a id="l01825" name="l01825"></a><span class="lineno"> 1825</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_reduce; i++) {</div>
-<div class="line"><a id="l01826" name="l01826"></a><span class="lineno"> 1826</span>    T val = w[in_index + i];</div>
-<div class="line"><a id="l01827" name="l01827"></a><span class="lineno"> 1827</span>    w_thread[i] = val;</div>
-<div class="line"><a id="l01828" name="l01828"></a><span class="lineno"> 1828</span>    w_min = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(w_min, val);</div>
-<div class="line"><a id="l01829" name="l01829"></a><span class="lineno"> 1829</span>    w_max = <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">max</a>(w_max, val);</div>
-<div class="line"><a id="l01830" name="l01830"></a><span class="lineno"> 1830</span>  }</div>
-<div class="line"><a id="l01831" name="l01831"></a><span class="lineno"> 1831</span> </div>
-<div class="line"><a id="l01832" name="l01832"></a><span class="lineno"> 1832</span>  w_min = <a class="code hl_function" href="namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b">simd_min</a>(w_min);</div>
-<div class="line"><a id="l01833" name="l01833"></a><span class="lineno"> 1833</span>  w_max = <a class="code hl_function" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a>(w_max);</div>
-<div class="line"><a id="l01834" name="l01834"></a><span class="lineno"> 1834</span> </div>
-<div class="line"><a id="l01835" name="l01835"></a><span class="lineno"> 1835</span>  T scale = <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">max</a>((w_max - w_min) / n_bins, eps);</div>
-<div class="line"><a id="l01836" name="l01836"></a><span class="lineno"> 1836</span>  <span class="keywordtype">bool</span> side = <a class="code hl_function" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">abs</a>(w_min) &gt; <a class="code hl_function" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">abs</a>(w_max);</div>
-<div class="line"><a id="l01837" name="l01837"></a><span class="lineno"> 1837</span>  scale = side ? scale : -scale;</div>
-<div class="line"><a id="l01838" name="l01838"></a><span class="lineno"> 1838</span>  T edge = side ? w_min : w_max;</div>
-<div class="line"><a id="l01839" name="l01839"></a><span class="lineno"> 1839</span>  T q0 = <a class="code hl_function" href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">round</a>(edge / scale);</div>
-<div class="line"><a id="l01840" name="l01840"></a><span class="lineno"> 1840</span>  <span class="keywordtype">bool</span> at_zero = q0 == 0.0f;</div>
-<div class="line"><a id="l01841" name="l01841"></a><span class="lineno"> 1841</span>  scale = at_zero ? scale : edge / q0;</div>
-<div class="line"><a id="l01842" name="l01842"></a><span class="lineno"> 1842</span>  T bias = at_zero ? T(0) : edge;</div>
-<div class="line"><a id="l01843" name="l01843"></a><span class="lineno"> 1843</span> </div>
-<div class="line"><a id="l01844" name="l01844"></a><span class="lineno"> 1844</span>  <span class="comment">// Write out the scales and biases</span></div>
-<div class="line"><a id="l01845" name="l01845"></a><span class="lineno"> 1845</span>  <span class="keywordtype">size_t</span> gindex = in_index / group_size;</div>
-<div class="line"><a id="l01846" name="l01846"></a><span class="lineno"> 1846</span>  <span class="keywordflow">if</span> (in_index % group_size == 0) {</div>
-<div class="line"><a id="l01847" name="l01847"></a><span class="lineno"> 1847</span>    scales[gindex] = scale;</div>
-<div class="line"><a id="l01848" name="l01848"></a><span class="lineno"> 1848</span>    biases[gindex] = bias;</div>
-<div class="line"><a id="l01849" name="l01849"></a><span class="lineno"> 1849</span>  }</div>
-<div class="line"><a id="l01850" name="l01850"></a><span class="lineno"> 1850</span> </div>
-<div class="line"><a id="l01851" name="l01851"></a><span class="lineno"> 1851</span>  uint8_t output = 0;</div>
-<div class="line"><a id="l01852" name="l01852"></a><span class="lineno"> 1852</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
-<div class="line"><a id="l01853" name="l01853"></a><span class="lineno"> 1853</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_reduce; i++) {</div>
-<div class="line"><a id="l01854" name="l01854"></a><span class="lineno"> 1854</span>    uint8_t val = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(<a class="code hl_function" href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">round</a>((w_thread[i] - bias) / scale), n_bins);</div>
-<div class="line"><a id="l01855" name="l01855"></a><span class="lineno"> 1855</span>    <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l01856" name="l01856"></a><span class="lineno"> 1856</span>      output = val;</div>
-<div class="line"><a id="l01857" name="l01857"></a><span class="lineno"> 1857</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01858" name="l01858"></a><span class="lineno"> 1858</span>      output += val &lt;&lt; (bits * (i % packs_per_int));</div>
-<div class="line"><a id="l01859" name="l01859"></a><span class="lineno"> 1859</span>    }</div>
-<div class="line"><a id="l01860" name="l01860"></a><span class="lineno"> 1860</span> </div>
-<div class="line"><a id="l01861" name="l01861"></a><span class="lineno"> 1861</span>    <span class="keywordflow">if</span> (packs_per_int &lt; values_per_reduce &amp;&amp;</div>
-<div class="line"><a id="l01862" name="l01862"></a><span class="lineno"> 1862</span>        i % packs_per_int == packs_per_int - 1) {</div>
-<div class="line"><a id="l01863" name="l01863"></a><span class="lineno"> 1863</span>      out[out_index + i / packs_per_int] = output;</div>
-<div class="line"><a id="l01864" name="l01864"></a><span class="lineno"> 1864</span>      output = 0;</div>
-<div class="line"><a id="l01865" name="l01865"></a><span class="lineno"> 1865</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01866" name="l01866"></a><span class="lineno"> 1866</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
-<div class="line"><a id="l01867" name="l01867"></a><span class="lineno"> 1867</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; writes_per_reduce - 1; j++) {</div>
-<div class="line"><a id="l01868" name="l01868"></a><span class="lineno"> 1868</span>        uint8_t sval = <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(val, j + 1);</div>
-<div class="line"><a id="l01869" name="l01869"></a><span class="lineno"> 1869</span>        output += sval &lt;&lt; (bits * (values_per_reduce + j + i));</div>
-<div class="line"><a id="l01870" name="l01870"></a><span class="lineno"> 1870</span>      }</div>
-<div class="line"><a id="l01871" name="l01871"></a><span class="lineno"> 1871</span>    }</div>
-<div class="line"><a id="l01872" name="l01872"></a><span class="lineno"> 1872</span>  }</div>
-<div class="line"><a id="l01873" name="l01873"></a><span class="lineno"> 1873</span>  <span class="keywordflow">if</span> (writes_per_reduce &gt; 0 &amp;&amp; out_index % writes_per_reduce == 0) {</div>
-<div class="line"><a id="l01874" name="l01874"></a><span class="lineno"> 1874</span>    out[out_index / writes_per_reduce] = output;</div>
-<div class="line"><a id="l01875" name="l01875"></a><span class="lineno"> 1875</span>  }</div>
-<div class="line"><a id="l01876" name="l01876"></a><span class="lineno"> 1876</span>}</div>
+<div class="line"><a id="l01802" name="l01802"></a><span class="lineno"> 1802</span> </div>
+<div class="line"><a id="l01803" name="l01803"></a><span class="lineno"> 1803</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> group_size, <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen01804" data-start="{" data-end="}">
+<div class="line"><a id="l01804" name="l01804"></a><span class="lineno"><a class="line" href="quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494"> 1804</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494">bs_qvm</a>(</div>
+<div class="line"><a id="l01805" name="l01805"></a><span class="lineno"> 1805</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01806" name="l01806"></a><span class="lineno"> 1806</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01807" name="l01807"></a><span class="lineno"> 1807</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01808" name="l01808"></a><span class="lineno"> 1808</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01809" name="l01809"></a><span class="lineno"> 1809</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01810" name="l01810"></a><span class="lineno"> 1810</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; in_vec_size [[buffer(5)]],</div>
+<div class="line"><a id="l01811" name="l01811"></a><span class="lineno"> 1811</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; out_vec_size [[buffer(6)]],</div>
+<div class="line"><a id="l01812" name="l01812"></a><span class="lineno"> 1812</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(7)]],</div>
+<div class="line"><a id="l01813" name="l01813"></a><span class="lineno"> 1813</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(8)]],</div>
+<div class="line"><a id="l01814" name="l01814"></a><span class="lineno"> 1814</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(9)]],</div>
+<div class="line"><a id="l01815" name="l01815"></a><span class="lineno"> 1815</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(10)]],</div>
+<div class="line"><a id="l01816" name="l01816"></a><span class="lineno"> 1816</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(11)]],</div>
+<div class="line"><a id="l01817" name="l01817"></a><span class="lineno"> 1817</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(12)]],</div>
+<div class="line"><a id="l01818" name="l01818"></a><span class="lineno"> 1818</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01819" name="l01819"></a><span class="lineno"> 1819</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01820" name="l01820"></a><span class="lineno"> 1820</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(15)]],</div>
+<div class="line"><a id="l01821" name="l01821"></a><span class="lineno"> 1821</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(16)]],</div>
+<div class="line"><a id="l01822" name="l01822"></a><span class="lineno"> 1822</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(17)]],</div>
+<div class="line"><a id="l01823" name="l01823"></a><span class="lineno"> 1823</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(18)]],</div>
+<div class="line"><a id="l01824" name="l01824"></a><span class="lineno"> 1824</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(19)]],</div>
+<div class="line"><a id="l01825" name="l01825"></a><span class="lineno"> 1825</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(20)]],</div>
+<div class="line"><a id="l01826" name="l01826"></a><span class="lineno"> 1826</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01827" name="l01827"></a><span class="lineno"> 1827</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01828" name="l01828"></a><span class="lineno"> 1828</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01829" name="l01829"></a><span class="lineno"> 1829</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01830" name="l01830"></a><span class="lineno"> 1830</span>      x,</div>
+<div class="line"><a id="l01831" name="l01831"></a><span class="lineno"> 1831</span>      w,</div>
+<div class="line"><a id="l01832" name="l01832"></a><span class="lineno"> 1832</span>      scales,</div>
+<div class="line"><a id="l01833" name="l01833"></a><span class="lineno"> 1833</span>      biases,</div>
+<div class="line"><a id="l01834" name="l01834"></a><span class="lineno"> 1834</span>      lhs_indices,</div>
+<div class="line"><a id="l01835" name="l01835"></a><span class="lineno"> 1835</span>      rhs_indices,</div>
+<div class="line"><a id="l01836" name="l01836"></a><span class="lineno"> 1836</span>      y,</div>
+<div class="line"><a id="l01837" name="l01837"></a><span class="lineno"> 1837</span>      out_vec_size,</div>
+<div class="line"><a id="l01838" name="l01838"></a><span class="lineno"> 1838</span>      batch_ndims,</div>
+<div class="line"><a id="l01839" name="l01839"></a><span class="lineno"> 1839</span>      batch_shape,</div>
+<div class="line"><a id="l01840" name="l01840"></a><span class="lineno"> 1840</span>      lhs_strides,</div>
+<div class="line"><a id="l01841" name="l01841"></a><span class="lineno"> 1841</span>      rhs_strides,</div>
+<div class="line"><a id="l01842" name="l01842"></a><span class="lineno"> 1842</span>      x_batch_ndims,</div>
+<div class="line"><a id="l01843" name="l01843"></a><span class="lineno"> 1843</span>      x_shape,</div>
+<div class="line"><a id="l01844" name="l01844"></a><span class="lineno"> 1844</span>      x_strides,</div>
+<div class="line"><a id="l01845" name="l01845"></a><span class="lineno"> 1845</span>      w_batch_ndims,</div>
+<div class="line"><a id="l01846" name="l01846"></a><span class="lineno"> 1846</span>      w_shape,</div>
+<div class="line"><a id="l01847" name="l01847"></a><span class="lineno"> 1847</span>      w_strides,</div>
+<div class="line"><a id="l01848" name="l01848"></a><span class="lineno"> 1848</span>      s_strides,</div>
+<div class="line"><a id="l01849" name="l01849"></a><span class="lineno"> 1849</span>      b_strides,</div>
+<div class="line"><a id="l01850" name="l01850"></a><span class="lineno"> 1850</span>      tid);</div>
+<div class="line"><a id="l01851" name="l01851"></a><span class="lineno"> 1851</span>  <a class="code hl_function" href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl&lt;T, group_size, bits&gt;</a>(</div>
+<div class="line"><a id="l01852" name="l01852"></a><span class="lineno"> 1852</span>      w,</div>
+<div class="line"><a id="l01853" name="l01853"></a><span class="lineno"> 1853</span>      scales,</div>
+<div class="line"><a id="l01854" name="l01854"></a><span class="lineno"> 1854</span>      biases,</div>
+<div class="line"><a id="l01855" name="l01855"></a><span class="lineno"> 1855</span>      x,</div>
+<div class="line"><a id="l01856" name="l01856"></a><span class="lineno"> 1856</span>      y,</div>
+<div class="line"><a id="l01857" name="l01857"></a><span class="lineno"> 1857</span>      in_vec_size,</div>
+<div class="line"><a id="l01858" name="l01858"></a><span class="lineno"> 1858</span>      out_vec_size,</div>
+<div class="line"><a id="l01859" name="l01859"></a><span class="lineno"> 1859</span>      tid,</div>
+<div class="line"><a id="l01860" name="l01860"></a><span class="lineno"> 1860</span>      simd_gid,</div>
+<div class="line"><a id="l01861" name="l01861"></a><span class="lineno"> 1861</span>      simd_lid);</div>
+<div class="line"><a id="l01862" name="l01862"></a><span class="lineno"> 1862</span>}</div>
 </div>
-<div class="line"><a id="l01877" name="l01877"></a><span class="lineno"> 1877</span> </div>
-<div class="line"><a id="l01878" name="l01878"></a><span class="lineno"> 1878</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen01879" data-start="{" data-end="}">
-<div class="line"><a id="l01879" name="l01879"></a><span class="lineno"><a class="line" href="quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c"> 1879</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c">affine_quantize_scales_biases</a>(</div>
-<div class="line"><a id="l01880" name="l01880"></a><span class="lineno"> 1880</span>    <span class="keyword">const</span> device T* w [[buffer(0)]],</div>
-<div class="line"><a id="l01881" name="l01881"></a><span class="lineno"> 1881</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01882" name="l01882"></a><span class="lineno"> 1882</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01883" name="l01883"></a><span class="lineno"> 1883</span>    device uint8_t* out [[buffer(3)]],</div>
-<div class="line"><a id="l01884" name="l01884"></a><span class="lineno"> 1884</span>    uint2 index [[thread_position_in_grid]],</div>
-<div class="line"><a id="l01885" name="l01885"></a><span class="lineno"> 1885</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l01886" name="l01886"></a><span class="lineno"> 1886</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> uint8_bits = 8;</div>
-<div class="line"><a id="l01887" name="l01887"></a><span class="lineno"> 1887</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_int = uint8_bits / bits;</div>
-<div class="line"><a id="l01888" name="l01888"></a><span class="lineno"> 1888</span>  <span class="keyword">constexpr</span> T n_bins = (1 &lt;&lt; bits) - 1;</div>
-<div class="line"><a id="l01889" name="l01889"></a><span class="lineno"> 1889</span> </div>
-<div class="line"><a id="l01890" name="l01890"></a><span class="lineno"> 1890</span>  <span class="keywordtype">size_t</span> offset = index.x + grid_dim.x * size_t(index.y);</div>
-<div class="line"><a id="l01891" name="l01891"></a><span class="lineno"> 1891</span>  <span class="keywordtype">size_t</span> in_index = offset * packs_per_int;</div>
-<div class="line"><a id="l01892" name="l01892"></a><span class="lineno"> 1892</span>  <span class="keywordtype">size_t</span> gindex = in_index / group_size;</div>
-<div class="line"><a id="l01893" name="l01893"></a><span class="lineno"> 1893</span> </div>
-<div class="line"><a id="l01894" name="l01894"></a><span class="lineno"> 1894</span>  T scale = scales[gindex];</div>
-<div class="line"><a id="l01895" name="l01895"></a><span class="lineno"> 1895</span>  T bias = biases[gindex];</div>
-<div class="line"><a id="l01896" name="l01896"></a><span class="lineno"> 1896</span> </div>
-<div class="line"><a id="l01897" name="l01897"></a><span class="lineno"> 1897</span>  uint8_t output = 0;</div>
-<div class="line"><a id="l01898" name="l01898"></a><span class="lineno"> 1898</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
-<div class="line"><a id="l01899" name="l01899"></a><span class="lineno"> 1899</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; packs_per_int; i++) {</div>
-<div class="line"><a id="l01900" name="l01900"></a><span class="lineno"> 1900</span>    uint8_t val = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(<a class="code hl_function" href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">round</a>((w[in_index + i] - bias) / scale), n_bins);</div>
-<div class="line"><a id="l01901" name="l01901"></a><span class="lineno"> 1901</span>    <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l01902" name="l01902"></a><span class="lineno"> 1902</span>      output = val;</div>
-<div class="line"><a id="l01903" name="l01903"></a><span class="lineno"> 1903</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l01904" name="l01904"></a><span class="lineno"> 1904</span>      output += val &lt;&lt; (bits * i);</div>
-<div class="line"><a id="l01905" name="l01905"></a><span class="lineno"> 1905</span>    }</div>
-<div class="line"><a id="l01906" name="l01906"></a><span class="lineno"> 1906</span>  }</div>
-<div class="line"><a id="l01907" name="l01907"></a><span class="lineno"> 1907</span>  out[offset] = output;</div>
-<div class="line"><a id="l01908" name="l01908"></a><span class="lineno"> 1908</span>}</div>
+<div class="line"><a id="l01863" name="l01863"></a><span class="lineno"> 1863</span> </div>
+<div class="line"><a id="l01864" name="l01864"></a><span class="lineno"> 1864</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l01865" name="l01865"></a><span class="lineno"> 1865</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l01866" name="l01866"></a><span class="lineno"> 1866</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
+<div class="line"><a id="l01867" name="l01867"></a><span class="lineno"> 1867</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
+<div class="line"><a id="l01868" name="l01868"></a><span class="lineno"> 1868</span>    <span class="keyword">const</span> <span class="keywordtype">bool</span> aligned_N,</div>
+<div class="line"><a id="l01869" name="l01869"></a><span class="lineno"> 1869</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
+<div class="line"><a id="l01870" name="l01870"></a><span class="lineno"> 1870</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
+<div class="line"><a id="l01871" name="l01871"></a><span class="lineno"> 1871</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
+<div class="foldopen" id="foldopen01872" data-start="{" data-end="}">
+<div class="line"><a id="l01872" name="l01872"></a><span class="lineno"><a class="line" href="quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84"> 1872</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84">bs_qmm_t</a>(</div>
+<div class="line"><a id="l01873" name="l01873"></a><span class="lineno"> 1873</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01874" name="l01874"></a><span class="lineno"> 1874</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01875" name="l01875"></a><span class="lineno"> 1875</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01876" name="l01876"></a><span class="lineno"> 1876</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01877" name="l01877"></a><span class="lineno"> 1877</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01878" name="l01878"></a><span class="lineno"> 1878</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
+<div class="line"><a id="l01879" name="l01879"></a><span class="lineno"> 1879</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
+<div class="line"><a id="l01880" name="l01880"></a><span class="lineno"> 1880</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
+<div class="line"><a id="l01881" name="l01881"></a><span class="lineno"> 1881</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
+<div class="line"><a id="l01882" name="l01882"></a><span class="lineno"> 1882</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
+<div class="line"><a id="l01883" name="l01883"></a><span class="lineno"> 1883</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
+<div class="line"><a id="l01884" name="l01884"></a><span class="lineno"> 1884</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
+<div class="line"><a id="l01885" name="l01885"></a><span class="lineno"> 1885</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
+<div class="line"><a id="l01886" name="l01886"></a><span class="lineno"> 1886</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01887" name="l01887"></a><span class="lineno"> 1887</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01888" name="l01888"></a><span class="lineno"> 1888</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
+<div class="line"><a id="l01889" name="l01889"></a><span class="lineno"> 1889</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(16)]],</div>
+<div class="line"><a id="l01890" name="l01890"></a><span class="lineno"> 1890</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(17)]],</div>
+<div class="line"><a id="l01891" name="l01891"></a><span class="lineno"> 1891</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(18)]],</div>
+<div class="line"><a id="l01892" name="l01892"></a><span class="lineno"> 1892</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(19)]],</div>
+<div class="line"><a id="l01893" name="l01893"></a><span class="lineno"> 1893</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(20)]],</div>
+<div class="line"><a id="l01894" name="l01894"></a><span class="lineno"> 1894</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(21)]],</div>
+<div class="line"><a id="l01895" name="l01895"></a><span class="lineno"> 1895</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01896" name="l01896"></a><span class="lineno"> 1896</span>    uint lid [[thread_index_in_threadgroup]],</div>
+<div class="line"><a id="l01897" name="l01897"></a><span class="lineno"> 1897</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01898" name="l01898"></a><span class="lineno"> 1898</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01899" name="l01899"></a><span class="lineno"> 1899</span>  (void)lid;</div>
+<div class="line"><a id="l01900" name="l01900"></a><span class="lineno"> 1900</span> </div>
+<div class="line"><a id="l01901" name="l01901"></a><span class="lineno"> 1901</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01902" name="l01902"></a><span class="lineno"> 1902</span> </div>
+<div class="line"><a id="l01903" name="l01903"></a><span class="lineno"> 1903</span>  threadgroup T Xs[BM * BK_padded];</div>
+<div class="line"><a id="l01904" name="l01904"></a><span class="lineno"> 1904</span>  threadgroup T Ws[BN * BK_padded];</div>
+<div class="line"><a id="l01905" name="l01905"></a><span class="lineno"> 1905</span> </div>
+<div class="line"><a id="l01906" name="l01906"></a><span class="lineno"> 1906</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01907" name="l01907"></a><span class="lineno"> 1907</span>      x,</div>
+<div class="line"><a id="l01908" name="l01908"></a><span class="lineno"> 1908</span>      w,</div>
+<div class="line"><a id="l01909" name="l01909"></a><span class="lineno"> 1909</span>      scales,</div>
+<div class="line"><a id="l01910" name="l01910"></a><span class="lineno"> 1910</span>      biases,</div>
+<div class="line"><a id="l01911" name="l01911"></a><span class="lineno"> 1911</span>      lhs_indices,</div>
+<div class="line"><a id="l01912" name="l01912"></a><span class="lineno"> 1912</span>      rhs_indices,</div>
+<div class="line"><a id="l01913" name="l01913"></a><span class="lineno"> 1913</span>      y,</div>
+<div class="line"><a id="l01914" name="l01914"></a><span class="lineno"> 1914</span>      M * N,</div>
+<div class="line"><a id="l01915" name="l01915"></a><span class="lineno"> 1915</span>      batch_ndims,</div>
+<div class="line"><a id="l01916" name="l01916"></a><span class="lineno"> 1916</span>      batch_shape,</div>
+<div class="line"><a id="l01917" name="l01917"></a><span class="lineno"> 1917</span>      lhs_strides,</div>
+<div class="line"><a id="l01918" name="l01918"></a><span class="lineno"> 1918</span>      rhs_strides,</div>
+<div class="line"><a id="l01919" name="l01919"></a><span class="lineno"> 1919</span>      x_batch_ndims,</div>
+<div class="line"><a id="l01920" name="l01920"></a><span class="lineno"> 1920</span>      x_shape,</div>
+<div class="line"><a id="l01921" name="l01921"></a><span class="lineno"> 1921</span>      x_strides,</div>
+<div class="line"><a id="l01922" name="l01922"></a><span class="lineno"> 1922</span>      w_batch_ndims,</div>
+<div class="line"><a id="l01923" name="l01923"></a><span class="lineno"> 1923</span>      w_shape,</div>
+<div class="line"><a id="l01924" name="l01924"></a><span class="lineno"> 1924</span>      w_strides,</div>
+<div class="line"><a id="l01925" name="l01925"></a><span class="lineno"> 1925</span>      s_strides,</div>
+<div class="line"><a id="l01926" name="l01926"></a><span class="lineno"> 1926</span>      b_strides,</div>
+<div class="line"><a id="l01927" name="l01927"></a><span class="lineno"> 1927</span>      tid);</div>
+<div class="line"><a id="l01928" name="l01928"></a><span class="lineno"> 1928</span>  <a class="code hl_function" href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl&lt;T, group_size, bits, aligned_N, BM, BK, BN&gt;</a>(</div>
+<div class="line"><a id="l01929" name="l01929"></a><span class="lineno"> 1929</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01930" name="l01930"></a><span class="lineno"> 1930</span>}</div>
 </div>
-<div class="line"><a id="l01909" name="l01909"></a><span class="lineno"> 1909</span> </div>
-<div class="line"><a id="l01910" name="l01910"></a><span class="lineno"> 1910</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits&gt;</div>
-<div class="foldopen" id="foldopen01911" data-start="{" data-end="}">
-<div class="line"><a id="l01911" name="l01911"></a><span class="lineno"><a class="line" href="quantized_8h.html#a6076203615038eb06816158f7b3869c6"> 1911</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a6076203615038eb06816158f7b3869c6">affine_dequantize</a>(</div>
-<div class="line"><a id="l01912" name="l01912"></a><span class="lineno"> 1912</span>    <span class="keyword">const</span> device uint8_t* w [[buffer(0)]],</div>
-<div class="line"><a id="l01913" name="l01913"></a><span class="lineno"> 1913</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
-<div class="line"><a id="l01914" name="l01914"></a><span class="lineno"> 1914</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
-<div class="line"><a id="l01915" name="l01915"></a><span class="lineno"> 1915</span>    device T* out [[buffer(3)]],</div>
-<div class="line"><a id="l01916" name="l01916"></a><span class="lineno"> 1916</span>    uint2 index [[thread_position_in_grid]],</div>
-<div class="line"><a id="l01917" name="l01917"></a><span class="lineno"> 1917</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
-<div class="line"><a id="l01918" name="l01918"></a><span class="lineno"> 1918</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> uint8_bits = 8;</div>
-<div class="line"><a id="l01919" name="l01919"></a><span class="lineno"> 1919</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_int = uint8_bits / bits;</div>
-<div class="line"><a id="l01920" name="l01920"></a><span class="lineno"> 1920</span> </div>
-<div class="line"><a id="l01921" name="l01921"></a><span class="lineno"> 1921</span>  <span class="keywordtype">size_t</span> offset = index.x + grid_dim.x * size_t(index.y);</div>
-<div class="line"><a id="l01922" name="l01922"></a><span class="lineno"> 1922</span>  <span class="keywordtype">size_t</span> oindex = offset * packs_per_int;</div>
-<div class="line"><a id="l01923" name="l01923"></a><span class="lineno"> 1923</span>  <span class="keywordtype">size_t</span> gindex = oindex / group_size;</div>
-<div class="line"><a id="l01924" name="l01924"></a><span class="lineno"> 1924</span>  T scale = scales[gindex];</div>
-<div class="line"><a id="l01925" name="l01925"></a><span class="lineno"> 1925</span>  T bias = biases[gindex];</div>
-<div class="line"><a id="l01926" name="l01926"></a><span class="lineno"> 1926</span>  uint val = w[offset];</div>
-<div class="line"><a id="l01927" name="l01927"></a><span class="lineno"> 1927</span> </div>
-<div class="line"><a id="l01928" name="l01928"></a><span class="lineno"> 1928</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
-<div class="line"><a id="l01929" name="l01929"></a><span class="lineno"> 1929</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; packs_per_int; i++) {</div>
-<div class="line"><a id="l01930" name="l01930"></a><span class="lineno"> 1930</span>    uint8_t d;</div>
-<div class="line"><a id="l01931" name="l01931"></a><span class="lineno"> 1931</span>    <span class="keywordflow">if</span> (bits == 2) {</div>
-<div class="line"><a id="l01932" name="l01932"></a><span class="lineno"> 1932</span>      d = (val &gt;&gt; (bits * i)) &amp; 0x03;</div>
-<div class="line"><a id="l01933" name="l01933"></a><span class="lineno"> 1933</span>    } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
-<div class="line"><a id="l01934" name="l01934"></a><span class="lineno"> 1934</span>      d = (val &gt;&gt; (bits * i)) &amp; 0x0f;</div>
-<div class="line"><a id="l01935" name="l01935"></a><span class="lineno"> 1935</span>    } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
-<div class="line"><a id="l01936" name="l01936"></a><span class="lineno"> 1936</span>      d = val;</div>
-<div class="line"><a id="l01937" name="l01937"></a><span class="lineno"> 1937</span>    }</div>
-<div class="line"><a id="l01938" name="l01938"></a><span class="lineno"> 1938</span>    out[oindex + i] = scale * d + bias;</div>
-<div class="line"><a id="l01939" name="l01939"></a><span class="lineno"> 1939</span>  }</div>
-<div class="line"><a id="l01940" name="l01940"></a><span class="lineno"> 1940</span>}</div>
+<div class="line"><a id="l01931" name="l01931"></a><span class="lineno"> 1931</span> </div>
+<div class="line"><a id="l01932" name="l01932"></a><span class="lineno"> 1932</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l01933" name="l01933"></a><span class="lineno"> 1933</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l01934" name="l01934"></a><span class="lineno"> 1934</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> group_size,</div>
+<div class="line"><a id="l01935" name="l01935"></a><span class="lineno"> 1935</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">bits</a>,</div>
+<div class="line"><a id="l01936" name="l01936"></a><span class="lineno"> 1936</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BM = 32,</div>
+<div class="line"><a id="l01937" name="l01937"></a><span class="lineno"> 1937</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BK = 32,</div>
+<div class="line"><a id="l01938" name="l01938"></a><span class="lineno"> 1938</span>    <span class="keyword">const</span> <span class="keywordtype">int</span> BN = 32&gt;</div>
+<div class="foldopen" id="foldopen01939" data-start="{" data-end="}">
+<div class="line"><a id="l01939" name="l01939"></a><span class="lineno"><a class="line" href="quantized_8h.html#a1a66b061c46383952a0f067c3848971f"> 1939</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a1a66b061c46383952a0f067c3848971f">bs_qmm_n</a>(</div>
+<div class="line"><a id="l01940" name="l01940"></a><span class="lineno"> 1940</span>    <span class="keyword">const</span> device uint32_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l01941" name="l01941"></a><span class="lineno"> 1941</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l01942" name="l01942"></a><span class="lineno"> 1942</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l01943" name="l01943"></a><span class="lineno"> 1943</span>    <span class="keyword">const</span> device T* x [[buffer(3)]],</div>
+<div class="line"><a id="l01944" name="l01944"></a><span class="lineno"> 1944</span>    device T* y [[buffer(4)]],</div>
+<div class="line"><a id="l01945" name="l01945"></a><span class="lineno"> 1945</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; K [[buffer(5)]],</div>
+<div class="line"><a id="l01946" name="l01946"></a><span class="lineno"> 1946</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N [[buffer(6)]],</div>
+<div class="line"><a id="l01947" name="l01947"></a><span class="lineno"> 1947</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; M [[buffer(7)]],</div>
+<div class="line"><a id="l01948" name="l01948"></a><span class="lineno"> 1948</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; x_batch_ndims [[buffer(8)]],</div>
+<div class="line"><a id="l01949" name="l01949"></a><span class="lineno"> 1949</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* x_shape [[buffer(9)]],</div>
+<div class="line"><a id="l01950" name="l01950"></a><span class="lineno"> 1950</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* x_strides [[buffer(10)]],</div>
+<div class="line"><a id="l01951" name="l01951"></a><span class="lineno"> 1951</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; w_batch_ndims [[buffer(11)]],</div>
+<div class="line"><a id="l01952" name="l01952"></a><span class="lineno"> 1952</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* w_shape [[buffer(12)]],</div>
+<div class="line"><a id="l01953" name="l01953"></a><span class="lineno"> 1953</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* w_strides [[buffer(13)]],</div>
+<div class="line"><a id="l01954" name="l01954"></a><span class="lineno"> 1954</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* s_strides [[buffer(14)]],</div>
+<div class="line"><a id="l01955" name="l01955"></a><span class="lineno"> 1955</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* b_strides [[buffer(15)]],</div>
+<div class="line"><a id="l01956" name="l01956"></a><span class="lineno"> 1956</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; batch_ndims [[buffer(16)]],</div>
+<div class="line"><a id="l01957" name="l01957"></a><span class="lineno"> 1957</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape [[buffer(17)]],</div>
+<div class="line"><a id="l01958" name="l01958"></a><span class="lineno"> 1958</span>    <span class="keyword">const</span> device uint32_t* lhs_indices [[buffer(18)]],</div>
+<div class="line"><a id="l01959" name="l01959"></a><span class="lineno"> 1959</span>    <span class="keyword">const</span> device uint32_t* rhs_indices [[buffer(19)]],</div>
+<div class="line"><a id="l01960" name="l01960"></a><span class="lineno"> 1960</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* lhs_strides [[buffer(20)]],</div>
+<div class="line"><a id="l01961" name="l01961"></a><span class="lineno"> 1961</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* rhs_strides [[buffer(21)]],</div>
+<div class="line"><a id="l01962" name="l01962"></a><span class="lineno"> 1962</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l01963" name="l01963"></a><span class="lineno"> 1963</span>    uint lid [[thread_index_in_threadgroup]],</div>
+<div class="line"><a id="l01964" name="l01964"></a><span class="lineno"> 1964</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l01965" name="l01965"></a><span class="lineno"> 1965</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l01966" name="l01966"></a><span class="lineno"> 1966</span>  (void)lid;</div>
+<div class="line"><a id="l01967" name="l01967"></a><span class="lineno"> 1967</span> </div>
+<div class="line"><a id="l01968" name="l01968"></a><span class="lineno"> 1968</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BK_padded = (BK + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01969" name="l01969"></a><span class="lineno"> 1969</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN_padded = (BN + 16 / <span class="keyword">sizeof</span>(T));</div>
+<div class="line"><a id="l01970" name="l01970"></a><span class="lineno"> 1970</span> </div>
+<div class="line"><a id="l01971" name="l01971"></a><span class="lineno"> 1971</span>  threadgroup T Xs[BM * BK_padded];</div>
+<div class="line"><a id="l01972" name="l01972"></a><span class="lineno"> 1972</span>  threadgroup T Ws[BK * BN_padded];</div>
+<div class="line"><a id="l01973" name="l01973"></a><span class="lineno"> 1973</span> </div>
+<div class="line"><a id="l01974" name="l01974"></a><span class="lineno"> 1974</span>  <a class="code hl_function" href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets&lt;T&gt;</a>(</div>
+<div class="line"><a id="l01975" name="l01975"></a><span class="lineno"> 1975</span>      x,</div>
+<div class="line"><a id="l01976" name="l01976"></a><span class="lineno"> 1976</span>      w,</div>
+<div class="line"><a id="l01977" name="l01977"></a><span class="lineno"> 1977</span>      scales,</div>
+<div class="line"><a id="l01978" name="l01978"></a><span class="lineno"> 1978</span>      biases,</div>
+<div class="line"><a id="l01979" name="l01979"></a><span class="lineno"> 1979</span>      lhs_indices,</div>
+<div class="line"><a id="l01980" name="l01980"></a><span class="lineno"> 1980</span>      rhs_indices,</div>
+<div class="line"><a id="l01981" name="l01981"></a><span class="lineno"> 1981</span>      y,</div>
+<div class="line"><a id="l01982" name="l01982"></a><span class="lineno"> 1982</span>      M * N,</div>
+<div class="line"><a id="l01983" name="l01983"></a><span class="lineno"> 1983</span>      batch_ndims,</div>
+<div class="line"><a id="l01984" name="l01984"></a><span class="lineno"> 1984</span>      batch_shape,</div>
+<div class="line"><a id="l01985" name="l01985"></a><span class="lineno"> 1985</span>      lhs_strides,</div>
+<div class="line"><a id="l01986" name="l01986"></a><span class="lineno"> 1986</span>      rhs_strides,</div>
+<div class="line"><a id="l01987" name="l01987"></a><span class="lineno"> 1987</span>      x_batch_ndims,</div>
+<div class="line"><a id="l01988" name="l01988"></a><span class="lineno"> 1988</span>      x_shape,</div>
+<div class="line"><a id="l01989" name="l01989"></a><span class="lineno"> 1989</span>      x_strides,</div>
+<div class="line"><a id="l01990" name="l01990"></a><span class="lineno"> 1990</span>      w_batch_ndims,</div>
+<div class="line"><a id="l01991" name="l01991"></a><span class="lineno"> 1991</span>      w_shape,</div>
+<div class="line"><a id="l01992" name="l01992"></a><span class="lineno"> 1992</span>      w_strides,</div>
+<div class="line"><a id="l01993" name="l01993"></a><span class="lineno"> 1993</span>      s_strides,</div>
+<div class="line"><a id="l01994" name="l01994"></a><span class="lineno"> 1994</span>      b_strides,</div>
+<div class="line"><a id="l01995" name="l01995"></a><span class="lineno"> 1995</span>      tid);</div>
+<div class="line"><a id="l01996" name="l01996"></a><span class="lineno"> 1996</span>  <a class="code hl_function" href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl&lt;T, group_size, bits, BM, BK, BN&gt;</a>(</div>
+<div class="line"><a id="l01997" name="l01997"></a><span class="lineno"> 1997</span>      w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);</div>
+<div class="line"><a id="l01998" name="l01998"></a><span class="lineno"> 1998</span>}</div>
+</div>
+<div class="line"><a id="l01999" name="l01999"></a><span class="lineno"> 1999</span> </div>
+<div class="line"><a id="l02000" name="l02000"></a><span class="lineno"> 2000</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen02001" data-start="{" data-end="}">
+<div class="line"><a id="l02001" name="l02001"></a><span class="lineno"><a class="line" href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59"> 2001</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59">affine_quantize</a>(</div>
+<div class="line"><a id="l02002" name="l02002"></a><span class="lineno"> 2002</span>    <span class="keyword">const</span> device T* w [[buffer(0)]],</div>
+<div class="line"><a id="l02003" name="l02003"></a><span class="lineno"> 2003</span>    device uint8_t* out [[buffer(1)]],</div>
+<div class="line"><a id="l02004" name="l02004"></a><span class="lineno"> 2004</span>    device T* scales [[buffer(2)]],</div>
+<div class="line"><a id="l02005" name="l02005"></a><span class="lineno"> 2005</span>    device T* biases [[buffer(3)]],</div>
+<div class="line"><a id="l02006" name="l02006"></a><span class="lineno"> 2006</span>    uint2 index [[thread_position_in_grid]],</div>
+<div class="line"><a id="l02007" name="l02007"></a><span class="lineno"> 2007</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
+<div class="line"><a id="l02008" name="l02008"></a><span class="lineno"> 2008</span>  <span class="keyword">constexpr</span> T eps = T(1e-7);</div>
+<div class="line"><a id="l02009" name="l02009"></a><span class="lineno"> 2009</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> = 32;</div>
+<div class="line"><a id="l02010" name="l02010"></a><span class="lineno"> 2010</span>  <span class="keyword">constexpr</span> T n_bins = (1 &lt;&lt; bits) - 1;</div>
+<div class="line"><a id="l02011" name="l02011"></a><span class="lineno"> 2011</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;</div>
+<div class="line"><a id="l02012" name="l02012"></a><span class="lineno"> 2012</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> values_per_reduce = group_size / <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>;</div>
+<div class="line"><a id="l02013" name="l02013"></a><span class="lineno"> 2013</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> writes_per_reduce = packs_per_int / values_per_reduce;</div>
+<div class="line"><a id="l02014" name="l02014"></a><span class="lineno"> 2014</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> writes_per_pack =</div>
+<div class="line"><a id="l02015" name="l02015"></a><span class="lineno"> 2015</span>      writes_per_reduce &gt; 1 ? 1 : values_per_reduce / packs_per_int;</div>
+<div class="line"><a id="l02016" name="l02016"></a><span class="lineno"> 2016</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> power_of_2_bits = (bits &amp; (bits - 1)) == 0;</div>
+<div class="line"><a id="l02017" name="l02017"></a><span class="lineno"> 2017</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> bytes_per_pack = power_of_2_bits ? 1 : 3;</div>
+<div class="line"><a id="l02018" name="l02018"></a><span class="lineno"> 2018</span> </div>
+<div class="line"><a id="l02019" name="l02019"></a><span class="lineno"> 2019</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l02020" name="l02020"></a><span class="lineno"> 2020</span>      group_size % <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> == 0,</div>
+<div class="line"><a id="l02021" name="l02021"></a><span class="lineno"> 2021</span>      <span class="stringliteral">&quot;Group size must be divisible by simd size.&quot;</span>);</div>
+<div class="line"><a id="l02022" name="l02022"></a><span class="lineno"> 2022</span> </div>
+<div class="line"><a id="l02023" name="l02023"></a><span class="lineno"> 2023</span>  <span class="keywordtype">size_t</span> offset = index.x + grid_dim.x * size_t(index.y);</div>
+<div class="line"><a id="l02024" name="l02024"></a><span class="lineno"> 2024</span>  <span class="keywordtype">size_t</span> in_index = offset * values_per_reduce;</div>
+<div class="line"><a id="l02025" name="l02025"></a><span class="lineno"> 2025</span>  <span class="keywordtype">size_t</span> out_index = power_of_2_bits</div>
+<div class="line"><a id="l02026" name="l02026"></a><span class="lineno"> 2026</span>      ? offset * writes_per_pack</div>
+<div class="line"><a id="l02027" name="l02027"></a><span class="lineno"> 2027</span>      : offset * bytes_per_pack / writes_per_reduce;</div>
+<div class="line"><a id="l02028" name="l02028"></a><span class="lineno"> 2028</span> </div>
+<div class="line"><a id="l02029" name="l02029"></a><span class="lineno"> 2029</span>  T w_thread[values_per_reduce];</div>
+<div class="line"><a id="l02030" name="l02030"></a><span class="lineno"> 2030</span>  T w_min = <a class="code hl_struct" href="struct_limits.html">Limits&lt;T&gt;::max</a>;</div>
+<div class="line"><a id="l02031" name="l02031"></a><span class="lineno"> 2031</span>  T w_max = 0;</div>
+<div class="line"><a id="l02032" name="l02032"></a><span class="lineno"> 2032</span> </div>
+<div class="line"><a id="l02033" name="l02033"></a><span class="lineno"> 2033</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
+<div class="line"><a id="l02034" name="l02034"></a><span class="lineno"> 2034</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_reduce; i++) {</div>
+<div class="line"><a id="l02035" name="l02035"></a><span class="lineno"> 2035</span>    T val = w[in_index + i];</div>
+<div class="line"><a id="l02036" name="l02036"></a><span class="lineno"> 2036</span>    w_thread[i] = val;</div>
+<div class="line"><a id="l02037" name="l02037"></a><span class="lineno"> 2037</span>    w_min = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(w_min, val);</div>
+<div class="line"><a id="l02038" name="l02038"></a><span class="lineno"> 2038</span>    w_max = <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">max</a>(w_max, val);</div>
+<div class="line"><a id="l02039" name="l02039"></a><span class="lineno"> 2039</span>  }</div>
+<div class="line"><a id="l02040" name="l02040"></a><span class="lineno"> 2040</span> </div>
+<div class="line"><a id="l02041" name="l02041"></a><span class="lineno"> 2041</span>  w_min = <a class="code hl_function" href="namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b">simd_min</a>(w_min);</div>
+<div class="line"><a id="l02042" name="l02042"></a><span class="lineno"> 2042</span>  w_max = <a class="code hl_function" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a>(w_max);</div>
+<div class="line"><a id="l02043" name="l02043"></a><span class="lineno"> 2043</span> </div>
+<div class="line"><a id="l02044" name="l02044"></a><span class="lineno"> 2044</span>  T scale = <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">max</a>((w_max - w_min) / n_bins, eps);</div>
+<div class="line"><a id="l02045" name="l02045"></a><span class="lineno"> 2045</span>  <span class="keywordtype">bool</span> side = <a class="code hl_function" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">abs</a>(w_min) &gt; <a class="code hl_function" href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">abs</a>(w_max);</div>
+<div class="line"><a id="l02046" name="l02046"></a><span class="lineno"> 2046</span>  scale = side ? scale : -scale;</div>
+<div class="line"><a id="l02047" name="l02047"></a><span class="lineno"> 2047</span>  T edge = side ? w_min : w_max;</div>
+<div class="line"><a id="l02048" name="l02048"></a><span class="lineno"> 2048</span>  T q0 = <a class="code hl_function" href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">round</a>(edge / scale);</div>
+<div class="line"><a id="l02049" name="l02049"></a><span class="lineno"> 2049</span>  <span class="keywordtype">bool</span> at_zero = q0 == 0.0f;</div>
+<div class="line"><a id="l02050" name="l02050"></a><span class="lineno"> 2050</span>  scale = at_zero ? scale : edge / q0;</div>
+<div class="line"><a id="l02051" name="l02051"></a><span class="lineno"> 2051</span>  T bias = at_zero ? T(0) : edge;</div>
+<div class="line"><a id="l02052" name="l02052"></a><span class="lineno"> 2052</span> </div>
+<div class="line"><a id="l02053" name="l02053"></a><span class="lineno"> 2053</span>  <span class="comment">// Write out the scales and biases</span></div>
+<div class="line"><a id="l02054" name="l02054"></a><span class="lineno"> 2054</span>  <span class="keywordtype">size_t</span> gindex = in_index / group_size;</div>
+<div class="line"><a id="l02055" name="l02055"></a><span class="lineno"> 2055</span>  <span class="keywordflow">if</span> (in_index % group_size == 0) {</div>
+<div class="line"><a id="l02056" name="l02056"></a><span class="lineno"> 2056</span>    scales[gindex] = scale;</div>
+<div class="line"><a id="l02057" name="l02057"></a><span class="lineno"> 2057</span>    biases[gindex] = bias;</div>
+<div class="line"><a id="l02058" name="l02058"></a><span class="lineno"> 2058</span>  }</div>
+<div class="line"><a id="l02059" name="l02059"></a><span class="lineno"> 2059</span> </div>
+<div class="line"><a id="l02060" name="l02060"></a><span class="lineno"> 2060</span>  <span class="comment">// We accumulate 3 bytes worth for 3/6 bit so we need a uint32_t</span></div>
+<div class="line"><a id="l02061" name="l02061"></a><span class="lineno"> 2061</span>  uint32_t output = 0;</div>
+<div class="line"><a id="l02062" name="l02062"></a><span class="lineno"> 2062</span> </div>
+<div class="line"><a id="l02063" name="l02063"></a><span class="lineno"> 2063</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
+<div class="line"><a id="l02064" name="l02064"></a><span class="lineno"> 2064</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; values_per_reduce; i++) {</div>
+<div class="line"><a id="l02065" name="l02065"></a><span class="lineno"> 2065</span>    uint8_t val = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(<a class="code hl_function" href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">round</a>((w_thread[i] - bias) / scale), n_bins);</div>
+<div class="line"><a id="l02066" name="l02066"></a><span class="lineno"> 2066</span>    <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l02067" name="l02067"></a><span class="lineno"> 2067</span>      output = val;</div>
+<div class="line"><a id="l02068" name="l02068"></a><span class="lineno"> 2068</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l02069" name="l02069"></a><span class="lineno"> 2069</span>      output += val &lt;&lt; (bits * (i % packs_per_int));</div>
+<div class="line"><a id="l02070" name="l02070"></a><span class="lineno"> 2070</span>    }</div>
+<div class="line"><a id="l02071" name="l02071"></a><span class="lineno"> 2071</span> </div>
+<div class="line"><a id="l02072" name="l02072"></a><span class="lineno"> 2072</span>    <span class="keywordflow">if</span> (packs_per_int &lt; values_per_reduce &amp;&amp;</div>
+<div class="line"><a id="l02073" name="l02073"></a><span class="lineno"> 2073</span>        i % packs_per_int == packs_per_int - 1) {</div>
+<div class="line"><a id="l02074" name="l02074"></a><span class="lineno"> 2074</span>      out[out_index + i / packs_per_int] = output;</div>
+<div class="line"><a id="l02075" name="l02075"></a><span class="lineno"> 2075</span>      output = 0;</div>
+<div class="line"><a id="l02076" name="l02076"></a><span class="lineno"> 2076</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l02077" name="l02077"></a><span class="lineno"> 2077</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
+<div class="line"><a id="l02078" name="l02078"></a><span class="lineno"> 2078</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 1; j &lt; writes_per_reduce; j++) {</div>
+<div class="line"><a id="l02079" name="l02079"></a><span class="lineno"> 2079</span>        uint8_t sval = <a class="code hl_function" href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">simd_shuffle_down</a>(val, j);</div>
+<div class="line"><a id="l02080" name="l02080"></a><span class="lineno"> 2080</span>        output += sval &lt;&lt; (bits * (j * values_per_reduce + i));</div>
+<div class="line"><a id="l02081" name="l02081"></a><span class="lineno"> 2081</span>      }</div>
+<div class="line"><a id="l02082" name="l02082"></a><span class="lineno"> 2082</span>    }</div>
+<div class="line"><a id="l02083" name="l02083"></a><span class="lineno"> 2083</span>  }</div>
+<div class="line"><a id="l02084" name="l02084"></a><span class="lineno"> 2084</span>  <span class="keywordflow">if</span> (bits == 3 || bits == 6) {</div>
+<div class="line"><a id="l02085" name="l02085"></a><span class="lineno"> 2085</span>    <span class="keywordflow">if</span> (in_index % packs_per_int == 0 &amp;&amp; out_index % bytes_per_pack == 0) {</div>
+<div class="line"><a id="l02086" name="l02086"></a><span class="lineno"> 2086</span>      out[out_index] = output &amp; 0xff;</div>
+<div class="line"><a id="l02087" name="l02087"></a><span class="lineno"> 2087</span>      out[out_index + 1] = (output &amp; 0xff00) &gt;&gt; 8;</div>
+<div class="line"><a id="l02088" name="l02088"></a><span class="lineno"> 2088</span>      out[out_index + 2] = (output &amp; 0xff0000) &gt;&gt; 16;</div>
+<div class="line"><a id="l02089" name="l02089"></a><span class="lineno"> 2089</span>    }</div>
+<div class="line"><a id="l02090" name="l02090"></a><span class="lineno"> 2090</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l02091" name="l02091"></a><span class="lineno"> 2091</span>    <span class="keywordflow">if</span> (writes_per_reduce &gt; 0 &amp;&amp; out_index % writes_per_reduce == 0) {</div>
+<div class="line"><a id="l02092" name="l02092"></a><span class="lineno"> 2092</span>      out[out_index / writes_per_reduce] = output;</div>
+<div class="line"><a id="l02093" name="l02093"></a><span class="lineno"> 2093</span>    }</div>
+<div class="line"><a id="l02094" name="l02094"></a><span class="lineno"> 2094</span>  }</div>
+<div class="line"><a id="l02095" name="l02095"></a><span class="lineno"> 2095</span>}</div>
+</div>
+<div class="line"><a id="l02096" name="l02096"></a><span class="lineno"> 2096</span> </div>
+<div class="line"><a id="l02097" name="l02097"></a><span class="lineno"> 2097</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, const <span class="keywordtype">int</span> group_size, const <span class="keywordtype">int</span> bits&gt;</div>
+<div class="foldopen" id="foldopen02098" data-start="{" data-end="}">
+<div class="line"><a id="l02098" name="l02098"></a><span class="lineno"><a class="line" href="quantized_8h.html#a6076203615038eb06816158f7b3869c6"> 2098</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="quantized_8h.html#a6076203615038eb06816158f7b3869c6">affine_dequantize</a>(</div>
+<div class="line"><a id="l02099" name="l02099"></a><span class="lineno"> 2099</span>    <span class="keyword">const</span> device uint8_t* w [[buffer(0)]],</div>
+<div class="line"><a id="l02100" name="l02100"></a><span class="lineno"> 2100</span>    <span class="keyword">const</span> device T* scales [[buffer(1)]],</div>
+<div class="line"><a id="l02101" name="l02101"></a><span class="lineno"> 2101</span>    <span class="keyword">const</span> device T* biases [[buffer(2)]],</div>
+<div class="line"><a id="l02102" name="l02102"></a><span class="lineno"> 2102</span>    device T* out [[buffer(3)]],</div>
+<div class="line"><a id="l02103" name="l02103"></a><span class="lineno"> 2103</span>    uint2 index [[thread_position_in_grid]],</div>
+<div class="line"><a id="l02104" name="l02104"></a><span class="lineno"> 2104</span>    uint2 grid_dim [[threads_per_grid]]) {</div>
+<div class="line"><a id="l02105" name="l02105"></a><span class="lineno"> 2105</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> packs_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;</div>
+<div class="line"><a id="l02106" name="l02106"></a><span class="lineno"> 2106</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> power_of_2_bits = (bits &amp; (bits - 1)) == 0;</div>
+<div class="line"><a id="l02107" name="l02107"></a><span class="lineno"> 2107</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> bytes_per_pack = power_of_2_bits ? 1 : 3;</div>
+<div class="line"><a id="l02108" name="l02108"></a><span class="lineno"> 2108</span> </div>
+<div class="line"><a id="l02109" name="l02109"></a><span class="lineno"> 2109</span>  <span class="keywordtype">size_t</span> offset = index.x + grid_dim.x * size_t(index.y);</div>
+<div class="line"><a id="l02110" name="l02110"></a><span class="lineno"> 2110</span>  <span class="keywordtype">size_t</span> oindex = offset * packs_per_int;</div>
+<div class="line"><a id="l02111" name="l02111"></a><span class="lineno"> 2111</span>  <span class="keywordtype">size_t</span> gindex = oindex / group_size;</div>
+<div class="line"><a id="l02112" name="l02112"></a><span class="lineno"> 2112</span>  T scale = scales[gindex];</div>
+<div class="line"><a id="l02113" name="l02113"></a><span class="lineno"> 2113</span>  T bias = biases[gindex];</div>
+<div class="line"><a id="l02114" name="l02114"></a><span class="lineno"> 2114</span> </div>
+<div class="line"><a id="l02115" name="l02115"></a><span class="lineno"> 2115</span>  out += oindex;</div>
+<div class="line"><a id="l02116" name="l02116"></a><span class="lineno"> 2116</span> </div>
+<div class="line"><a id="l02117" name="l02117"></a><span class="lineno"> 2117</span>  <span class="keywordflow">if</span> (bits == 3) {</div>
+<div class="line"><a id="l02118" name="l02118"></a><span class="lineno"> 2118</span>    w += offset * bytes_per_pack;</div>
+<div class="line"><a id="l02119" name="l02119"></a><span class="lineno"> 2119</span>    out[0] = (w[0] &amp; 0x7) * scale + bias;</div>
+<div class="line"><a id="l02120" name="l02120"></a><span class="lineno"> 2120</span>    out[1] = ((w[0] &amp; 0x38) &gt;&gt; 3) * scale + bias;</div>
+<div class="line"><a id="l02121" name="l02121"></a><span class="lineno"> 2121</span>    out[2] = (((w[0] &amp; 0xc0) &gt;&gt; 6) + ((w[1] &amp; 0x1) &lt;&lt; 2)) * scale + bias;</div>
+<div class="line"><a id="l02122" name="l02122"></a><span class="lineno"> 2122</span>    out[3] = ((w[1] &amp; 0xe) &gt;&gt; 1) * scale + bias;</div>
+<div class="line"><a id="l02123" name="l02123"></a><span class="lineno"> 2123</span>    out[4] = ((w[1] &amp; 0x70) &gt;&gt; 4) * scale + bias;</div>
+<div class="line"><a id="l02124" name="l02124"></a><span class="lineno"> 2124</span>    out[5] = (((w[1] &amp; 0x80) &gt;&gt; 7) + ((w[2] &amp; 0x3) &lt;&lt; 1)) * scale + bias;</div>
+<div class="line"><a id="l02125" name="l02125"></a><span class="lineno"> 2125</span>    out[6] = ((w[2] &amp; 0x1c) &gt;&gt; 2) * scale + bias;</div>
+<div class="line"><a id="l02126" name="l02126"></a><span class="lineno"> 2126</span>    out[7] = ((w[2] &amp; 0xe0) &gt;&gt; 5) * scale + bias;</div>
+<div class="line"><a id="l02127" name="l02127"></a><span class="lineno"> 2127</span> </div>
+<div class="line"><a id="l02128" name="l02128"></a><span class="lineno"> 2128</span>  } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 6) {</div>
+<div class="line"><a id="l02129" name="l02129"></a><span class="lineno"> 2129</span>    w += offset * bytes_per_pack;</div>
+<div class="line"><a id="l02130" name="l02130"></a><span class="lineno"> 2130</span>    out[0] = (w[0] &amp; 0x3f) * scale + bias;</div>
+<div class="line"><a id="l02131" name="l02131"></a><span class="lineno"> 2131</span>    out[1] = (((w[0] &gt;&gt; 6) &amp; 0x03) + ((w[1] &amp; 0x0f) &lt;&lt; 2)) * scale + bias;</div>
+<div class="line"><a id="l02132" name="l02132"></a><span class="lineno"> 2132</span>    out[2] = (((w[1] &gt;&gt; 4) &amp; 0x0f) + ((w[2] &amp; 0x03) &lt;&lt; 4)) * scale + bias;</div>
+<div class="line"><a id="l02133" name="l02133"></a><span class="lineno"> 2133</span>    out[3] = ((w[2] &gt;&gt; 2) &amp; 0x3f) * scale + bias;</div>
+<div class="line"><a id="l02134" name="l02134"></a><span class="lineno"> 2134</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l02135" name="l02135"></a><span class="lineno"> 2135</span>    uint val = w[offset];</div>
+<div class="line"><a id="l02136" name="l02136"></a><span class="lineno"> 2136</span><span class="preprocessor">#pragma clang loop unroll(full)</span></div>
+<div class="line"><a id="l02137" name="l02137"></a><span class="lineno"> 2137</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; packs_per_int; i++) {</div>
+<div class="line"><a id="l02138" name="l02138"></a><span class="lineno"> 2138</span>      uint8_t d;</div>
+<div class="line"><a id="l02139" name="l02139"></a><span class="lineno"> 2139</span>      <span class="keywordflow">if</span> (bits == 2) {</div>
+<div class="line"><a id="l02140" name="l02140"></a><span class="lineno"> 2140</span>        d = (val &gt;&gt; (bits * i)) &amp; 0x03;</div>
+<div class="line"><a id="l02141" name="l02141"></a><span class="lineno"> 2141</span>      } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 4) {</div>
+<div class="line"><a id="l02142" name="l02142"></a><span class="lineno"> 2142</span>        d = (val &gt;&gt; (bits * i)) &amp; 0x0f;</div>
+<div class="line"><a id="l02143" name="l02143"></a><span class="lineno"> 2143</span>      } <span class="keywordflow">else</span> <span class="keywordflow">if</span> (bits == 8) {</div>
+<div class="line"><a id="l02144" name="l02144"></a><span class="lineno"> 2144</span>        d = val;</div>
+<div class="line"><a id="l02145" name="l02145"></a><span class="lineno"> 2145</span>      }</div>
+<div class="line"><a id="l02146" name="l02146"></a><span class="lineno"> 2146</span>      out[i] = scale * d + bias;</div>
+<div class="line"><a id="l02147" name="l02147"></a><span class="lineno"> 2147</span>    }</div>
+<div class="line"><a id="l02148" name="l02148"></a><span class="lineno"> 2148</span>  }</div>
+<div class="line"><a id="l02149" name="l02149"></a><span class="lineno"> 2149</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3"><div class="ttname"><a href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a></div><div class="ttdeci">static constant constexpr const uint8_t simd_size</div><div class="ttdef"><b>Definition</b> ops.h:22</div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html_aaf4974425147d6f26d031691e321637f"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:7</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a048cad0aca52cb737ebf103e76bd1c49"><div class="ttname"><a href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">metal::simd_max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_max(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:392</div></div>
-<div class="ttc" id="anamespacemetal_html_a46c667e169ff9d51a9204a045305442f"><div class="ttname"><a href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">metal::round</a></div><div class="ttdeci">METAL_FUNC bfloat16_t round(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a85181e37a00cb4a4217f1bb25389bce5"><div class="ttname"><a href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">metal::simd_sum</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_sum(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:392</div></div>
-<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a87c5122c60f9a12afceb9925a5b78ffb"><div class="ttname"><a href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a></div><div class="ttdeci">METAL_FUNC bfloat16_t abs(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_ae9e2a23e00724ba2d7868bc4112b386b"><div class="ttname"><a href="namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b">metal::simd_min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_min(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:392</div></div>
-<div class="ttc" id="anamespacemetal_html_af6e2dd7ae087aba6abac4f0350b7611c"><div class="ttname"><a href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:391</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a048cad0aca52cb737ebf103e76bd1c49"><div class="ttname"><a href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">metal::simd_max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_max(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:378</div></div>
+<div class="ttc" id="anamespacemetal_html_a46c667e169ff9d51a9204a045305442f"><div class="ttname"><a href="namespacemetal.html#a46c667e169ff9d51a9204a045305442f">metal::round</a></div><div class="ttdeci">METAL_FUNC bfloat16_t round(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a85181e37a00cb4a4217f1bb25389bce5"><div class="ttname"><a href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">metal::simd_sum</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_sum(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:378</div></div>
+<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a87c5122c60f9a12afceb9925a5b78ffb"><div class="ttname"><a href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a></div><div class="ttdeci">METAL_FUNC bfloat16_t abs(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_ae9e2a23e00724ba2d7868bc4112b386b"><div class="ttname"><a href="namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b">metal::simd_min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_min(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:378</div></div>
+<div class="ttc" id="anamespacemetal_html_af6e2dd7ae087aba6abac4f0350b7611c"><div class="ttname"><a href="namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c">metal::simd_shuffle_down</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)</div><div class="ttdef"><b>Definition</b> bf16_math.h:377</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_1_1random_html_abb895baa477f5a06b5f88e69245f1825"><div class="ttname"><a href="namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825">mlx::core::random::bits</a></div><div class="ttdeci">array bits(const std::vector&lt; int &gt; &amp;shape, int width, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})</div><div class="ttdoc">Generate an array with type uint32 filled with random bits.</div></div>
 <div class="ttc" id="aquantized_8h_html_a0386011c52d03e60885a31e6fbd903dd"><div class="ttname"><a href="quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a></div><div class="ttdeci">#define MLX_MTL_CONST</div><div class="ttdef"><b>Definition</b> quantized.h:8</div></div>
-<div class="ttc" id="aquantized_8h_html_a07b26d2d0b0d65dfe925c452c453fa42"><div class="ttname"><a href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">qdot_safe</a></div><div class="ttdeci">U qdot_safe(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum, int N)</div><div class="ttdef"><b>Definition</b> quantized.h:142</div></div>
-<div class="ttc" id="aquantized_8h_html_a0ba59096494f1001c195312571523ae9"><div class="ttname"><a href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl</a></div><div class="ttdeci">METAL_FUNC void qmm_n_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:879</div></div>
-<div class="ttc" id="aquantized_8h_html_a1546533c5b925b2fbb3bec870ec7487a"><div class="ttname"><a href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl</a></div><div class="ttdeci">METAL_FUNC void qvm_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const int in_vec_size, const int out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:647</div></div>
-<div class="ttc" id="aquantized_8h_html_a1a66b061c46383952a0f067c3848971f"><div class="ttname"><a href="quantized_8h.html#a1a66b061c46383952a0f067c3848971f">bs_qmm_n</a></div><div class="ttdeci">void bs_qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1733</div></div>
-<div class="ttc" id="aquantized_8h_html_a2ce135e392dbf9a3e5180fb083792ed7"><div class="ttname"><a href="quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7">qmm_n</a></div><div class="ttdeci">void qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1423</div></div>
-<div class="ttc" id="aquantized_8h_html_a47bcf4a14566e01e14bd3c155811db59"><div class="ttname"><a href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59">affine_quantize</a></div><div class="ttdeci">void affine_quantize(const device T *w, device uint8_t *out, device T *scales, device T *biases, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> quantized.h:1795</div></div>
-<div class="ttc" id="aquantized_8h_html_a530b720e123e59d73ea89a0a2d0946b7"><div class="ttname"><a href="quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7">bs_qmv_fast</a></div><div class="ttdeci">void bs_qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1476</div></div>
-<div class="ttc" id="aquantized_8h_html_a6076203615038eb06816158f7b3869c6"><div class="ttname"><a href="quantized_8h.html#a6076203615038eb06816158f7b3869c6">affine_dequantize</a></div><div class="ttdeci">void affine_dequantize(const device uint8_t *w, const device T *scales, const device T *biases, device T *out, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> quantized.h:1911</div></div>
+<div class="ttc" id="aquantized_8h_html_a07b26d2d0b0d65dfe925c452c453fa42"><div class="ttname"><a href="quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42">qdot_safe</a></div><div class="ttdeci">U qdot_safe(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum, int N)</div><div class="ttdef"><b>Definition</b> quantized.h:225</div></div>
+<div class="ttc" id="aquantized_8h_html_a0ba59096494f1001c195312571523ae9"><div class="ttname"><a href="quantized_8h.html#a0ba59096494f1001c195312571523ae9">qmm_n_impl</a></div><div class="ttdeci">METAL_FUNC void qmm_n_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1081</div></div>
+<div class="ttc" id="aquantized_8h_html_a1546533c5b925b2fbb3bec870ec7487a"><div class="ttname"><a href="quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a">qvm_impl</a></div><div class="ttdeci">METAL_FUNC void qvm_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const int in_vec_size, const int out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:843</div></div>
+<div class="ttc" id="aquantized_8h_html_a1a66b061c46383952a0f067c3848971f"><div class="ttname"><a href="quantized_8h.html#a1a66b061c46383952a0f067c3848971f">bs_qmm_n</a></div><div class="ttdeci">void bs_qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1939</div></div>
+<div class="ttc" id="aquantized_8h_html_a2ce135e392dbf9a3e5180fb083792ed7"><div class="ttname"><a href="quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7">qmm_n</a></div><div class="ttdeci">void qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1629</div></div>
+<div class="ttc" id="aquantized_8h_html_a47bcf4a14566e01e14bd3c155811db59"><div class="ttname"><a href="quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59">affine_quantize</a></div><div class="ttdeci">void affine_quantize(const device T *w, device uint8_t *out, device T *scales, device T *biases, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> quantized.h:2001</div></div>
+<div class="ttc" id="aquantized_8h_html_a530b720e123e59d73ea89a0a2d0946b7"><div class="ttname"><a href="quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7">bs_qmv_fast</a></div><div class="ttdeci">void bs_qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1682</div></div>
+<div class="ttc" id="aquantized_8h_html_a6076203615038eb06816158f7b3869c6"><div class="ttname"><a href="quantized_8h.html#a6076203615038eb06816158f7b3869c6">affine_dequantize</a></div><div class="ttdeci">void affine_dequantize(const device uint8_t *w, const device T *scales, const device T *biases, device T *out, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> quantized.h:2098</div></div>
 <div class="ttc" id="aquantized_8h_html_a62969a218d93680f5e35d0c61b160b99"><div class="ttname"><a href="quantized_8h.html#a62969a218d93680f5e35d0c61b160b99">SIMD_SIZE</a></div><div class="ttdeci">static constant constexpr const int SIMD_SIZE</div><div class="ttdef"><b>Definition</b> quantized.h:10</div></div>
-<div class="ttc" id="aquantized_8h_html_a639c50a08b5cf57e8be5279a116274bd"><div class="ttname"><a href="quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd">qmv</a></div><div class="ttdeci">void qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1200</div></div>
-<div class="ttc" id="aquantized_8h_html_a6d6e3c31e44f232e58ae9d605e1f4494"><div class="ttname"><a href="quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494">bs_qvm</a></div><div class="ttdeci">void bs_qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1598</div></div>
-<div class="ttc" id="aquantized_8h_html_a7561acefd7b55e7e2b25393be08bb99c"><div class="ttname"><a href="quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c">affine_quantize_scales_biases</a></div><div class="ttdeci">void affine_quantize_scales_biases(const device T *w, const device T *scales, const device T *biases, device uint8_t *out, uint2 index, uint2 grid_dim)</div><div class="ttdef"><b>Definition</b> quantized.h:1879</div></div>
-<div class="ttc" id="aquantized_8h_html_a7bd1d9f17c86c8fd34ec13678cff755f"><div class="ttname"><a href="quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f">qmv_fast</a></div><div class="ttdeci">void qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1149</div></div>
-<div class="ttc" id="aquantized_8h_html_a7ce5f53a4d6d1555e9402d545408d0ad"><div class="ttname"><a href="quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad">qmv_quad</a></div><div class="ttdeci">void qmv_quad(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint quad_gid, uint quad_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1098</div></div>
+<div class="ttc" id="aquantized_8h_html_a639c50a08b5cf57e8be5279a116274bd"><div class="ttname"><a href="quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd">qmv</a></div><div class="ttdeci">void qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1406</div></div>
+<div class="ttc" id="aquantized_8h_html_a6d6e3c31e44f232e58ae9d605e1f4494"><div class="ttname"><a href="quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494">bs_qvm</a></div><div class="ttdeci">void bs_qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1804</div></div>
+<div class="ttc" id="aquantized_8h_html_a7bd1d9f17c86c8fd34ec13678cff755f"><div class="ttname"><a href="quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f">qmv_fast</a></div><div class="ttdeci">void qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1355</div></div>
+<div class="ttc" id="aquantized_8h_html_a7ce5f53a4d6d1555e9402d545408d0ad"><div class="ttname"><a href="quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad">qmv_quad</a></div><div class="ttdeci">void qmv_quad(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint quad_gid, uint quad_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1304</div></div>
 <div class="ttc" id="aquantized_8h_html_a803e4d5a1459844ba647aea5b004e133"><div class="ttname"><a href="quantized_8h.html#a803e4d5a1459844ba647aea5b004e133">QUAD_SIZE</a></div><div class="ttdeci">static constant constexpr const int QUAD_SIZE</div><div class="ttdef"><b>Definition</b> quantized.h:11</div></div>
 <div class="ttc" id="aquantized_8h_html_a8dbace41de9e1e21dd59d016db11b3e9"><div class="ttname"><a href="quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9">load_vector</a></div><div class="ttdeci">U load_vector(const device T *x, thread U *x_thread)</div><div class="ttdef"><b>Definition</b> quantized.h:14</div></div>
-<div class="ttc" id="aquantized_8h_html_a8e13c7d895624f738d2a6d9893b687fd"><div class="ttname"><a href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl</a></div><div class="ttdeci">METAL_FUNC void qmv_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:498</div></div>
-<div class="ttc" id="aquantized_8h_html_aa69e143d646fad332c1a53e8c9b337b7"><div class="ttname"><a href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe</a></div><div class="ttdeci">U load_vector_safe(const device T *x, thread U *x_thread, int N)</div><div class="ttdef"><b>Definition</b> quantized.h:52</div></div>
-<div class="ttc" id="aquantized_8h_html_ab1ae143eba2afceb8df63f38b26f9a84"><div class="ttname"><a href="quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84">bs_qmm_t</a></div><div class="ttdeci">void bs_qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1666</div></div>
-<div class="ttc" id="aquantized_8h_html_ab364d58ab652e3ad87a8f80910556071"><div class="ttname"><a href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot</a></div><div class="ttdeci">U qdot(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum)</div><div class="ttdef"><b>Definition</b> quantized.h:99</div></div>
-<div class="ttc" id="aquantized_8h_html_ab8243818512d6078d23e6ffb65fd7bb8"><div class="ttname"><a href="quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8">qvm_split_k</a></div><div class="ttdeci">void qvm_split_k(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;final_block_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1302</div></div>
-<div class="ttc" id="aquantized_8h_html_aba7687e6f8f1d29c0a1b2a3db150bd81"><div class="ttname"><a href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl</a></div><div class="ttdeci">METAL_FUNC void qmv_fast_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:434</div></div>
-<div class="ttc" id="aquantized_8h_html_abe2e3ef0ee4ec2cb61dc5330ad463d10"><div class="ttname"><a href="quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10">qmm_t</a></div><div class="ttdeci">void qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1365</div></div>
-<div class="ttc" id="aquantized_8h_html_accab1f9e17a65242347c051f98e4c0be"><div class="ttname"><a href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets</a></div><div class="ttdeci">METAL_FUNC void adjust_matrix_offsets(const device T *&amp;x, const device uint32_t *&amp;w, const device T *&amp;scales, const device T *&amp;biases, device T *&amp;y, int output_stride, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid)</div><div class="ttdef"><b>Definition</b> quantized.h:1005</div></div>
-<div class="ttc" id="aquantized_8h_html_acf4c7fc77821a83b31aedfb48443d3ed"><div class="ttname"><a href="quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed">bs_qmv</a></div><div class="ttdeci">void bs_qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1537</div></div>
-<div class="ttc" id="aquantized_8h_html_ad5cf1cf63656bc1780685d22169cd4ef"><div class="ttname"><a href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">qmv_quad_impl</a></div><div class="ttdeci">METAL_FUNC void qmv_quad_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, constant int &amp;in_vec_size, const constant int &amp;out_vec_size, uint3 tid, uint quad_gid, uint quad_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:376</div></div>
-<div class="ttc" id="aquantized_8h_html_ad84f7d5ab9e32dbbe3ca759ae5d5d5c5"><div class="ttname"><a href="quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5">qvm</a></div><div class="ttdeci">void qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1251</div></div>
-<div class="ttc" id="aquantized_8h_html_ae756f6817b584c60f5dcdd1d9c6b4f58"><div class="ttname"><a href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter</a></div><div class="ttdeci">void qouter(const thread uint8_t *w, U x, U scale, U bias, thread U *result)</div><div class="ttdef"><b>Definition</b> quantized.h:187</div></div>
-<div class="ttc" id="aquantized_8h_html_aecff265b63566d0d5689cfc4e5b037d2"><div class="ttname"><a href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize</a></div><div class="ttdeci">void dequantize(const device uint8_t *w, U scale, U bias, threadgroup U *w_local)</div><div class="ttdef"><b>Definition</b> quantized.h:219</div></div>
-<div class="ttc" id="aquantized_8h_html_af5750a35e8f5462218effba719f7f5b8"><div class="ttname"><a href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl</a></div><div class="ttdeci">METAL_FUNC void qmm_t_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:758</div></div>
-<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:17</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html"><div class="ttname"><a href="struct_quantized_block_loader.html">QuantizedBlockLoader</a></div><div class="ttdef"><b>Definition</b> quantized.h:262</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a0ace7e3762ecfa5a4106e7dee7e1b6ab"><div class="ttname"><a href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">QuantizedBlockLoader::group_stride</a></div><div class="ttdeci">const int group_stride</div><div class="ttdef"><b>Definition</b> quantized.h:282</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a1392a5278cf6e090ea80ebe7c4ac5fbb"><div class="ttname"><a href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">QuantizedBlockLoader::BCOLS_PACKED</a></div><div class="ttdeci">static constant constexpr const short BCOLS_PACKED</div><div class="ttdef"><b>Definition</b> quantized.h:274</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a17d01a6aba0833b073586ef2c09d0fbd"><div class="ttname"><a href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">QuantizedBlockLoader::biases</a></div><div class="ttdeci">const device T * biases</div><div class="ttdef"><b>Definition</b> quantized.h:291</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a234feacde36a4afc0d740332a3769fb6"><div class="ttname"><a href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">QuantizedBlockLoader::group_step_cnt</a></div><div class="ttdeci">short group_step_cnt</div><div class="ttdef"><b>Definition</b> quantized.h:281</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a31e14175f3d4902d9fe5ab5a219f61ba"><div class="ttname"><a href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">QuantizedBlockLoader::group_steps</a></div><div class="ttdeci">static constant constexpr const short group_steps</div><div class="ttdef"><b>Definition</b> quantized.h:277</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a50821537ea747bc03295a09bb0eef475"><div class="ttname"><a href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">QuantizedBlockLoader::thread_idx</a></div><div class="ttdeci">const short thread_idx</div><div class="ttdef"><b>Definition</b> quantized.h:284</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a6123e4a9209d6eacb58b2c2344ed1ecf"><div class="ttname"><a href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">QuantizedBlockLoader::scales</a></div><div class="ttdeci">const device T * scales</div><div class="ttdef"><b>Definition</b> quantized.h:290</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a6213479f7a6d9314d8879f8856b0b6fb"><div class="ttname"><a href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">QuantizedBlockLoader::n_reads</a></div><div class="ttdeci">static constant constexpr const short n_reads</div><div class="ttdef"><b>Definition</b> quantized.h:275</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a674138ef7c43cc45586ea9f8fd6f6bd9"><div class="ttname"><a href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">QuantizedBlockLoader::next</a></div><div class="ttdeci">void next()</div><div class="ttdef"><b>Definition</b> quantized.h:354</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a699dc9aa284b8fbf870310bbb224465b"><div class="ttname"><a href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">QuantizedBlockLoader::load_safe</a></div><div class="ttdeci">void load_safe(short2 src_tile_dim) const</div><div class="ttdef"><b>Definition</b> quantized.h:327</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a8050977d473d1a24fae5c833e609839e"><div class="ttname"><a href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">QuantizedBlockLoader::src_ld</a></div><div class="ttdeci">const int src_ld</div><div class="ttdef"><b>Definition</b> quantized.h:279</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a85041d72225a2095659c70509291a906"><div class="ttname"><a href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">QuantizedBlockLoader::bi</a></div><div class="ttdeci">const short bi</div><div class="ttdef"><b>Definition</b> quantized.h:285</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a86009527cb4b53e4c21fd6b1f78cfefc"><div class="ttname"><a href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">QuantizedBlockLoader::load_unsafe</a></div><div class="ttdeci">void load_unsafe() const</div><div class="ttdef"><b>Definition</b> quantized.h:314</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a8eae73a0c04bf1e41fb96131f6aa500d"><div class="ttname"><a href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">QuantizedBlockLoader::pack_factor</a></div><div class="ttdeci">static constant constexpr const short pack_factor</div><div class="ttdef"><b>Definition</b> quantized.h:273</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a9857214690fe6abad0e19d1045152f83"><div class="ttname"><a href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">QuantizedBlockLoader::dst</a></div><div class="ttdeci">threadgroup T * dst</div><div class="ttdef"><b>Definition</b> quantized.h:288</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_ac3f651c1a645291d1037a2cc8ded2320"><div class="ttname"><a href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">QuantizedBlockLoader::tile_stride</a></div><div class="ttdeci">const int tile_stride</div><div class="ttdef"><b>Definition</b> quantized.h:280</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_ad85c6b7e07c81307b3b91eb4dd7be30b"><div class="ttname"><a href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">QuantizedBlockLoader::src</a></div><div class="ttdeci">const device uint32_t * src</div><div class="ttdef"><b>Definition</b> quantized.h:289</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_ae2add92b2aaf3414e91f0470b9b0cc00"><div class="ttname"><a href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">QuantizedBlockLoader::bj</a></div><div class="ttdeci">const short bj</div><div class="ttdef"><b>Definition</b> quantized.h:286</div></div>
-<div class="ttc" id="astruct_quantized_block_loader_html_af59b054750a65e7e79c1cd05c4acac93"><div class="ttname"><a href="struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93">QuantizedBlockLoader::QuantizedBlockLoader</a></div><div class="ttdeci">QuantizedBlockLoader(const device uint32_t *src_, const device T *scales_, const device T *biases_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> quantized.h:293</div></div>
+<div class="ttc" id="aquantized_8h_html_a8e13c7d895624f738d2a6d9893b687fd"><div class="ttname"><a href="quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd">qmv_impl</a></div><div class="ttdeci">METAL_FUNC void qmv_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:688</div></div>
+<div class="ttc" id="aquantized_8h_html_aa69e143d646fad332c1a53e8c9b337b7"><div class="ttname"><a href="quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7">load_vector_safe</a></div><div class="ttdeci">U load_vector_safe(const device T *x, thread U *x_thread, int N)</div><div class="ttdef"><b>Definition</b> quantized.h:77</div></div>
+<div class="ttc" id="aquantized_8h_html_ab1ae143eba2afceb8df63f38b26f9a84"><div class="ttname"><a href="quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84">bs_qmm_t</a></div><div class="ttdeci">void bs_qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1872</div></div>
+<div class="ttc" id="aquantized_8h_html_ab364d58ab652e3ad87a8f80910556071"><div class="ttname"><a href="quantized_8h.html#ab364d58ab652e3ad87a8f80910556071">qdot</a></div><div class="ttdeci">U qdot(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum)</div><div class="ttdef"><b>Definition</b> quantized.h:145</div></div>
+<div class="ttc" id="aquantized_8h_html_ab8243818512d6078d23e6ffb65fd7bb8"><div class="ttname"><a href="quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8">qvm_split_k</a></div><div class="ttdeci">void qvm_split_k(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;final_block_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1508</div></div>
+<div class="ttc" id="aquantized_8h_html_aba7687e6f8f1d29c0a1b2a3db150bd81"><div class="ttname"><a href="quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81">qmv_fast_impl</a></div><div class="ttdeci">METAL_FUNC void qmv_fast_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:620</div></div>
+<div class="ttc" id="aquantized_8h_html_abe2e3ef0ee4ec2cb61dc5330ad463d10"><div class="ttname"><a href="quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10">qmm_t</a></div><div class="ttdeci">void qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1571</div></div>
+<div class="ttc" id="aquantized_8h_html_accab1f9e17a65242347c051f98e4c0be"><div class="ttname"><a href="quantized_8h.html#accab1f9e17a65242347c051f98e4c0be">adjust_matrix_offsets</a></div><div class="ttdeci">METAL_FUNC void adjust_matrix_offsets(const device T *&amp;x, const device uint32_t *&amp;w, const device T *&amp;scales, const device T *&amp;biases, device T *&amp;y, int output_stride, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid)</div><div class="ttdef"><b>Definition</b> quantized.h:1211</div></div>
+<div class="ttc" id="aquantized_8h_html_acf4c7fc77821a83b31aedfb48443d3ed"><div class="ttname"><a href="quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed">bs_qmv</a></div><div class="ttdeci">void bs_qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &amp;batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1743</div></div>
+<div class="ttc" id="aquantized_8h_html_ad5cf1cf63656bc1780685d22169cd4ef"><div class="ttname"><a href="quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef">qmv_quad_impl</a></div><div class="ttdeci">METAL_FUNC void qmv_quad_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, constant int &amp;in_vec_size, const constant int &amp;out_vec_size, uint3 tid, uint quad_gid, uint quad_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:563</div></div>
+<div class="ttc" id="aquantized_8h_html_ad84f7d5ab9e32dbbe3ca759ae5d5d5c5"><div class="ttname"><a href="quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5">qvm</a></div><div class="ttdeci">void qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:1457</div></div>
+<div class="ttc" id="aquantized_8h_html_ae756f6817b584c60f5dcdd1d9c6b4f58"><div class="ttname"><a href="quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58">qouter</a></div><div class="ttdeci">void qouter(const thread uint8_t *w, U x, U scale, U bias, thread U *result)</div><div class="ttdef"><b>Definition</b> quantized.h:307</div></div>
+<div class="ttc" id="aquantized_8h_html_aecff265b63566d0d5689cfc4e5b037d2"><div class="ttname"><a href="quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2">dequantize</a></div><div class="ttdeci">void dequantize(const device uint8_t *w, U scale, U bias, threadgroup U *w_local)</div><div class="ttdef"><b>Definition</b> quantized.h:372</div></div>
+<div class="ttc" id="aquantized_8h_html_af5750a35e8f5462218effba719f7f5b8"><div class="ttname"><a href="quantized_8h.html#af5750a35e8f5462218effba719f7f5b8">qmm_t_impl</a></div><div class="ttdeci">METAL_FUNC void qmm_t_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int &amp;K, const constant int &amp;N, const constant int &amp;M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> quantized.h:956</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html"><div class="ttname"><a href="struct_quantized_block_loader.html">QuantizedBlockLoader</a></div><div class="ttdef"><b>Definition</b> quantized.h:443</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a0ace7e3762ecfa5a4106e7dee7e1b6ab"><div class="ttname"><a href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">QuantizedBlockLoader::group_stride</a></div><div class="ttdeci">const int group_stride</div><div class="ttdef"><b>Definition</b> quantized.h:464</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a1392a5278cf6e090ea80ebe7c4ac5fbb"><div class="ttname"><a href="struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb">QuantizedBlockLoader::BCOLS_PACKED</a></div><div class="ttdeci">static constant constexpr const short BCOLS_PACKED</div><div class="ttdef"><b>Definition</b> quantized.h:456</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a17d01a6aba0833b073586ef2c09d0fbd"><div class="ttname"><a href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">QuantizedBlockLoader::biases</a></div><div class="ttdeci">const device T * biases</div><div class="ttdef"><b>Definition</b> quantized.h:473</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a234feacde36a4afc0d740332a3769fb6"><div class="ttname"><a href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">QuantizedBlockLoader::group_step_cnt</a></div><div class="ttdeci">short group_step_cnt</div><div class="ttdef"><b>Definition</b> quantized.h:463</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a31e14175f3d4902d9fe5ab5a219f61ba"><div class="ttname"><a href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">QuantizedBlockLoader::group_steps</a></div><div class="ttdeci">static constant constexpr const short group_steps</div><div class="ttdef"><b>Definition</b> quantized.h:459</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a50821537ea747bc03295a09bb0eef475"><div class="ttname"><a href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">QuantizedBlockLoader::thread_idx</a></div><div class="ttdeci">const short thread_idx</div><div class="ttdef"><b>Definition</b> quantized.h:466</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a60713ce7498aa683cbb2a0f19ab16589"><div class="ttname"><a href="struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589">QuantizedBlockLoader::QuantizedBlockLoader</a></div><div class="ttdeci">QuantizedBlockLoader(const device uint8_t *src_, const device T *scales_, const device T *biases_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</div><div class="ttdef"><b>Definition</b> quantized.h:475</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a6123e4a9209d6eacb58b2c2344ed1ecf"><div class="ttname"><a href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">QuantizedBlockLoader::scales</a></div><div class="ttdeci">const device T * scales</div><div class="ttdef"><b>Definition</b> quantized.h:472</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a6213479f7a6d9314d8879f8856b0b6fb"><div class="ttname"><a href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">QuantizedBlockLoader::n_reads</a></div><div class="ttdeci">static constant constexpr const short n_reads</div><div class="ttdef"><b>Definition</b> quantized.h:457</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a674138ef7c43cc45586ea9f8fd6f6bd9"><div class="ttname"><a href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">QuantizedBlockLoader::next</a></div><div class="ttdeci">void next()</div><div class="ttdef"><b>Definition</b> quantized.h:541</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a699dc9aa284b8fbf870310bbb224465b"><div class="ttname"><a href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">QuantizedBlockLoader::load_safe</a></div><div class="ttdeci">void load_safe(short2 src_tile_dim) const</div><div class="ttdef"><b>Definition</b> quantized.h:511</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a8050977d473d1a24fae5c833e609839e"><div class="ttname"><a href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">QuantizedBlockLoader::src_ld</a></div><div class="ttdeci">const int src_ld</div><div class="ttdef"><b>Definition</b> quantized.h:461</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a85041d72225a2095659c70509291a906"><div class="ttname"><a href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">QuantizedBlockLoader::bi</a></div><div class="ttdeci">const short bi</div><div class="ttdef"><b>Definition</b> quantized.h:467</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a86009527cb4b53e4c21fd6b1f78cfefc"><div class="ttname"><a href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">QuantizedBlockLoader::load_unsafe</a></div><div class="ttdeci">void load_unsafe() const</div><div class="ttdef"><b>Definition</b> quantized.h:498</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a8eae73a0c04bf1e41fb96131f6aa500d"><div class="ttname"><a href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">QuantizedBlockLoader::pack_factor</a></div><div class="ttdeci">static constant constexpr const short pack_factor</div><div class="ttdef"><b>Definition</b> quantized.h:454</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a9857214690fe6abad0e19d1045152f83"><div class="ttname"><a href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">QuantizedBlockLoader::dst</a></div><div class="ttdeci">threadgroup T * dst</div><div class="ttdef"><b>Definition</b> quantized.h:470</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_abbf8249ca99e3e87b296ddd60a984b76"><div class="ttname"><a href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">QuantizedBlockLoader::src</a></div><div class="ttdeci">const device uint8_t * src</div><div class="ttdef"><b>Definition</b> quantized.h:471</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_ac3f651c1a645291d1037a2cc8ded2320"><div class="ttname"><a href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">QuantizedBlockLoader::tile_stride</a></div><div class="ttdeci">const int tile_stride</div><div class="ttdef"><b>Definition</b> quantized.h:462</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_ad00fe6d8bd395206a41693a8ed65d4db"><div class="ttname"><a href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">QuantizedBlockLoader::bytes_per_pack</a></div><div class="ttdeci">static constant constexpr const short bytes_per_pack</div><div class="ttdef"><b>Definition</b> quantized.h:455</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_ae2add92b2aaf3414e91f0470b9b0cc00"><div class="ttname"><a href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">QuantizedBlockLoader::bj</a></div><div class="ttdeci">const short bj</div><div class="ttdef"><b>Definition</b> quantized.h:468</div></div>
 <div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a></div><div class="ttdef"><b>Definition</b> loader.h:25</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/radix_8h_source.html b/docs/build/html/radix_8h_source.html
index 22d91ce00..8ec8e64be 100644
--- a/docs/build/html/radix_8h_source.html
+++ b/docs/build/html/radix_8h_source.html
@@ -447,8 +447,8 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span>  y[2] = x[12] * inv + x[0];</div>
 <div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>}</div>
 </div>
-<div class="ttc" id="anamespacemetal_1_1fast_html_a3af771cfe7a135104f9d063147dba270"><div class="ttname"><a href="namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270">metal::fast::sin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:242</div></div>
-<div class="ttc" id="anamespacemetal_1_1fast_html_a75b6bb32fa3870eda46a7bfc9f481f88"><div class="ttname"><a href="namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88">metal::fast::cos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:242</div></div>
+<div class="ttc" id="anamespacemetal_1_1fast_html_a3af771cfe7a135104f9d063147dba270"><div class="ttname"><a href="namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270">metal::fast::sin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:240</div></div>
+<div class="ttc" id="anamespacemetal_1_1fast_html_a75b6bb32fa3870eda46a7bfc9f481f88"><div class="ttname"><a href="namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88">metal::fast::cos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:240</div></div>
 <div class="ttc" id="aradix_8h_html_a026e6779e6d2ecdef39ff4aad186091e"><div class="ttname"><a href="radix_8h.html#a026e6779e6d2ecdef39ff4aad186091e">radix5</a></div><div class="ttdeci">METAL_FUNC void radix5(thread float2 *x, thread float2 *y)</div><div class="ttdef"><b>Definition</b> radix.h:69</div></div>
 <div class="ttc" id="aradix_8h_html_a0e2dfd3d1dda09f47ccc64eec35629f3"><div class="ttname"><a href="radix_8h.html#a0e2dfd3d1dda09f47ccc64eec35629f3">complex_mul_conj</a></div><div class="ttdeci">METAL_FUNC float2 complex_mul_conj(float2 a, float2 b)</div><div class="ttdef"><b>Definition</b> radix.h:24</div></div>
 <div class="ttc" id="aradix_8h_html_a12cb26bd3ad635d16a195ccea750256d"><div class="ttname"><a href="radix_8h.html#a12cb26bd3ad635d16a195ccea750256d">radix4</a></div><div class="ttdeci">METAL_FUNC void radix4(thread float2 *x, thread float2 *y)</div><div class="ttdef"><b>Definition</b> radix.h:56</div></div>
diff --git a/docs/build/html/readwrite_8h_source.html b/docs/build/html/readwrite_8h_source.html
index 9b6e62dd4..39ba91cab 100644
--- a/docs/build/html/readwrite_8h_source.html
+++ b/docs/build/html/readwrite_8h_source.html
@@ -778,8 +778,8 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="abackend_2metal_2allocator_8h_html_a15aa5cc1baf29be08d55cca88509e697"><div class="ttname"><a href="backend_2metal_2allocator_8h.html#a15aa5cc1baf29be08d55cca88509e697">buf</a></div><div class="ttdeci">MTL::Buffer * buf</div><div class="ttdef"><b>Definition</b> allocator.h:39</div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2fft_8h_html_a7a83318497519ff3ff0141b7d511ed38"><div class="ttname"><a href="backend_2metal_2kernels_2fft_8h.html#a7a83318497519ff3ff0141b7d511ed38">inv_</a></div><div class="ttdeci">static constant constexpr const bool inv_</div><div class="ttdef"><b>Definition</b> fft.h:23</div></div>
 <div class="ttc" id="abackend_2metal_2kernels_2fft_8h_html_ad395c11e6f2aee72cd1928fba93a35a3"><div class="ttname"><a href="backend_2metal_2kernels_2fft_8h.html#ad395c11e6f2aee72cd1928fba93a35a3">elems_per_thread_</a></div><div class="ttdeci">static constant constexpr const int elems_per_thread_</div><div class="ttdef"><b>Definition</b> fft.h:25</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
 <div class="ttc" id="aradix_8h_html"><div class="ttname"><a href="radix_8h.html">radix.h</a></div></div>
 <div class="ttc" id="aradix_8h_html_a5bfc53b531214c9ce277bebc18aa67d6"><div class="ttname"><a href="radix_8h.html#a5bfc53b531214c9ce277bebc18aa67d6">complex_mul</a></div><div class="ttdeci">METAL_FUNC float2 complex_mul(float2 a, float2 b)</div><div class="ttdef"><b>Definition</b> radix.h:19</div></div>
 <div class="ttc" id="aradix_8h_html_ac5cf950316b9445296ee9ecfc56a56bd"><div class="ttname"><a href="radix_8h.html#ac5cf950316b9445296ee9ecfc56a56bd">get_twiddle</a></div><div class="ttdeci">METAL_FUNC float2 get_twiddle(int k, int p)</div><div class="ttdef"><b>Definition</b> radix.h:29</div></div>
diff --git a/docs/build/html/reduce__all_8h.html b/docs/build/html/reduce__all_8h.html
index 1b115a393..ed33e10fc 100644
--- a/docs/build/html/reduce__all_8h.html
+++ b/docs/build/html/reduce__all_8h.html
@@ -98,18 +98,18 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a99ef48ae72b3e715c5f4d7ea07cd213d" id="r_a99ef48ae72b3e715c5f4d7ea07cd213d"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N_READS = REDUCE_N_READS&gt; </td></tr>
-<tr class="memitem:a99ef48ae72b3e715c5f4d7ea07cd213d"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a99ef48ae72b3e715c5f4d7ea07cd213d">all_reduce</a> (const device T *in, device U *out, const constant size_t &amp;in_size, const constant size_t &amp;row_size, uint3 gid, uint3 lid, uint3 lsize, uint simd_per_group, uint simd_lane_id, uint simd_group_id)</td></tr>
-<tr class="separator:a99ef48ae72b3e715c5f4d7ea07cd213d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9086a585eda5a887160ee24baae0a7b8" id="r_a9086a585eda5a887160ee24baae0a7b8"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT  = int64_t, int N_READS = REDUCE_N_READS&gt; </td></tr>
+<tr class="memitem:a9086a585eda5a887160ee24baae0a7b8"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a9086a585eda5a887160ee24baae0a7b8">all_reduce</a> (const device T *in, device U *out, const constant size_t &amp;in_size, const constant size_t &amp;row_size, uint3 gid, uint3 lid, uint3 lsize, uint simd_per_group, uint simd_lane_id, uint simd_group_id)</td></tr>
+<tr class="separator:a9086a585eda5a887160ee24baae0a7b8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="a99ef48ae72b3e715c5f4d7ea07cd213d" name="a99ef48ae72b3e715c5f4d7ea07cd213d"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a99ef48ae72b3e715c5f4d7ea07cd213d">&#9670;&#160;</a></span>all_reduce()</h2>
+<a id="a9086a585eda5a887160ee24baae0a7b8" name="a9086a585eda5a887160ee24baae0a7b8"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a9086a585eda5a887160ee24baae0a7b8">&#9670;&#160;</a></span>all_reduce()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int N_READS = REDUCE_N_READS&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT  = int64_t, int N_READS = REDUCE_N_READS&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void all_reduce </td>
diff --git a/docs/build/html/reduce__all_8h_source.html b/docs/build/html/reduce__all_8h_source.html
index 013cf7141..34ab309f3 100644
--- a/docs/build/html/reduce__all_8h_source.html
+++ b/docs/build/html/reduce__all_8h_source.html
@@ -93,70 +93,76 @@ $(function(){ initResizable(false); });
 <div class="contents">
 <a href="reduce__all_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2023-2024 Apple Inc.</span></div>
 <div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
-<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> N_READS = REDUCE_N_READS&gt;</div>
-<div class="foldopen" id="foldopen00004" data-start="{" data-end="}">
-<div class="line"><a id="l00004" name="l00004"></a><span class="lineno"><a class="line" href="reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d">    4</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d">all_reduce</a>(</div>
-<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
-<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span>    device U* out [[buffer(1)]],</div>
-<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; in_size [[buffer(2)]],</div>
-<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; row_size [[buffer(3)]],</div>
-<div class="line"><a id="l00009" name="l00009"></a><span class="lineno">    9</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span>    uint3 lid [[thread_position_in_threadgroup]],</div>
-<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span>    uint3 lsize [[threads_per_threadgroup]],</div>
-<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span>    uint simd_per_group [[simdgroups_per_threadgroup]],</div>
-<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
-<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
-<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>  threadgroup U shared_vals[<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>];</div>
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span> </div>
-<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>  U total = Op::init;</div>
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>  int64_t start_idx = gid.y * row_size;</div>
-<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>  int64_t actual_row =</div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>      (start_idx + row_size &lt;= in_size) ? row_size : in_size - start_idx;</div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>  int64_t blocks = actual_row / (lsize.x * N_READS);</div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>  <span class="keywordtype">int</span> extra = actual_row - blocks * (lsize.x * N_READS);</div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>  extra -= lid.x * N_READS;</div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>  start_idx += lid.x * N_READS;</div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>  in += start_idx;</div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span> </div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>  <span class="keywordflow">if</span> (extra &gt;= N_READS) {</div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>    blocks++;</div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>    extra = 0;</div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  }</div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span>    <span class="keyword">typename</span> IdxT = int64_t,</div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span>    <span class="keywordtype">int</span> N_READS = <a class="code hl_variable" href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a>&gt;</div>
+<div class="foldopen" id="foldopen00009" data-start="{" data-end="}">
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno"><a class="line" href="reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8">    9</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8">all_reduce</a>(</div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span>    device U* out [[buffer(1)]],</div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; in_size [[buffer(2)]],</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; row_size [[buffer(3)]],</div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>    uint3 lid [[thread_position_in_threadgroup]],</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>    uint3 lsize [[threads_per_threadgroup]],</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    uint simd_per_group [[simdgroups_per_threadgroup]],</div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>  threadgroup U shared_vals[<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>];</div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span> </div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>  U total = Op::init;</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>  IdxT start_idx = gid.y * IdxT(row_size);</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>  IdxT actual_row =</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>      (start_idx + row_size &lt;= in_size) ? row_size : in_size - start_idx;</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>  IdxT blocks = actual_row / (lsize.x * N_READS);</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>  <span class="keywordtype">int</span> extra = actual_row - blocks * (lsize.x * N_READS);</div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  extra -= lid.x * N_READS;</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  start_idx += lid.x * N_READS;</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  in += start_idx;</div>
 <div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span> </div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  <span class="keywordflow">for</span> (int64_t b = 0; b &lt; blocks; b++) {</div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N_READS; i++) {</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>      total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(in[i]), total);</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>    }</div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>    in += lsize.x * N_READS;</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  }</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  <span class="keywordflow">if</span> (extra &gt; 0) {</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; extra; i++) {</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>      total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(in[i]), total);</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>    }</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  <span class="keywordflow">if</span> (extra &gt;= N_READS) {</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    blocks++;</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>    extra = 0;</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  }</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span> </div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <span class="keywordflow">for</span> (IdxT b = 0; b &lt; blocks; b++) {</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N_READS; i++) {</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>      total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(in[i]), total);</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>    }</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>    in += lsize.x * N_READS;</div>
 <div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>  }</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span> </div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>  <span class="comment">// Reduction within simd group</span></div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(total);</div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>  <span class="keywordflow">if</span> (simd_per_group &gt; 1) {</div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>    <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>      shared_vals[simd_group_id] = total;</div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>    }</div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span> </div>
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>    <span class="comment">// Reduction within thread group</span></div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>    total = lid.x &lt; simd_per_group ? shared_vals[lid.x] : <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.init;</div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>    total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(total);</div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  }</div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span> </div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>  <span class="keywordflow">if</span> (lid.x == 0) {</div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>    out[gid.y] = total;</div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>  }</div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>}</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  <span class="keywordflow">if</span> (extra &gt; 0) {</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; extra; i++) {</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>      total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(in[i]), total);</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>    }</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>  }</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span> </div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>  <span class="comment">// Reduction within simd group</span></div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>  total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(total);</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>  <span class="keywordflow">if</span> (simd_per_group &gt; 1) {</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>    <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>      shared_vals[simd_group_id] = total;</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>    }</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span> </div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>    <span class="comment">// Reduction within thread group</span></div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>    total = lid.x &lt; simd_per_group ? shared_vals[lid.x] : <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.init;</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>    total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(total);</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>  }</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span> </div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>  <span class="keywordflow">if</span> (lid.x == 0) {</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    out[gid.y] = total;</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>  }</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3"><div class="ttname"><a href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a></div><div class="ttdeci">static constant constexpr const uint8_t simd_size</div><div class="ttdef"><b>Definition</b> ops.h:22</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
-<div class="ttc" id="areduce__all_8h_html_a99ef48ae72b3e715c5f4d7ea07cd213d"><div class="ttname"><a href="reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d">all_reduce</a></div><div class="ttdeci">void all_reduce(const device T *in, device U *out, const constant size_t &amp;in_size, const constant size_t &amp;row_size, uint3 gid, uint3 lid, uint3 lsize, uint simd_per_group, uint simd_lane_id, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_all.h:4</div></div>
+<div class="ttc" id="adefines_8h_html_a2ad505864a2ab786147766900bc18c21"><div class="ttname"><a href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a></div><div class="ttdeci">static constexpr int REDUCE_N_READS</div><div class="ttdef"><b>Definition</b> defines.h:12</div></div>
+<div class="ttc" id="areduce__all_8h_html_a9086a585eda5a887160ee24baae0a7b8"><div class="ttname"><a href="reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8">all_reduce</a></div><div class="ttdeci">void all_reduce(const device T *in, device U *out, const constant size_t &amp;in_size, const constant size_t &amp;row_size, uint3 gid, uint3 lid, uint3 lsize, uint simd_per_group, uint simd_lane_id, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_all.h:9</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/reduce__col_8h.html b/docs/build/html/reduce__col_8h.html
index b7dda2eda..92d3040fe 100644
--- a/docs/build/html/reduce__col_8h.html
+++ b/docs/build/html/reduce__col_8h.html
@@ -98,28 +98,28 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:a7c378443a2b6f4d9210db8a21a9ac4f5" id="r_a7c378443a2b6f4d9210db8a21a9ac4f5"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int NDIMS&gt; </td></tr>
-<tr class="memitem:a7c378443a2b6f4d9210db8a21a9ac4f5"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a7c378443a2b6f4d9210db8a21a9ac4f5">col_reduce_small</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</td></tr>
-<tr class="separator:a7c378443a2b6f4d9210db8a21a9ac4f5"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5b4f4c4c247ad341ff8d31dcbbbce0eb" id="r_a5b4f4c4c247ad341ff8d31dcbbbce0eb"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int NDIMS&gt; </td></tr>
-<tr class="memitem:a5b4f4c4c247ad341ff8d31dcbbbce0eb"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a5b4f4c4c247ad341ff8d31dcbbbce0eb">col_reduce_longcolumn</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</td></tr>
-<tr class="separator:a5b4f4c4c247ad341ff8d31dcbbbce0eb"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a11bfc6112ae2386ac03f5ea7b7d93385" id="r_a11bfc6112ae2386ac03f5ea7b7d93385"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int NDIMS, int BM, int BN&gt; </td></tr>
-<tr class="memitem:a11bfc6112ae2386ac03f5ea7b7d93385"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a11bfc6112ae2386ac03f5ea7b7d93385">col_reduce_looped</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</td></tr>
-<tr class="memdesc:a11bfc6112ae2386ac03f5ea7b7d93385"><td class="mdescLeft">&#160;</td><td class="mdescRight">Our approach is the following simple looped approach:  <br /></td></tr>
-<tr class="separator:a11bfc6112ae2386ac03f5ea7b7d93385"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0e92fc74eeaa8ee2ceb83bafc6eb1d7d" id="r_a0e92fc74eeaa8ee2ceb83bafc6eb1d7d"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int NDIMS, int BM, int BN&gt; </td></tr>
-<tr class="memitem:a0e92fc74eeaa8ee2ceb83bafc6eb1d7d"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d">col_reduce_2pass</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</td></tr>
-<tr class="separator:a0e92fc74eeaa8ee2ceb83bafc6eb1d7d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a82cd031d8014c02e61dc9a817ea6d4ec" id="r_a82cd031d8014c02e61dc9a817ea6d4ec"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS&gt; </td></tr>
+<tr class="memitem:a82cd031d8014c02e61dc9a817ea6d4ec"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a82cd031d8014c02e61dc9a817ea6d4ec">col_reduce_small</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</td></tr>
+<tr class="separator:a82cd031d8014c02e61dc9a817ea6d4ec"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa3287cd98e97123b67b5d3920d984ca2" id="r_aa3287cd98e97123b67b5d3920d984ca2"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS&gt; </td></tr>
+<tr class="memitem:aa3287cd98e97123b67b5d3920d984ca2"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa3287cd98e97123b67b5d3920d984ca2">col_reduce_longcolumn</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</td></tr>
+<tr class="separator:aa3287cd98e97123b67b5d3920d984ca2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae8f9354e1c595142d05b33fe13988f02" id="r_ae8f9354e1c595142d05b33fe13988f02"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int BM, int BN&gt; </td></tr>
+<tr class="memitem:ae8f9354e1c595142d05b33fe13988f02"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ae8f9354e1c595142d05b33fe13988f02">col_reduce_looped</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</td></tr>
+<tr class="memdesc:ae8f9354e1c595142d05b33fe13988f02"><td class="mdescLeft">&#160;</td><td class="mdescRight">Our approach is the following simple looped approach:  <br /></td></tr>
+<tr class="separator:ae8f9354e1c595142d05b33fe13988f02"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9a7be400d810700b47fc1a998032ce29" id="r_a9a7be400d810700b47fc1a998032ce29"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int BM, int BN&gt; </td></tr>
+<tr class="memitem:a9a7be400d810700b47fc1a998032ce29"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a9a7be400d810700b47fc1a998032ce29">col_reduce_2pass</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</td></tr>
+<tr class="separator:a9a7be400d810700b47fc1a998032ce29"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="a0e92fc74eeaa8ee2ceb83bafc6eb1d7d" name="a0e92fc74eeaa8ee2ceb83bafc6eb1d7d"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d">&#9670;&#160;</a></span>col_reduce_2pass()</h2>
+<a id="a9a7be400d810700b47fc1a998032ce29" name="a9a7be400d810700b47fc1a998032ce29"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a9a7be400d810700b47fc1a998032ce29">&#9670;&#160;</a></span>col_reduce_2pass()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int NDIMS, int BM, int BN&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int BM, int BN&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void col_reduce_2pass </td>
@@ -206,13 +206,13 @@ template&lt;typename T , typename U , typename Op , int NDIMS, int BM, int BN&gt
 
 </div>
 </div>
-<a id="a5b4f4c4c247ad341ff8d31dcbbbce0eb" name="a5b4f4c4c247ad341ff8d31dcbbbce0eb"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a5b4f4c4c247ad341ff8d31dcbbbce0eb">&#9670;&#160;</a></span>col_reduce_longcolumn()</h2>
+<a id="aa3287cd98e97123b67b5d3920d984ca2" name="aa3287cd98e97123b67b5d3920d984ca2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa3287cd98e97123b67b5d3920d984ca2">&#9670;&#160;</a></span>col_reduce_longcolumn()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int NDIMS&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void col_reduce_longcolumn </td>
@@ -299,13 +299,13 @@ template&lt;typename T , typename U , typename Op , int NDIMS&gt; </div>
 
 </div>
 </div>
-<a id="a11bfc6112ae2386ac03f5ea7b7d93385" name="a11bfc6112ae2386ac03f5ea7b7d93385"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a11bfc6112ae2386ac03f5ea7b7d93385">&#9670;&#160;</a></span>col_reduce_looped()</h2>
+<a id="ae8f9354e1c595142d05b33fe13988f02" name="ae8f9354e1c595142d05b33fe13988f02"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae8f9354e1c595142d05b33fe13988f02">&#9670;&#160;</a></span>col_reduce_looped()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int NDIMS, int BM, int BN&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int BM, int BN&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void col_reduce_looped </td>
@@ -396,13 +396,13 @@ template&lt;typename T , typename U , typename Op , int NDIMS, int BM, int BN&gt
 
 </div>
 </div>
-<a id="a7c378443a2b6f4d9210db8a21a9ac4f5" name="a7c378443a2b6f4d9210db8a21a9ac4f5"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7c378443a2b6f4d9210db8a21a9ac4f5">&#9670;&#160;</a></span>col_reduce_small()</h2>
+<a id="a82cd031d8014c02e61dc9a817ea6d4ec" name="a82cd031d8014c02e61dc9a817ea6d4ec"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a82cd031d8014c02e61dc9a817ea6d4ec">&#9670;&#160;</a></span>col_reduce_small()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int NDIMS&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void col_reduce_small </td>
diff --git a/docs/build/html/reduce__col_8h_source.html b/docs/build/html/reduce__col_8h_source.html
index caf4a5f82..3531ccde7 100644
--- a/docs/build/html/reduce__col_8h_source.html
+++ b/docs/build/html/reduce__col_8h_source.html
@@ -93,9 +93,9 @@ $(function(){ initResizable(false); });
 <div class="contents">
 <a href="reduce__col_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2023-2024 Apple Inc.</span></div>
 <div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
-<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> NDIMS&gt;</div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT, <span class="keywordtype">int</span> NDIMS&gt;</div>
 <div class="foldopen" id="foldopen00004" data-start="{" data-end="}">
-<div class="line"><a id="l00004" name="l00004"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5">    4</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5">col_reduce_small</a>(</div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec">    4</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec">col_reduce_small</a>(</div>
 <div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
 <div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span>    device U* out [[buffer(1)]],</div>
 <div class="line"><a id="l00007" name="l00007"></a><span class="lineno">    7</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
@@ -113,7 +113,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>    uint3 lsize [[threads_per_threadgroup]]) {</div>
 <div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_reads = 4;</div>
 <div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>  <a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt;NDIMS&gt;</a> loop;</div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>  <a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;NDIMS, IdxT, (NDIMS &gt; 2)&gt; loop(reduce_ndim);</div>
 <div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>  <span class="keyword">const</span> device T* row;</div>
 <div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span> </div>
 <div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>  U totals[n_reads];</div>
@@ -121,20 +121,20 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>    totals[i] = Op::init;</div>
 <div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>  }</div>
 <div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span> </div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  <span class="keywordtype">size_t</span> column = size_t(gid.x) * lsize.x * n_reads + lid.x * n_reads;</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  IdxT column = IdxT(gid.x) * lsize.x * n_reads + lid.x * n_reads;</div>
 <div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="keywordflow">if</span> (column &gt;= reduction_stride) {</div>
 <div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    <span class="keywordflow">return</span>;</div>
 <div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  }</div>
 <div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>  <span class="keywordtype">bool</span> safe = column + n_reads &lt;= reduction_stride;</div>
 <div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span> </div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="keywordtype">size_t</span> out_idx = gid.y + gsize.y * size_t(gid.z);</div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keywordtype">size_t</span> in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(out_idx, shape, strides, ndim);</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  IdxT in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(out_idx, shape, strides, ndim);</div>
 <div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  in += in_idx + column;</div>
 <div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span> </div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  <span class="keywordtype">size_t</span> total_rows = non_col_reductions * reduction_size;</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(lid.y, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>  <span class="keywordflow">for</span> (<span class="keywordtype">size_t</span> r = lid.y; r &lt; total_rows; r += lsize.y) {</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>    row = in + loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(r, reduce_shape, reduce_strides, reduce_ndim);</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  IdxT total_rows = IdxT(non_col_reductions) * IdxT(reduction_size);</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  loop.next(lid.y, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>  <span class="keywordflow">for</span> (IdxT r = lid.y; r &lt; total_rows; r += lsize.y) {</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>    row = in + loop.location();</div>
 <div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>    <span class="keywordflow">if</span> (safe) {</div>
 <div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
 <div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(row[i]), totals[i]);</div>
@@ -149,7 +149,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(vals[i], totals[i]);</div>
 <div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>      }</div>
 <div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>    }</div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>    loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(lsize.y, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>    loop.next(lsize.y, reduce_shape, reduce_strides);</div>
 <div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>  }</div>
 <div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span> </div>
 <div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>  <span class="keywordflow">if</span> (lsize.y &gt; 1) {</div>
@@ -174,7 +174,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>  }</div>
 <div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span> </div>
 <div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>  <span class="keywordflow">if</span> (lid.y == 0) {</div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    out += out_idx * reduction_stride + column;</div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    out += out_idx * IdxT(reduction_stride) + column;</div>
 <div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    <span class="keywordflow">if</span> (safe) {</div>
 <div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
 <div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>        out[i] = totals[i];</div>
@@ -188,9 +188,9 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>}</div>
 </div>
 <div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span> </div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> NDIMS&gt;</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keyword">typename</span> IdxT, <span class="keywordtype">int</span> NDIMS&gt;</div>
 <div class="foldopen" id="foldopen00097" data-start="{" data-end="}">
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb">   97</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb">col_reduce_longcolumn</a>(</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2">   97</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2">col_reduce_longcolumn</a>(</div>
 <div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
 <div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>    device U* out [[buffer(1)]],</div>
 <div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
@@ -208,21 +208,21 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>    uint3 lid [[thread_position_in_threadgroup]],</div>
 <div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>    uint3 lsize [[threads_per_threadgroup]]) {</div>
 <div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>  <a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt;NDIMS&gt;</a> loop;</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>  <a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;NDIMS, IdxT, (NDIMS &gt; 2)&gt; loop(reduce_ndim);</div>
 <div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>  <span class="keyword">const</span> device T* row;</div>
 <div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span> </div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>  <span class="keywordtype">size_t</span> out_idx = gid.x + gsize.x * size_t(gid.y);</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  <span class="keywordtype">size_t</span> in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(out_idx, shape, strides, ndim);</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>  IdxT out_idx = gid.x + gsize.x * IdxT(gid.y);</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  IdxT in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(out_idx, shape, strides, ndim);</div>
 <div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>  in += in_idx + lid.x;</div>
 <div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span> </div>
 <div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>  U total = Op::init;</div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>  <span class="keywordtype">size_t</span> total_rows = non_col_reductions * reduction_size;</div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>  loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(gid.z * lsize.y + lid.y, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>  <span class="keywordflow">for</span> (<span class="keywordtype">size_t</span> r = gid.z * lsize.y + lid.y; r &lt; total_rows;</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>  IdxT total_rows = IdxT(non_col_reductions) * IdxT(reduction_size);</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>  loop.next(gid.z * lsize.y + lid.y, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>  <span class="keywordflow">for</span> (IdxT r = gid.z * lsize.y + lid.y; r &lt; total_rows;</div>
 <div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>       r += lsize.y * gsize.z) {</div>
-<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    row = in + loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(r, reduce_shape, reduce_strides, reduce_ndim);</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    row = in + loop.location();</div>
 <div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(*row), total);</div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(lsize.y * gsize.z, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    loop.next(lsize.y * gsize.z, reduce_shape, reduce_strides);</div>
 <div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>  }</div>
 <div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span> </div>
 <div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>  threadgroup U shared_vals[32 * 32];</div>
@@ -232,256 +232,268 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>    <span class="keywordflow">for</span> (uint i = 1; i &lt; lsize.y; i++) {</div>
 <div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>      total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(total, shared_vals[i * lsize.x + lid.x]);</div>
 <div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>    }</div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    out[gid.z * out_size + out_idx * reduction_stride + lid.x] = total;</div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>  }</div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>}</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>    out[gid.z * IdxT(out_size) + out_idx * IdxT(reduction_stride) + lid.x] =</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>        total;</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  }</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>}</div>
 </div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span> </div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> NDIMS, <span class="keywordtype">int</span> BM, <span class="keywordtype">int</span> BN&gt;</div>
-<div class="foldopen" id="foldopen00155" data-start="{" data-end="}">
-<div class="line"><a id="l00155" name="l00155"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385">  155</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385">col_reduce_looped</a>(</div>
-<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
-<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    device U* out [[buffer(1)]],</div>
-<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
-<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_stride [[buffer(3)]],</div>
-<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
-<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
-<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
-<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
-<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
-<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
-<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_col_reductions [[buffer(10)]],</div>
-<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    uint3 gsize [[threadgroups_per_grid]],</div>
-<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
-<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
-<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_simdgroups = 8;</div>
-<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> tgp_size = n_simdgroups * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>;</div>
-<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_reads = (BM * BN) / tgp_size;</div>
-<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_read_blocks = BN / n_reads;</div>
-<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span> </div>
-<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>  threadgroup U shared_vals[BN * BM];</div>
-<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>  U totals[n_reads];</div>
-<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>  <a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt;NDIMS&gt;</a> loop;</div>
-<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>  <span class="keyword">const</span> device T* row;</div>
-<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span> </div>
-<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>    totals[i] = Op::init;</div>
-<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>  }</div>
-<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span> </div>
-<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>  <span class="keywordtype">short</span> lid = simd_group_id * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> + simd_lane_id;</div>
-<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);</div>
-<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  <span class="keywordtype">size_t</span> column = BN * gid.x + offset.x;</div>
-<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>  <span class="keywordtype">bool</span> safe = column + n_reads &lt;= reduction_stride;</div>
-<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span> </div>
-<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>  <span class="keywordtype">size_t</span> out_idx = gid.y + gsize.y * size_t(gid.z);</div>
-<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>  <span class="keywordtype">size_t</span> in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(out_idx, shape, strides, ndim);</div>
-<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>  in += in_idx + column;</div>
-<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span> </div>
-<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>  <span class="keywordtype">size_t</span> total = non_col_reductions * reduction_size;</div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>  loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(offset.y, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>  <span class="keywordflow">for</span> (<span class="keywordtype">size_t</span> r = offset.y; r &lt; total; r += BM) {</div>
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>    row = in + loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(r, reduce_shape, reduce_strides, reduce_ndim);</div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span> </div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>    <span class="keywordflow">if</span> (safe) {</div>
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(row[i]), totals[i]);</div>
-<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>      }</div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>      U vals[n_reads];</div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>        vals[i] =</div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>            (column + i &lt; reduction_stride) ? static_cast&lt;U&gt;(row[i]) : <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.init;</div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>      }</div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(vals[i], totals[i]);</div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>      }</div>
-<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>    }</div>
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span> </div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>    loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(BM, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>  }</div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span> </div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>  <span class="comment">// We can use a simd reduction to accumulate across BM so each thread writes</span></div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>  <span class="comment">// the partial output to SM and then each simdgroup does BN / n_simdgroups</span></div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>  <span class="comment">// accumulations.</span></div>
-<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>  <span class="keywordflow">if</span> (BM == 32) {</div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>    <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_outputs = BN / n_simdgroups;</div>
-<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>    <span class="keyword">static_assert</span>(</div>
-<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>        BM != 32 || n_outputs == n_reads,</div>
-<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>        <span class="stringliteral">&quot;The tile should be selected such that n_outputs == n_reads&quot;</span>);</div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>      shared_vals[offset.y * BN + offset.x + i] = totals[i];</div>
-<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>    }</div>
-<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    short2 out_offset(simd_group_id * n_outputs, simd_lane_id);</div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>      totals[i] =</div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>          <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);</div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>    }</div>
-<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span> </div>
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>    <span class="comment">// Write the output.</span></div>
-<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>    <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>      <span class="keywordtype">size_t</span> out_column = BN * gid.x + out_offset.x;</div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>      out += out_idx * reduction_stride + out_column;</div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>      <span class="keywordflow">if</span> (out_column + n_outputs &lt;= reduction_stride) {</div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>          out[i] = totals[i];</div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>        }</div>
-<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>      } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; out_column + i &lt; reduction_stride; i++) {</div>
-<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>          out[i] = totals[i];</div>
-<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>        }</div>
-<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>      }</div>
-<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    }</div>
-<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>  }</div>
-<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span> </div>
-<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>  <span class="comment">// Each thread holds n_reads partial results. We write them all out to shared</span></div>
-<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>  <span class="comment">// memory and threads with offset.y == 0 aggregate the columns and write the</span></div>
-<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>  <span class="comment">// outputs.</span></div>
-<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>  <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>    <span class="keywordtype">short</span> x_block = offset.x / n_reads;</div>
-<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>      shared_vals[x_block * BM * n_reads + i * BM + offset.y] = totals[i];</div>
-<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>    }</div>
-<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>    <span class="keywordflow">if</span> (offset.y == 0) {</div>
-<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 1; j &lt; BM; j++) {</div>
-<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>          totals[i] =</div>
-<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>              <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(shared_vals[x_block * BM * n_reads + i * BM + j], totals[i]);</div>
-<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>        }</div>
-<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>      }</div>
-<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>    }</div>
-<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span> </div>
-<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>    <span class="comment">// Write the output.</span></div>
-<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>    <span class="keywordflow">if</span> (offset.y == 0) {</div>
-<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>      out += out_idx * reduction_stride + column;</div>
-<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>      <span class="keywordflow">if</span> (safe) {</div>
-<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>          out[i] = totals[i];</div>
-<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>        }</div>
-<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>      } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; column + i &lt; reduction_stride; i++) {</div>
-<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>          out[i] = totals[i];</div>
-<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>        }</div>
-<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>      }</div>
-<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>    }</div>
-<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>  }</div>
-<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>}</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span> </div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>    <span class="keyword">typename</span> IdxT,</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>    <span class="keywordtype">int</span> NDIMS,</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>    <span class="keywordtype">int</span> BM,</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>    <span class="keywordtype">int</span> BN&gt;</div>
+<div class="foldopen" id="foldopen00163" data-start="{" data-end="}">
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02">  163</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02">col_reduce_looped</a>(</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>    device U* out [[buffer(1)]],</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_stride [[buffer(3)]],</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_col_reductions [[buffer(10)]],</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>    uint3 gsize [[threadgroups_per_grid]],</div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_simdgroups = 8;</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> tgp_size = n_simdgroups * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>;</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_reads = (BM * BN) / tgp_size;</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_read_blocks = BN / n_reads;</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span> </div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>  threadgroup U shared_vals[BN * BM];</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>  U totals[n_reads];</div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>  <a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;NDIMS, IdxT, (NDIMS &gt; 2)&gt; loop(reduce_ndim);</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  <span class="keyword">const</span> device T* row;</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span> </div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>    totals[i] = Op::init;</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>  }</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span> </div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>  <span class="keywordtype">short</span> lid = simd_group_id * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> + simd_lane_id;</div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>  IdxT column = BN * gid.x + offset.x;</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>  <span class="keywordtype">bool</span> safe = column + n_reads &lt;= reduction_stride;</div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span> </div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>  IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>  IdxT in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(out_idx, shape, strides, ndim);</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>  in += in_idx + column;</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span> </div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>  IdxT total = IdxT(non_col_reductions) * IdxT(reduction_size);</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>  loop.next(offset.y, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>  <span class="keywordflow">for</span> (IdxT r = offset.y; r &lt; total; r += BM) {</div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>    row = in + loop.location();</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span> </div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>    <span class="keywordflow">if</span> (safe) {</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(row[i]), totals[i]);</div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>      }</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>      U vals[n_reads];</div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>        vals[i] =</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>            (column + i &lt; reduction_stride) ? static_cast&lt;U&gt;(row[i]) : <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.init;</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>      }</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(vals[i], totals[i]);</div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>      }</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>    }</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span> </div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>    loop.next(BM, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>  }</div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span> </div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>  <span class="comment">// We can use a simd reduction to accumulate across BM so each thread writes</span></div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>  <span class="comment">// the partial output to SM and then each simdgroup does BN / n_simdgroups</span></div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>  <span class="comment">// accumulations.</span></div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>  <span class="keywordflow">if</span> (BM == 32) {</div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_outputs = BN / n_simdgroups;</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>    <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>        BM != 32 || n_outputs == n_reads,</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>        <span class="stringliteral">&quot;The tile should be selected such that n_outputs == n_reads&quot;</span>);</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>      shared_vals[offset.y * BN + offset.x + i] = totals[i];</div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>    }</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>    short2 out_offset(simd_group_id * n_outputs, simd_lane_id);</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>      totals[i] =</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>          <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>    }</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span> </div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>    <span class="comment">// Write the output.</span></div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>    <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>      IdxT out_column = BN * gid.x + out_offset.x;</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>      out += out_idx * IdxT(reduction_stride) + out_column;</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>      <span class="keywordflow">if</span> (out_column + n_outputs &lt;= reduction_stride) {</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>          out[i] = totals[i];</div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>        }</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>      } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; out_column + i &lt; reduction_stride; i++) {</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>          out[i] = totals[i];</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>        }</div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>      }</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>    }</div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>  }</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span> </div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>  <span class="comment">// Each thread holds n_reads partial results. We write them all out to shared</span></div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>  <span class="comment">// memory and threads with offset.y == 0 aggregate the columns and write the</span></div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>  <span class="comment">// outputs.</span></div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>  <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>    <span class="keywordtype">short</span> x_block = offset.x / n_reads;</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>      shared_vals[x_block * BM * n_reads + i * BM + offset.y] = totals[i];</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>    }</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>    <span class="keywordflow">if</span> (offset.y == 0) {</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 1; j &lt; BM; j++) {</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>          totals[i] =</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>              <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(shared_vals[x_block * BM * n_reads + i * BM + j], totals[i]);</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>        }</div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>      }</div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>    }</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span> </div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>    <span class="comment">// Write the output.</span></div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>    <span class="keywordflow">if</span> (offset.y == 0) {</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>      out += out_idx * IdxT(reduction_stride) + column;</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>      <span class="keywordflow">if</span> (safe) {</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>          out[i] = totals[i];</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>        }</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>      } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>        <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; column + i &lt; reduction_stride; i++) {</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>          out[i] = totals[i];</div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>        }</div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>      }</div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>    }</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>  }</div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>}</div>
 </div>
-<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span> </div>
-<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keyword">typename</span> U, <span class="keyword">typename</span> Op, <span class="keywordtype">int</span> NDIMS, <span class="keywordtype">int</span> BM, <span class="keywordtype">int</span> BN&gt;</div>
-<div class="foldopen" id="foldopen00287" data-start="{" data-end="}">
-<div class="line"><a id="l00287" name="l00287"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d">  287</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d">col_reduce_2pass</a>(</div>
-<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
-<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>    device U* out [[buffer(1)]],</div>
-<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
-<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_stride [[buffer(3)]],</div>
-<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
-<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
-<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
-<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
-<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
-<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
-<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_col_reductions [[buffer(10)]],</div>
-<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; out_size [[buffer(11)]],</div>
-<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>    uint3 gsize [[threadgroups_per_grid]],</div>
-<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
-<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
-<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_simdgroups = 8;</div>
-<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> tgp_size = n_simdgroups * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>;</div>
-<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_reads = (BM * BN) / tgp_size;</div>
-<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_read_blocks = BN / n_reads;</div>
-<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_outputs = BN / n_simdgroups;</div>
-<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> outer_blocks = 32;</div>
-<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>  <span class="keyword">static_assert</span>(BM == 32, <span class="stringliteral">&quot;BM should be equal to 32&quot;</span>);</div>
-<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span> </div>
-<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>  threadgroup U shared_vals[BN * BM];</div>
-<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>  U totals[n_reads];</div>
-<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>  <a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt;NDIMS&gt;</a> loop;</div>
-<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>  <span class="keyword">const</span> device T* row;</div>
-<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span> </div>
-<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>    totals[i] = Op::init;</div>
-<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>  }</div>
-<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span> </div>
-<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>  <span class="keywordtype">short</span> lid = simd_group_id * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> + simd_lane_id;</div>
-<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);</div>
-<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>  <span class="keywordtype">size_t</span> column = BN * gid.x + offset.x;</div>
-<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>  <span class="keywordtype">bool</span> safe = column + n_reads &lt;= reduction_stride;</div>
-<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span> </div>
-<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span>  <span class="keywordtype">size_t</span> full_idx = gid.y + gsize.y * size_t(gid.z);</div>
-<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>  <span class="keywordtype">size_t</span> block_idx = full_idx / out_size;</div>
-<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>  <span class="keywordtype">size_t</span> out_idx = full_idx % out_size;</div>
-<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>  <span class="keywordtype">size_t</span> in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(out_idx, shape, strides, ndim);</div>
-<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>  in += in_idx + column;</div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span> </div>
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>    <span class="keyword">typename</span> IdxT,</div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>    <span class="keywordtype">int</span> NDIMS,</div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>    <span class="keywordtype">int</span> BM,</div>
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>    <span class="keywordtype">int</span> BN&gt;</div>
+<div class="foldopen" id="foldopen00302" data-start="{" data-end="}">
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno"><a class="line" href="reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29">  302</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29">col_reduce_2pass</a>(</div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>    device U* out [[buffer(1)]],</div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
+<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_stride [[buffer(3)]],</div>
+<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
+<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
+<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_col_reductions [[buffer(10)]],</div>
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; out_size [[buffer(11)]],</div>
+<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>    uint3 gsize [[threadgroups_per_grid]],</div>
+<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
+<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
+<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_simdgroups = 8;</div>
+<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> tgp_size = n_simdgroups * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>;</div>
+<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_reads = (BM * BN) / tgp_size;</div>
+<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> n_read_blocks = BN / n_reads;</div>
+<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> n_outputs = BN / n_simdgroups;</div>
+<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> outer_blocks = 32;</div>
+<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span>  <span class="keyword">static_assert</span>(BM == 32, <span class="stringliteral">&quot;BM should be equal to 32&quot;</span>);</div>
+<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span> </div>
+<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>  threadgroup U shared_vals[BN * BM];</div>
+<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>  U totals[n_reads];</div>
+<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>  <a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;NDIMS, IdxT, (NDIMS &gt; 2)&gt; loop(reduce_ndim);</div>
+<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>  <span class="keyword">const</span> device T* row;</div>
 <div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span> </div>
-<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>  <span class="keywordtype">size_t</span> total = non_col_reductions * reduction_size;</div>
-<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>  loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(offset.y + block_idx * BM, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>  <span class="keywordflow">for</span> (<span class="keywordtype">size_t</span> r = offset.y + block_idx * BM; r &lt; total;</div>
-<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>       r += outer_blocks * BM) {</div>
-<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>    row = in + loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(r, reduce_shape, reduce_strides, reduce_ndim);</div>
-<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span> </div>
-<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>    <span class="keywordflow">if</span> (safe) {</div>
-<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(row[i]), totals[i]);</div>
-<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>      }</div>
-<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>      U vals[n_reads];</div>
-<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>        vals[i] =</div>
-<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span>            (column + i &lt; reduction_stride) ? static_cast&lt;U&gt;(row[i]) : <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.init;</div>
-<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>      }</div>
-<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(vals[i], totals[i]);</div>
-<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>      }</div>
-<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>    }</div>
-<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span> </div>
-<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>    loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(outer_blocks * BM, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>  }</div>
-<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span> </div>
-<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span>  <span class="comment">// We can use a simd reduction to accumulate across BM so each thread writes</span></div>
-<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>  <span class="comment">// the partial output to SM and then each simdgroup does BN / n_simdgroups</span></div>
-<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>  <span class="comment">// accumulations.</span></div>
-<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
-<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>    shared_vals[offset.y * BN + offset.x + i] = totals[i];</div>
-<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>  }</div>
-<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>  short2 out_offset(simd_group_id * n_outputs, simd_lane_id);</div>
-<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
-<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>    totals[i] =</div>
-<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span>        <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);</div>
-<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span>  }</div>
-<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span> </div>
-<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span>  <span class="comment">// Write the output.</span></div>
-<div class="line"><a id="l00371" name="l00371"></a><span class="lineno">  371</span>  <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
-<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span>    <span class="keywordtype">size_t</span> out_column = BN * gid.x + out_offset.x;</div>
-<div class="line"><a id="l00373" name="l00373"></a><span class="lineno">  373</span>    out += full_idx * reduction_stride + out_column;</div>
-<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span>    <span class="keywordflow">if</span> (out_column + n_outputs &lt;= reduction_stride) {</div>
-<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
-<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span>        out[i] = totals[i];</div>
-<div class="line"><a id="l00377" name="l00377"></a><span class="lineno">  377</span>      }</div>
-<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span>    } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; out_column + i &lt; reduction_stride; i++) {</div>
-<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>        out[i] = totals[i];</div>
-<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span>      }</div>
-<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span>    }</div>
-<div class="line"><a id="l00383" name="l00383"></a><span class="lineno">  383</span>  }</div>
-<div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span>}</div>
+<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>    totals[i] = Op::init;</div>
+<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>  }</div>
+<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span> </div>
+<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>  <span class="keywordtype">short</span> lid = simd_group_id * <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> + simd_lane_id;</div>
+<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span>  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);</div>
+<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>  IdxT column = BN * gid.x + offset.x;</div>
+<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span>  <span class="keywordtype">bool</span> safe = column + n_reads &lt;= reduction_stride;</div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span> </div>
+<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>  IdxT full_idx = gid.y + gsize.y * IdxT(gid.z);</div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>  IdxT block_idx = full_idx / IdxT(out_size);</div>
+<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>  IdxT out_idx = full_idx % IdxT(out_size);</div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>  IdxT in_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(out_idx, shape, strides, ndim);</div>
+<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>  in += in_idx + column;</div>
+<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span> </div>
+<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>  IdxT total = IdxT(non_col_reductions) * IdxT(reduction_size);</div>
+<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>  loop.next(offset.y + block_idx * BM, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>  <span class="keywordflow">for</span> (IdxT r = offset.y + block_idx * BM; r &lt; total; r += outer_blocks * BM) {</div>
+<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>    row = in + loop.location();</div>
+<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span> </div>
+<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>    <span class="keywordflow">if</span> (safe) {</div>
+<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(<span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(row[i]), totals[i]);</div>
+<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>      }</div>
+<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>      U vals[n_reads];</div>
+<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>        vals[i] =</div>
+<div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span>            (column + i &lt; reduction_stride) ? static_cast&lt;U&gt;(row[i]) : <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.init;</div>
+<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>      }</div>
+<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>        totals[i] = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(vals[i], totals[i]);</div>
+<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span>      }</div>
+<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>    }</div>
+<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span> </div>
+<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span>    loop.next(outer_blocks * BM, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span>  }</div>
+<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span> </div>
+<div class="line"><a id="l00371" name="l00371"></a><span class="lineno">  371</span>  <span class="comment">// We can use a simd reduction to accumulate across BM so each thread writes</span></div>
+<div class="line"><a id="l00372" name="l00372"></a><span class="lineno">  372</span>  <span class="comment">// the partial output to SM and then each simdgroup does BN / n_simdgroups</span></div>
+<div class="line"><a id="l00373" name="l00373"></a><span class="lineno">  373</span>  <span class="comment">// accumulations.</span></div>
+<div class="line"><a id="l00374" name="l00374"></a><span class="lineno">  374</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_reads; i++) {</div>
+<div class="line"><a id="l00375" name="l00375"></a><span class="lineno">  375</span>    shared_vals[offset.y * BN + offset.x + i] = totals[i];</div>
+<div class="line"><a id="l00376" name="l00376"></a><span class="lineno">  376</span>  }</div>
+<div class="line"><a id="l00377" name="l00377"></a><span class="lineno">  377</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00378" name="l00378"></a><span class="lineno">  378</span>  short2 out_offset(simd_group_id * n_outputs, simd_lane_id);</div>
+<div class="line"><a id="l00379" name="l00379"></a><span class="lineno">  379</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
+<div class="line"><a id="l00380" name="l00380"></a><span class="lineno">  380</span>    totals[i] =</div>
+<div class="line"><a id="l00381" name="l00381"></a><span class="lineno">  381</span>        <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);</div>
+<div class="line"><a id="l00382" name="l00382"></a><span class="lineno">  382</span>  }</div>
+<div class="line"><a id="l00383" name="l00383"></a><span class="lineno">  383</span> </div>
+<div class="line"><a id="l00384" name="l00384"></a><span class="lineno">  384</span>  <span class="comment">// Write the output.</span></div>
+<div class="line"><a id="l00385" name="l00385"></a><span class="lineno">  385</span>  <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
+<div class="line"><a id="l00386" name="l00386"></a><span class="lineno">  386</span>    IdxT out_column = BN * gid.x + out_offset.x;</div>
+<div class="line"><a id="l00387" name="l00387"></a><span class="lineno">  387</span>    out += full_idx * IdxT(reduction_stride) + out_column;</div>
+<div class="line"><a id="l00388" name="l00388"></a><span class="lineno">  388</span>    <span class="keywordflow">if</span> (out_column + n_outputs &lt;= reduction_stride) {</div>
+<div class="line"><a id="l00389" name="l00389"></a><span class="lineno">  389</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; n_outputs; i++) {</div>
+<div class="line"><a id="l00390" name="l00390"></a><span class="lineno">  390</span>        out[i] = totals[i];</div>
+<div class="line"><a id="l00391" name="l00391"></a><span class="lineno">  391</span>      }</div>
+<div class="line"><a id="l00392" name="l00392"></a><span class="lineno">  392</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00393" name="l00393"></a><span class="lineno">  393</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; out_column + i &lt; reduction_stride; i++) {</div>
+<div class="line"><a id="l00394" name="l00394"></a><span class="lineno">  394</span>        out[i] = totals[i];</div>
+<div class="line"><a id="l00395" name="l00395"></a><span class="lineno">  395</span>      }</div>
+<div class="line"><a id="l00396" name="l00396"></a><span class="lineno">  396</span>    }</div>
+<div class="line"><a id="l00397" name="l00397"></a><span class="lineno">  397</span>  }</div>
+<div class="line"><a id="l00398" name="l00398"></a><span class="lineno">  398</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3"><div class="ttname"><a href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a></div><div class="ttdeci">static constant constexpr const uint8_t simd_size</div><div class="ttdef"><b>Definition</b> ops.h:22</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
-<div class="ttc" id="areduce__col_8h_html_a0e92fc74eeaa8ee2ceb83bafc6eb1d7d"><div class="ttname"><a href="reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d">col_reduce_2pass</a></div><div class="ttdeci">void col_reduce_2pass(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_col.h:287</div></div>
-<div class="ttc" id="areduce__col_8h_html_a11bfc6112ae2386ac03f5ea7b7d93385"><div class="ttname"><a href="reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385">col_reduce_looped</a></div><div class="ttdeci">void col_reduce_looped(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</div><div class="ttdoc">Our approach is the following simple looped approach:</div><div class="ttdef"><b>Definition</b> reduce_col.h:155</div></div>
-<div class="ttc" id="areduce__col_8h_html_a5b4f4c4c247ad341ff8d31dcbbbce0eb"><div class="ttname"><a href="reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb">col_reduce_longcolumn</a></div><div class="ttdeci">void col_reduce_longcolumn(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</div><div class="ttdef"><b>Definition</b> reduce_col.h:97</div></div>
-<div class="ttc" id="areduce__col_8h_html_a7c378443a2b6f4d9210db8a21a9ac4f5"><div class="ttname"><a href="reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5">col_reduce_small</a></div><div class="ttdeci">void col_reduce_small(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</div><div class="ttdef"><b>Definition</b> reduce_col.h:4</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html"><div class="ttname"><a href="structlooped__elem__to__loc.html">looped_elem_to_loc</a></div><div class="ttdef"><b>Definition</b> utils.h:197</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_a05558dabba889ee0d80ed4b567d901ca"><div class="ttname"><a href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">looped_elem_to_loc::next</a></div><div class="ttdeci">void next(const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:202</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_accc6d4957a8aeb38f5062754793b74d2"><div class="ttname"><a href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">looped_elem_to_loc::location</a></div><div class="ttdeci">offset_t location(offset_t, const constant int *, const constant size_t *, int)</div><div class="ttdef"><b>Definition</b> utils.h:229</div></div>
+<div class="ttc" id="areduce__col_8h_html_a82cd031d8014c02e61dc9a817ea6d4ec"><div class="ttname"><a href="reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec">col_reduce_small</a></div><div class="ttdeci">void col_reduce_small(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</div><div class="ttdef"><b>Definition</b> reduce_col.h:4</div></div>
+<div class="ttc" id="areduce__col_8h_html_a9a7be400d810700b47fc1a998032ce29"><div class="ttname"><a href="reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29">col_reduce_2pass</a></div><div class="ttdeci">void col_reduce_2pass(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_col.h:302</div></div>
+<div class="ttc" id="areduce__col_8h_html_aa3287cd98e97123b67b5d3920d984ca2"><div class="ttname"><a href="reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2">col_reduce_longcolumn</a></div><div class="ttdeci">void col_reduce_longcolumn(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)</div><div class="ttdef"><b>Definition</b> reduce_col.h:97</div></div>
+<div class="ttc" id="areduce__col_8h_html_ae8f9354e1c595142d05b33fe13988f02"><div class="ttname"><a href="reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02">col_reduce_looped</a></div><div class="ttdeci">void col_reduce_looped(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, const constant size_t &amp;non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)</div><div class="ttdoc">Our approach is the following simple looped approach:</div><div class="ttdef"><b>Definition</b> reduce_col.h:163</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html"><div class="ttname"><a href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a></div><div class="ttdef"><b>Definition</b> utils.h:208</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/reduce__row_8h.html b/docs/build/html/reduce__row_8h.html
index 15e218f00..d85dd87a3 100644
--- a/docs/build/html/reduce__row_8h.html
+++ b/docs/build/html/reduce__row_8h.html
@@ -117,15 +117,15 @@ Functions</h2></td></tr>
 <tr class="memitem:afd80a25fa84e6cc884dcc8698859ade1" id="r_afd80a25fa84e6cc884dcc8698859ade1"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N_READS = REDUCE_N_READS&gt; </td></tr>
 <tr class="memitem:afd80a25fa84e6cc884dcc8698859ade1"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#afd80a25fa84e6cc884dcc8698859ade1">thread_reduce</a> (thread U &amp;total, const device T *row, int blocks, int extra)</td></tr>
 <tr class="separator:afd80a25fa84e6cc884dcc8698859ade1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a27e75312086e31f6bd1bbf4b366679da" id="r_a27e75312086e31f6bd1bbf4b366679da"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int NDIMS, int N_READS = REDUCE_N_READS&gt; </td></tr>
-<tr class="memitem:a27e75312086e31f6bd1bbf4b366679da"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a27e75312086e31f6bd1bbf4b366679da">row_reduce_small</a> (const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint simd_lane_id, uint3 gid, uint3 gsize, uint3 tid, uint3 tsize)</td></tr>
-<tr class="separator:a27e75312086e31f6bd1bbf4b366679da"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac01d30987668930c8b38900e47b8308b" id="r_ac01d30987668930c8b38900e47b8308b"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES&gt; </td></tr>
-<tr class="memitem:ac01d30987668930c8b38900e47b8308b"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac01d30987668930c8b38900e47b8308b">row_reduce_simple</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</td></tr>
-<tr class="separator:ac01d30987668930c8b38900e47b8308b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad98332d74a6824aa7499df3e2f2246ae" id="r_ad98332d74a6824aa7499df3e2f2246ae"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , int NDIMS, int N_READS = REDUCE_N_READS&gt; </td></tr>
-<tr class="memitem:ad98332d74a6824aa7499df3e2f2246ae"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad98332d74a6824aa7499df3e2f2246ae">row_reduce_looped</a> (const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</td></tr>
-<tr class="separator:ad98332d74a6824aa7499df3e2f2246ae"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aeb49e89f1163cb3093770bb710df9f5e" id="r_aeb49e89f1163cb3093770bb710df9f5e"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int N_READS = REDUCE_N_READS&gt; </td></tr>
+<tr class="memitem:aeb49e89f1163cb3093770bb710df9f5e"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aeb49e89f1163cb3093770bb710df9f5e">row_reduce_small</a> (const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint simd_lane_id, uint3 gid, uint3 gsize, uint3 tid, uint3 tsize)</td></tr>
+<tr class="separator:aeb49e89f1163cb3093770bb710df9f5e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aef628dfccdb1361da5546f8b17c510bf" id="r_aef628dfccdb1361da5546f8b17c510bf"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT  = size_t, int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES&gt; </td></tr>
+<tr class="memitem:aef628dfccdb1361da5546f8b17c510bf"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aef628dfccdb1361da5546f8b17c510bf">row_reduce_simple</a> (const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</td></tr>
+<tr class="separator:aef628dfccdb1361da5546f8b17c510bf"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:afba85f5a1c935c124ef52e986d4b2c49" id="r_afba85f5a1c935c124ef52e986d4b2c49"><td class="memTemplParams" colspan="2">template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int N_READS = REDUCE_N_READS&gt; </td></tr>
+<tr class="memitem:afba85f5a1c935c124ef52e986d4b2c49"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#afba85f5a1c935c124ef52e986d4b2c49">row_reduce_looped</a> (const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</td></tr>
+<tr class="separator:afba85f5a1c935c124ef52e986d4b2c49"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
 <a id="a045ec34228e77c79ec67d11c39ff097a" name="a045ec34228e77c79ec67d11c39ff097a"></a>
@@ -289,13 +289,13 @@ template&lt;typename T , typename U , typename Op , int N_READS = REDUCE_N_READS
 
 </div>
 </div>
-<a id="ad98332d74a6824aa7499df3e2f2246ae" name="ad98332d74a6824aa7499df3e2f2246ae"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ad98332d74a6824aa7499df3e2f2246ae">&#9670;&#160;</a></span>row_reduce_looped()</h2>
+<a id="afba85f5a1c935c124ef52e986d4b2c49" name="afba85f5a1c935c124ef52e986d4b2c49"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#afba85f5a1c935c124ef52e986d4b2c49">&#9670;&#160;</a></span>row_reduce_looped()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int NDIMS, int N_READS = REDUCE_N_READS&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int N_READS = REDUCE_N_READS&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void row_reduce_looped </td>
@@ -387,13 +387,13 @@ template&lt;typename T , typename U , typename Op , int NDIMS, int N_READS = RED
 
 </div>
 </div>
-<a id="ac01d30987668930c8b38900e47b8308b" name="ac01d30987668930c8b38900e47b8308b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ac01d30987668930c8b38900e47b8308b">&#9670;&#160;</a></span>row_reduce_simple()</h2>
+<a id="aef628dfccdb1361da5546f8b17c510bf" name="aef628dfccdb1361da5546f8b17c510bf"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aef628dfccdb1361da5546f8b17c510bf">&#9670;&#160;</a></span>row_reduce_simple()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT  = size_t, int N_READS = REDUCE_N_READS, int N_WRITES = REDUCE_N_WRITES&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void row_reduce_simple </td>
@@ -455,13 +455,13 @@ template&lt;typename T , typename U , typename Op , int N_READS = REDUCE_N_READS
 
 </div>
 </div>
-<a id="a27e75312086e31f6bd1bbf4b366679da" name="a27e75312086e31f6bd1bbf4b366679da"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a27e75312086e31f6bd1bbf4b366679da">&#9670;&#160;</a></span>row_reduce_small()</h2>
+<a id="aeb49e89f1163cb3093770bb710df9f5e" name="aeb49e89f1163cb3093770bb710df9f5e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aeb49e89f1163cb3093770bb710df9f5e">&#9670;&#160;</a></span>row_reduce_small()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename U , typename Op , int NDIMS, int N_READS = REDUCE_N_READS&gt; </div>
+template&lt;typename T , typename U , typename Op , typename IdxT , int NDIMS, int N_READS = REDUCE_N_READS&gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">void row_reduce_small </td>
diff --git a/docs/build/html/reduce__row_8h_source.html b/docs/build/html/reduce__row_8h_source.html
index d59088ed9..588211d06 100644
--- a/docs/build/html/reduce__row_8h_source.html
+++ b/docs/build/html/reduce__row_8h_source.html
@@ -198,7 +198,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  <span class="keyword">const</span> device T* inputs[N_WRITES];</div>
 <div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  in += lid_x * N_READS;</div>
 <div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N_READS; i++) {</div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>    inputs[i] = in + <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(row_idx + i, shape, strides, ndim);</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>    inputs[i] = in + <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(row_idx + i, shape, strides, ndim);</div>
 <div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>  }</div>
 <div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span> </div>
 <div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>  <a class="code hl_function" href="reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f">per_thread_row_reduce&lt;T, U, Op, N_READS, N_WRITES&gt;</a>(</div>
@@ -283,197 +283,199 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>    <span class="keyword">typename</span> T,</div>
 <div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>    <span class="keyword">typename</span> U,</div>
 <div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    <span class="keyword">typename</span> Op,</div>
-<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>    <span class="keywordtype">int</span> NDIMS,</div>
-<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>    <span class="keywordtype">int</span> N_READS = <a class="code hl_variable" href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a>&gt;</div>
-<div class="foldopen" id="foldopen00198" data-start="{" data-end="}">
-<div class="line"><a id="l00198" name="l00198"></a><span class="lineno"><a class="line" href="reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da">  198</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da">row_reduce_small</a>(</div>
-<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
-<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>    device U* out [[buffer(1)]],</div>
-<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; row_size [[buffer(2)]],</div>
-<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_row_reductions [[buffer(3)]],</div>
-<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
-<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
-<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
-<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
-<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
-<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
-<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
-<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>    uint3 gsize [[threadgroups_per_grid]],</div>
-<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>    uint3 tid [[thread_position_in_grid]],</div>
-<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>    uint3 tsize [[threads_per_grid]]) {</div>
-<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span> </div>
-<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>  U total_val = Op::init;</div>
-<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>  <a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt;NDIMS&gt;</a> loop;</div>
-<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span> </div>
-<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>  <span class="comment">// Precompute some row reduction numbers</span></div>
-<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>  <span class="keyword">const</span> device T* row;</div>
-<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>  <span class="keywordtype">int</span> blocks = row_size / N_READS;</div>
-<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>  <span class="keywordtype">int</span> extra = row_size % N_READS;</div>
-<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span> </div>
-<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>  <span class="keywordflow">if</span> ((non_row_reductions &lt; 32 &amp;&amp; row_size &lt;= 8) || non_row_reductions &lt;= 8) {</div>
-<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>    <span class="comment">// Simple loop over non_row_reductions and reduce the row in the thread.</span></div>
-<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    <span class="keywordtype">size_t</span> out_idx = tid.x + tsize.y * size_t(tid.y);</div>
-<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>    in += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(out_idx, shape, strides, ndim);</div>
-<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span> </div>
-<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>    <span class="keywordflow">for</span> (uint r = 0; r &lt; non_row_reductions; r++) {</div>
-<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>      row = in + loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(r, reduce_shape, reduce_strides, reduce_ndim);</div>
-<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>      <a class="code hl_function" href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1">thread_reduce&lt;T, U, Op, N_READS&gt;</a>(total_val, row, blocks, extra);</div>
-<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>      loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>    }</div>
-<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span> </div>
-<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>    out[out_idx] = total_val;</div>
-<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>  } <span class="keywordflow">else</span> {</div>
-<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>    <span class="comment">// Collaboratively reduce over non_row_reductions in the simdgroup. Each</span></div>
-<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>    <span class="comment">// thread reduces every 32nd row and then a simple simd reduce.</span></div>
-<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    <span class="keywordtype">size_t</span> out_idx = gid.y + gsize.y * size_t(gid.z);</div>
-<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>    in += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(out_idx, shape, strides, ndim);</div>
-<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span> </div>
-<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>    loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(simd_lane_id, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span> </div>
-<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>    <span class="keywordflow">for</span> (uint r = simd_lane_id; r &lt; non_row_reductions; r += <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>) {</div>
-<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>      row = in + loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(r, reduce_shape, reduce_strides, reduce_ndim);</div>
-<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>      <a class="code hl_function" href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1">thread_reduce&lt;T, U, Op, N_READS&gt;</a>(total_val, row, blocks, extra);</div>
-<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>      loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>, reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>    }</div>
-<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span> </div>
-<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>    total_val = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(total_val);</div>
-<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span> </div>
-<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>    <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
-<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>      out[out_idx] = total_val;</div>
-<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>    }</div>
-<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>  }</div>
-<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>}</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>    <span class="keyword">typename</span> IdxT,</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>    <span class="keywordtype">int</span> NDIMS,</div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>    <span class="keywordtype">int</span> N_READS = <a class="code hl_variable" href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a>&gt;</div>
+<div class="foldopen" id="foldopen00199" data-start="{" data-end="}">
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno"><a class="line" href="reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e">  199</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e">row_reduce_small</a>(</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>    device U* out [[buffer(1)]],</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; row_size [[buffer(2)]],</div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_row_reductions [[buffer(3)]],</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>    uint3 gsize [[threadgroups_per_grid]],</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>    uint3 tid [[thread_position_in_grid]],</div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>    uint3 tsize [[threads_per_grid]]) {</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span> </div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>  U total_val = Op::init;</div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>  <a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;NDIMS, IdxT, (NDIMS &gt; 2)&gt; loop(reduce_ndim);</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span> </div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>  <span class="comment">// Precompute some row reduction numbers</span></div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>  <span class="keyword">const</span> device T* row;</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>  <span class="keywordtype">int</span> blocks = IdxT(row_size) / N_READS;</div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>  <span class="keywordtype">int</span> extra = IdxT(row_size) % N_READS;</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span> </div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>  <span class="keywordflow">if</span> ((non_row_reductions &lt; 32 &amp;&amp; row_size &lt;= 8) || non_row_reductions &lt;= 8) {</div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    <span class="comment">// Simple loop over non_row_reductions and reduce the row in the thread.</span></div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>    IdxT out_idx = tid.x + tsize.y * IdxT(tid.y);</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>    in += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(out_idx, shape, strides, ndim);</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span> </div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <span class="keywordflow">for</span> (uint r = 0; r &lt; non_row_reductions; r++) {</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>      row = in + loop.location();</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>      <a class="code hl_function" href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1">thread_reduce&lt;T, U, Op, N_READS&gt;</a>(total_val, row, blocks, extra);</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>      loop.next(reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>    }</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span> </div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>    out[out_idx] = total_val;</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>    <span class="comment">// Collaboratively reduce over non_row_reductions in the simdgroup. Each</span></div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>    <span class="comment">// thread reduces every 32nd row and then a simple simd reduce.</span></div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>    IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span>    in += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(out_idx, shape, strides, ndim);</div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span> </div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>    loop.next(simd_lane_id, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span> </div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>    <span class="keywordflow">for</span> (uint r = simd_lane_id; r &lt; non_row_reductions; r += <a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>) {</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>      row = in + loop.location();</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>      <a class="code hl_function" href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1">thread_reduce&lt;T, U, Op, N_READS&gt;</a>(total_val, row, blocks, extra);</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>      loop.next(<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>, reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    }</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span> </div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>    total_val = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.simd_reduce(total_val);</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span> </div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>    <span class="keywordflow">if</span> (simd_lane_id == 0) {</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>      out[out_idx] = total_val;</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>    }</div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>  }</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>}</div>
 </div>
-<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span> </div>
-<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>    <span class="keyword">typename</span> U,</div>
-<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>    <span class="keyword">typename</span> Op,</div>
-<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>    <span class="keywordtype">int</span> N_READS = <a class="code hl_variable" href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a>,</div>
-<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>    <span class="keywordtype">int</span> N_WRITES = <a class="code hl_variable" href="defines_8h.html#a68c33274e15a2f163f7631a36280d82f">REDUCE_N_WRITES</a>&gt;</div>
-<div class="foldopen" id="foldopen00264" data-start="{" data-end="}">
-<div class="line"><a id="l00264" name="l00264"></a><span class="lineno"><a class="line" href="reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b">  264</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b">row_reduce_simple</a>(</div>
-<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
-<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>    device U* out [[buffer(1)]],</div>
-<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
-<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; out_size [[buffer(3)]],</div>
-<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>    uint3 gsize [[threadgroups_per_grid]],</div>
-<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>    uint3 lid [[thread_position_in_threadgroup]],</div>
-<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>    uint3 lsize [[threads_per_threadgroup]],</div>
-<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
-<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>    uint simd_per_group [[simdgroups_per_threadgroup]],</div>
-<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
-<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>  threadgroup U shared_vals[<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> * N_WRITES];</div>
-<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>  U totals[N_WRITES];</div>
-<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span> </div>
-<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>  <span class="comment">// Move to the row</span></div>
-<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>  <span class="keywordtype">size_t</span> out_idx = N_WRITES * (gid.y + gsize.y * size_t(gid.z));</div>
-<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>  <span class="keywordflow">if</span> (out_idx + N_WRITES &gt; out_size) {</div>
-<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>    out_idx = out_size - N_WRITES;</div>
-<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>  }</div>
-<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>  in += out_idx * reduction_size;</div>
-<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>  out += out_idx;</div>
-<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span> </div>
-<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>  <span class="comment">// Each thread reduces across the row</span></div>
-<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>  <span class="keywordtype">int</span> blocks = reduction_size / (lsize.x * N_READS);</div>
-<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>  <span class="keywordtype">int</span> extra = reduction_size - blocks * (lsize.x * N_READS);</div>
-<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>  <a class="code hl_function" href="reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f">per_thread_row_reduce&lt;T, U, Op, N_READS, N_WRITES&gt;</a>(</div>
-<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>      totals, in, reduction_size, blocks, extra, lsize.x, lid.x);</div>
-<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span> </div>
-<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>  <span class="comment">// Reduce across the threadgroup</span></div>
-<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span>  <a class="code hl_function" href="reduce__row_8h.html#aa146bb611069fd2892f03714fd1cc3cf">threadgroup_reduce&lt;T, U, Op, N_READS, N_WRITES&gt;</a>(</div>
-<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>      totals, shared_vals, lid, simd_lane_id, simd_per_group, simd_group_id);</div>
-<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span> </div>
-<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>  <span class="comment">// Write the output</span></div>
-<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>  <span class="keywordflow">if</span> (lid.x == 0) {</div>
-<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N_WRITES; i++) {</div>
-<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>      out[i] = totals[i];</div>
-<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>    }</div>
-<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>  }</div>
-<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>}</div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span> </div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>    <span class="keyword">typename</span> IdxT = size_t,</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>    <span class="keywordtype">int</span> N_READS = <a class="code hl_variable" href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a>,</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>    <span class="keywordtype">int</span> N_WRITES = <a class="code hl_variable" href="defines_8h.html#a68c33274e15a2f163f7631a36280d82f">REDUCE_N_WRITES</a>&gt;</div>
+<div class="foldopen" id="foldopen00266" data-start="{" data-end="}">
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno"><a class="line" href="reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf">  266</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf">row_reduce_simple</a>(</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>    device U* out [[buffer(1)]],</div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; reduction_size [[buffer(2)]],</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; out_size [[buffer(3)]],</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>    uint3 gsize [[threadgroups_per_grid]],</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>    uint3 lid [[thread_position_in_threadgroup]],</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>    uint3 lsize [[threads_per_threadgroup]],</div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>    uint simd_per_group [[simdgroups_per_threadgroup]],</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>  threadgroup U shared_vals[<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a> * N_WRITES];</div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>  U totals[N_WRITES];</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span> </div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>  <span class="comment">// Move to the row</span></div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>  IdxT out_idx = N_WRITES * (gid.y + gsize.y * IdxT(gid.z));</div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>  <span class="keywordflow">if</span> (out_idx + N_WRITES &gt; out_size) {</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>    out_idx = out_size - N_WRITES;</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span>  }</div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>  in += out_idx * IdxT(reduction_size);</div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>  out += out_idx;</div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span> </div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>  <span class="comment">// Each thread reduces across the row</span></div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>  <span class="keywordtype">int</span> blocks = IdxT(reduction_size) / (lsize.x * N_READS);</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>  <span class="keywordtype">int</span> extra = reduction_size - blocks * (lsize.x * N_READS);</div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>  <a class="code hl_function" href="reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f">per_thread_row_reduce&lt;T, U, Op, N_READS, N_WRITES&gt;</a>(</div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>      totals, in, reduction_size, blocks, extra, lsize.x, lid.x);</div>
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span> </div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>  <span class="comment">// Reduce across the threadgroup</span></div>
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>  <a class="code hl_function" href="reduce__row_8h.html#aa146bb611069fd2892f03714fd1cc3cf">threadgroup_reduce&lt;T, U, Op, N_READS, N_WRITES&gt;</a>(</div>
+<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span>      totals, shared_vals, lid, simd_lane_id, simd_per_group, simd_group_id);</div>
+<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span> </div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>  <span class="comment">// Write the output</span></div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>  <span class="keywordflow">if</span> (lid.x == 0) {</div>
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; N_WRITES; i++) {</div>
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>      out[i] = totals[i];</div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span>    }</div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>  }</div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>}</div>
 </div>
-<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span> </div>
-<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span><span class="keyword">template</span> &lt;</div>
-<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span>    <span class="keyword">typename</span> T,</div>
-<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span>    <span class="keyword">typename</span> U,</div>
-<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>    <span class="keyword">typename</span> Op,</div>
-<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>    <span class="keywordtype">int</span> NDIMS,</div>
-<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>    <span class="keywordtype">int</span> N_READS = <a class="code hl_variable" href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a>&gt;</div>
-<div class="foldopen" id="foldopen00311" data-start="{" data-end="}">
-<div class="line"><a id="l00311" name="l00311"></a><span class="lineno"><a class="line" href="reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae">  311</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae">row_reduce_looped</a>(</div>
-<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
-<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>    device U* out [[buffer(1)]],</div>
-<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; row_size [[buffer(2)]],</div>
-<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_row_reductions [[buffer(3)]],</div>
-<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
-<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
-<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
-<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
-<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
-<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
-<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
-<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>    uint3 gsize [[threadgroups_per_grid]],</div>
-<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>    uint3 lid [[thread_position_in_threadgroup]],</div>
-<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>    uint3 lsize [[threads_per_threadgroup]],</div>
-<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
-<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span>    uint simd_per_group [[simdgroups_per_threadgroup]],</div>
-<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
-<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>  threadgroup U shared_vals[<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>];</div>
-<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>  U total = Op::init;</div>
-<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span> </div>
-<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>  <span class="keywordtype">size_t</span> out_idx = gid.y + gsize.y * size_t(gid.z);</div>
-<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span> </div>
-<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>  <span class="comment">// lid.x * N_READS breaks the per_thread_row_reduce interface a bit. Maybe it</span></div>
-<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>  <span class="comment">// needs a small refactor.</span></div>
-<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span>  in += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(out_idx, shape, strides, ndim) + lid.x * N_READS;</div>
-<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span> </div>
-<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>  <a class="code hl_struct" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt;NDIMS&gt;</a> loop;</div>
-<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span>  <span class="keyword">const</span> device T* row;</div>
-<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span>  <span class="keywordtype">int</span> blocks = row_size / (lsize.x * N_READS);</div>
-<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>  <span class="keywordtype">int</span> extra = row_size - blocks * (lsize.x * N_READS);</div>
-<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span> </div>
-<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>  <span class="keywordflow">for</span> (<span class="keywordtype">size_t</span> i = 0; i &lt; non_row_reductions; i++) {</div>
-<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>    row = in + loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(i, reduce_shape, reduce_strides, reduce_ndim);</div>
-<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span> </div>
-<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span>    <span class="comment">// Each thread reduces across the row</span></div>
-<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>    U row_total;</div>
-<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>    <a class="code hl_function" href="reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f">per_thread_row_reduce&lt;T, U, Op, N_READS, 1&gt;</a>(</div>
-<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span>        &amp;row_total, &amp;row, blocks, extra, lsize.x, lid.x);</div>
-<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span> </div>
-<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>    <span class="comment">// Aggregate across rows</span></div>
-<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>    total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(total, row_total);</div>
-<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span> </div>
-<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>    loop.<a class="code hl_function" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(reduce_shape, reduce_strides);</div>
-<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>  }</div>
-<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span> </div>
-<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>  <span class="comment">// Reduce across the threadgroup</span></div>
-<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>  <a class="code hl_function" href="reduce__row_8h.html#aa146bb611069fd2892f03714fd1cc3cf">threadgroup_reduce&lt;T, U, Op, N_READS, 1&gt;</a>(</div>
-<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>      &amp;total, shared_vals, lid, simd_lane_id, simd_per_group, simd_group_id);</div>
+<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span> </div>
+<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>    <span class="keyword">typename</span> U,</div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>    <span class="keyword">typename</span> Op,</div>
+<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>    <span class="keyword">typename</span> IdxT,</div>
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>    <span class="keywordtype">int</span> NDIMS,</div>
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span>    <span class="keywordtype">int</span> N_READS = <a class="code hl_variable" href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a>&gt;</div>
+<div class="foldopen" id="foldopen00314" data-start="{" data-end="}">
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno"><a class="line" href="reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49">  314</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49">row_reduce_looped</a>(</div>
+<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>    <span class="keyword">const</span> device T* in [[buffer(0)]],</div>
+<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span>    device U* out [[buffer(1)]],</div>
+<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; row_size [[buffer(2)]],</div>
+<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; non_row_reductions [[buffer(3)]],</div>
+<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* shape [[buffer(4)]],</div>
+<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* strides [[buffer(5)]],</div>
+<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; ndim [[buffer(6)]],</div>
+<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* reduce_shape [[buffer(7)]],</div>
+<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* reduce_strides [[buffer(8)]],</div>
+<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; reduce_ndim [[buffer(9)]],</div>
+<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span>    uint3 gid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span>    uint3 gsize [[threadgroups_per_grid]],</div>
+<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span>    uint3 lid [[thread_position_in_threadgroup]],</div>
+<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>    uint3 lsize [[threads_per_threadgroup]],</div>
+<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span>    uint simd_per_group [[simdgroups_per_threadgroup]],</div>
+<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {</div>
+<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
+<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>  threadgroup U shared_vals[<a class="code hl_variable" href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a>];</div>
+<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span>  U total = Op::init;</div>
+<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span> </div>
+<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>  IdxT out_idx = gid.y + gsize.y * IdxT(gid.z);</div>
+<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span> </div>
+<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span>  <span class="comment">// lid.x * N_READS breaks the per_thread_row_reduce interface a bit. Maybe it</span></div>
+<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>  <span class="comment">// needs a small refactor.</span></div>
+<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span>  in += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, IdxT&gt;</a>(out_idx, shape, strides, ndim) +</div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span>      lid.x * N_READS;</div>
+<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span> </div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>  <a class="code hl_struct" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;NDIMS, IdxT, (NDIMS &gt; 2)&gt; loop(reduce_ndim);</div>
+<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span>  <span class="keyword">const</span> device T* row;</div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>  <span class="keywordtype">int</span> blocks = IdxT(row_size) / (lsize.x * N_READS);</div>
+<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>  <span class="keywordtype">int</span> extra = row_size - blocks * (lsize.x * N_READS);</div>
+<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span> </div>
+<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>  <span class="keywordflow">for</span> (IdxT i = 0; i &lt; non_row_reductions; i++) {</div>
+<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>    row = in + loop.location();</div>
+<div class="line"><a id="l00350" name="l00350"></a><span class="lineno">  350</span> </div>
+<div class="line"><a id="l00351" name="l00351"></a><span class="lineno">  351</span>    <span class="comment">// Each thread reduces across the row</span></div>
+<div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>    U row_total;</div>
+<div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>    <a class="code hl_function" href="reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f">per_thread_row_reduce&lt;T, U, Op, N_READS, 1&gt;</a>(</div>
+<div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span>        &amp;row_total, &amp;row, blocks, extra, lsize.x, lid.x);</div>
+<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span> </div>
+<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>    <span class="comment">// Aggregate across rows</span></div>
+<div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span>    total = <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>(total, row_total);</div>
+<div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span> </div>
+<div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span>    loop.next(reduce_shape, reduce_strides);</div>
+<div class="line"><a id="l00360" name="l00360"></a><span class="lineno">  360</span>  }</div>
 <div class="line"><a id="l00361" name="l00361"></a><span class="lineno">  361</span> </div>
-<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>  <span class="comment">// Write the output</span></div>
-<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>  <span class="keywordflow">if</span> (lid.x == 0) {</div>
-<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>    out[out_idx] = total;</div>
-<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span>  }</div>
-<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>}</div>
+<div class="line"><a id="l00362" name="l00362"></a><span class="lineno">  362</span>  <span class="comment">// Reduce across the threadgroup</span></div>
+<div class="line"><a id="l00363" name="l00363"></a><span class="lineno">  363</span>  <a class="code hl_function" href="reduce__row_8h.html#aa146bb611069fd2892f03714fd1cc3cf">threadgroup_reduce&lt;T, U, Op, N_READS, 1&gt;</a>(</div>
+<div class="line"><a id="l00364" name="l00364"></a><span class="lineno">  364</span>      &amp;total, shared_vals, lid, simd_lane_id, simd_per_group, simd_group_id);</div>
+<div class="line"><a id="l00365" name="l00365"></a><span class="lineno">  365</span> </div>
+<div class="line"><a id="l00366" name="l00366"></a><span class="lineno">  366</span>  <span class="comment">// Write the output</span></div>
+<div class="line"><a id="l00367" name="l00367"></a><span class="lineno">  367</span>  <span class="keywordflow">if</span> (lid.x == 0) {</div>
+<div class="line"><a id="l00368" name="l00368"></a><span class="lineno">  368</span>    out[out_idx] = total;</div>
+<div class="line"><a id="l00369" name="l00369"></a><span class="lineno">  369</span>  }</div>
+<div class="line"><a id="l00370" name="l00370"></a><span class="lineno">  370</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3"><div class="ttname"><a href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a></div><div class="ttdeci">static constant constexpr const uint8_t simd_size</div><div class="ttdef"><b>Definition</b> ops.h:22</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
 <div class="ttc" id="adefines_8h_html_a2ad505864a2ab786147766900bc18c21"><div class="ttname"><a href="defines_8h.html#a2ad505864a2ab786147766900bc18c21">REDUCE_N_READS</a></div><div class="ttdeci">static constexpr int REDUCE_N_READS</div><div class="ttdef"><b>Definition</b> defines.h:12</div></div>
 <div class="ttc" id="adefines_8h_html_a68c33274e15a2f163f7631a36280d82f"><div class="ttname"><a href="defines_8h.html#a68c33274e15a2f163f7631a36280d82f">REDUCE_N_WRITES</a></div><div class="ttdeci">static constexpr int REDUCE_N_WRITES</div><div class="ttdef"><b>Definition</b> defines.h:13</div></div>
-<div class="ttc" id="areduce__row_8h_html_a27e75312086e31f6bd1bbf4b366679da"><div class="ttname"><a href="reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da">row_reduce_small</a></div><div class="ttdeci">void row_reduce_small(const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint simd_lane_id, uint3 gid, uint3 gsize, uint3 tid, uint3 tsize)</div><div class="ttdef"><b>Definition</b> reduce_row.h:198</div></div>
 <div class="ttc" id="areduce__row_8h_html_a9d5e0049a2276f43702fc6907e74a35f"><div class="ttname"><a href="reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f">per_thread_row_reduce</a></div><div class="ttdeci">METAL_FUNC void per_thread_row_reduce(thread U totals[N_WRITES], const device T *inputs[N_WRITES], int blocks, int extra, uint lsize_x, uint lid_x)</div><div class="ttdoc">The thread group collaboratively reduces across the rows with bounds checking.</div><div class="ttdef"><b>Definition</b> reduce_row.h:19</div></div>
 <div class="ttc" id="areduce__row_8h_html_aa146bb611069fd2892f03714fd1cc3cf"><div class="ttname"><a href="reduce__row_8h.html#aa146bb611069fd2892f03714fd1cc3cf">threadgroup_reduce</a></div><div class="ttdeci">METAL_FUNC void threadgroup_reduce(thread U totals[N_WRITES], threadgroup U *shared_vals, uint3 lid, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</div><div class="ttdoc">Reduce within the threadgroup.</div><div class="ttdef"><b>Definition</b> reduce_row.h:129</div></div>
-<div class="ttc" id="areduce__row_8h_html_ac01d30987668930c8b38900e47b8308b"><div class="ttname"><a href="reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b">row_reduce_simple</a></div><div class="ttdeci">void row_reduce_simple(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_row.h:264</div></div>
-<div class="ttc" id="areduce__row_8h_html_ad98332d74a6824aa7499df3e2f2246ae"><div class="ttname"><a href="reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae">row_reduce_looped</a></div><div class="ttdeci">void row_reduce_looped(const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_row.h:311</div></div>
+<div class="ttc" id="areduce__row_8h_html_aeb49e89f1163cb3093770bb710df9f5e"><div class="ttname"><a href="reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e">row_reduce_small</a></div><div class="ttdeci">void row_reduce_small(const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint simd_lane_id, uint3 gid, uint3 gsize, uint3 tid, uint3 tsize)</div><div class="ttdef"><b>Definition</b> reduce_row.h:199</div></div>
+<div class="ttc" id="areduce__row_8h_html_aef628dfccdb1361da5546f8b17c510bf"><div class="ttname"><a href="reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf">row_reduce_simple</a></div><div class="ttdeci">void row_reduce_simple(const device T *in, device U *out, const constant size_t &amp;reduction_size, const constant size_t &amp;out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_row.h:266</div></div>
+<div class="ttc" id="areduce__row_8h_html_afba85f5a1c935c124ef52e986d4b2c49"><div class="ttname"><a href="reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49">row_reduce_looped</a></div><div class="ttdeci">void row_reduce_looped(const device T *in, device U *out, const constant size_t &amp;row_size, const constant size_t &amp;non_row_reductions, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &amp;reduce_ndim, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize, uint simd_lane_id, uint simd_per_group, uint simd_group_id)</div><div class="ttdef"><b>Definition</b> reduce_row.h:314</div></div>
 <div class="ttc" id="areduce__row_8h_html_afd80a25fa84e6cc884dcc8698859ade1"><div class="ttname"><a href="reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1">thread_reduce</a></div><div class="ttdeci">METAL_FUNC void thread_reduce(thread U &amp;total, const device T *row, int blocks, int extra)</div><div class="ttdef"><b>Definition</b> reduce_row.h:166</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html"><div class="ttname"><a href="structlooped__elem__to__loc.html">looped_elem_to_loc</a></div><div class="ttdef"><b>Definition</b> utils.h:197</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_a05558dabba889ee0d80ed4b567d901ca"><div class="ttname"><a href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">looped_elem_to_loc::next</a></div><div class="ttdeci">void next(const constant int *shape, const constant size_t *strides)</div><div class="ttdef"><b>Definition</b> utils.h:202</div></div>
-<div class="ttc" id="astructlooped__elem__to__loc_html_accc6d4957a8aeb38f5062754793b74d2"><div class="ttname"><a href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">looped_elem_to_loc::location</a></div><div class="ttdeci">offset_t location(offset_t, const constant int *, const constant size_t *, int)</div><div class="ttdef"><b>Definition</b> utils.h:229</div></div>
+<div class="ttc" id="astruct_looped_elem_to_loc_html"><div class="ttname"><a href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a></div><div class="ttdef"><b>Definition</b> utils.h:208</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/scaled__dot__product__attention__params_8h_source.html b/docs/build/html/scaled__dot__product__attention__params_8h_source.html
deleted file mode 100644
index 2f67adead..000000000
--- a/docs/build/html/scaled__dot__product__attention__params_8h_source.html
+++ /dev/null
@@ -1,174 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=11"/>
-<meta name="generator" content="Doxygen 1.12.0"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: mlx/backend/metal/kernels/scaled_dot_product_attention_params.h Source File</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<script type="text/javascript" src="clipboard.js"></script>
-<link href="navtree.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="resize.js"></script>
-<script type="text/javascript" src="cookie.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr id="projectrow">
-  <td id="projectalign">
-   <div id="projectname">MLX
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.12.0 -->
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-var searchBox = new SearchBox("searchBox", "search/",'.html');
-/* @license-end */
-</script>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function() { codefold.init(0); });
-/* @license-end */
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function() {
-  initMenu('',true,false,'search.php','Search',false);
-  $(function() { init_search(); });
-});
-/* @license-end */
-</script>
-<div id="main-nav"></div>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function(){ initResizable(false); });
-/* @license-end */
-</script>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<div id="MSearchResults">
-<div class="SRPage">
-<div id="SRIndex">
-<div id="SRResults"></div>
-<div class="SRStatus" id="Loading">Loading...</div>
-<div class="SRStatus" id="Searching">Searching...</div>
-<div class="SRStatus" id="NoMatches">No Matches</div>
-</div>
-</div>
-</div>
-</div>
-
-<div id="nav-path" class="navpath">
-  <ul>
-<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li>  </ul>
-</div>
-</div><!-- top -->
-<div id="doc-content">
-<div class="header">
-  <div class="headertitle"><div class="title">scaled_dot_product_attention_params.h</div></div>
-</div><!--header-->
-<div class="contents">
-<a href="scaled__dot__product__attention__params_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">//</span></div>
-<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span><span class="comment">//  scaled_dot_product_attention_params.h</span></div>
-<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="comment">//  mlx</span></div>
-<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
-<div class="line"><a id="l00005" name="l00005"></a><span class="lineno">    5</span><span class="preprocessor">#pragma once</span></div>
-<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span> </div>
-<div class="foldopen" id="foldopen00007" data-start="{" data-end="};">
-<div class="line"><a id="l00007" name="l00007"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html">    7</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a> {</div>
-<div class="line"><a id="l00008" name="l00008"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8">    8</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8">M</a>;</div>
-<div class="line"><a id="l00009" name="l00009"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167">    9</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167">N</a>;</div>
-<div class="line"><a id="l00010" name="l00010"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23">   10</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23">K</a>;</div>
-<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span> </div>
-<div class="line"><a id="l00012" name="l00012"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58">   12</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58">ldq</a>; <span class="comment">// ldq == ldo</span></div>
-<div class="line"><a id="l00013" name="l00013"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">   13</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">ldk</a>;</div>
-<div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b">   14</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b">ldv</a>;</div>
-<div class="line"><a id="l00015" name="l00015"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a">   15</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a">lds</a>;</div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c">   16</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c">ldo</a>;</div>
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span> </div>
-<div class="line"><a id="l00018" name="l00018"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029">   18</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029">tiles_n</a>;</div>
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad">   19</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad">tiles_m</a>;</div>
-<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span> </div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1">   21</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1">batch_stride_q</a>;</div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b">   22</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b">batch_stride_k</a>;</div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21">   23</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21">batch_stride_v</a>;</div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7">   24</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7">batch_stride_o</a>;</div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span> </div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2">   26</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2">swizzle_log</a>;</div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803">   27</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803">gemm_n_iterations_aligned</a>;</div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2">   28</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2">gemm_k_iterations_aligned</a>;</div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c">   29</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c">gemm_sv_m_block_iterations</a>;</div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span> </div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3">   31</a></span>  <span class="keyword">const</span> <span class="keywordtype">int</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3">batch_ndim</a>;</div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno"><a class="line" href="struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477">   32</a></span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477">alpha</a>;</div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>};</div>
-</div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span> </div>
-<div class="foldopen" id="foldopen00035" data-start="{" data-end="};">
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno"><a class="line" href="struct_m_l_x_scaled_dot_product_attention_params.html">   35</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a> {</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="comment">// Associated dimensions &amp; transposition information</span></div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno"><a class="line" href="struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c">   37</a></span>  <span class="keyword">const</span> uint <a class="code hl_variable" href="struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c">QUERY_SEQUENCE_LENGTH</a> = 1;</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno"><a class="line" href="struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177">   38</a></span>  <span class="keyword">const</span> uint <a class="code hl_variable" href="struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177">N_Q_HEADS</a> = 32;</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno"><a class="line" href="struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7">   39</a></span>  <span class="keyword">const</span> uint <a class="code hl_variable" href="struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7">N_KV_HEADS</a> = 32;</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno"><a class="line" href="struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0">   40</a></span>  <span class="keyword">const</span> uint <a class="code hl_variable" href="struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0">KV_TILES</a> = 1;</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno"><a class="line" href="struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644">   41</a></span>  <span class="keyword">const</span> <span class="keywordtype">float</span> <a class="code hl_variable" href="struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644">INV_ALPHA</a> = 0.08838834764831843f;</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>};</div>
-</div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:7</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a0df159c839fc27b9426b8ac4336cc0ad"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad">MLXFastAttentionParams::tiles_m</a></div><div class="ttdeci">const int tiles_m</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:19</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a1180e311b95cd4b6d4a336d21b873c21"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21">MLXFastAttentionParams::batch_stride_v</a></div><div class="ttdeci">const int batch_stride_v</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:23</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a162826d3f288f64c0aea88a36b34859b"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b">MLXFastAttentionParams::batch_stride_k</a></div><div class="ttdeci">const int batch_stride_k</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:22</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a1f8c89bd55d89ad7b9fe27c60e3cb8d5"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">MLXFastAttentionParams::ldk</a></div><div class="ttdeci">const int ldk</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:13</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a274eeb8591c02511014dce50c4240c8a"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a">MLXFastAttentionParams::lds</a></div><div class="ttdeci">const int lds</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:15</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a2799a2f219441fef7f351374f4cbc67c"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c">MLXFastAttentionParams::gemm_sv_m_block_iterations</a></div><div class="ttdeci">const int gemm_sv_m_block_iterations</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:29</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a3c5b1170999087f3f3a03830193b55c7"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7">MLXFastAttentionParams::batch_stride_o</a></div><div class="ttdeci">const int batch_stride_o</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:24</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a5cd3ede5f41d5fdf8177cab3f059f4d8"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8">MLXFastAttentionParams::M</a></div><div class="ttdeci">const int M</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:8</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a608aa256216ac6d80af00209303d2029"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029">MLXFastAttentionParams::tiles_n</a></div><div class="ttdeci">const int tiles_n</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:18</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a68a338d522ffeb6761b7b168869361e2"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2">MLXFastAttentionParams::swizzle_log</a></div><div class="ttdeci">const int swizzle_log</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:26</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a6f3d94dbe44b32e675558768710bf0a3"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3">MLXFastAttentionParams::batch_ndim</a></div><div class="ttdeci">const int batch_ndim</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:31</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a932266d04fa7d6e27d4a4a2c175f1477"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477">MLXFastAttentionParams::alpha</a></div><div class="ttdeci">const float alpha</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:32</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a98766fc89f75d5eef65b345f16a782d1"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1">MLXFastAttentionParams::batch_stride_q</a></div><div class="ttdeci">const int batch_stride_q</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:21</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_a9e73dc1971b5ab913bd85a7afa7cf46c"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c">MLXFastAttentionParams::ldo</a></div><div class="ttdeci">const int ldo</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:16</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_ab42c792a80388002e34992cbd837a167"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167">MLXFastAttentionParams::N</a></div><div class="ttdeci">const int N</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:9</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_ab56b3db8fc6a938ce9c739ee78a7b803"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803">MLXFastAttentionParams::gemm_n_iterations_aligned</a></div><div class="ttdeci">const int gemm_n_iterations_aligned</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:27</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_ada454f5ad22ec36a22d0ff596751af23"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23">MLXFastAttentionParams::K</a></div><div class="ttdeci">const int K</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:10</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_adbc0a13076da5f704498e57239cb2bf2"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2">MLXFastAttentionParams::gemm_k_iterations_aligned</a></div><div class="ttdeci">const int gemm_k_iterations_aligned</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:28</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_aebada0bf0789e8706dce564752208e8b"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b">MLXFastAttentionParams::ldv</a></div><div class="ttdeci">const int ldv</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:14</div></div>
-<div class="ttc" id="astruct_m_l_x_fast_attention_params_html_af2dadba2a28f5db2ca52472d00937e58"><div class="ttname"><a href="struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58">MLXFastAttentionParams::ldq</a></div><div class="ttdeci">const int ldq</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:12</div></div>
-<div class="ttc" id="astruct_m_l_x_scaled_dot_product_attention_params_html"><div class="ttname"><a href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:35</div></div>
-<div class="ttc" id="astruct_m_l_x_scaled_dot_product_attention_params_html_a1a63d2e7ad712b4ba26219c784c95177"><div class="ttname"><a href="struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177">MLXScaledDotProductAttentionParams::N_Q_HEADS</a></div><div class="ttdeci">const uint N_Q_HEADS</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:38</div></div>
-<div class="ttc" id="astruct_m_l_x_scaled_dot_product_attention_params_html_a46cc2da6a069d822f36983ee18467e5c"><div class="ttname"><a href="struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c">MLXScaledDotProductAttentionParams::QUERY_SEQUENCE_LENGTH</a></div><div class="ttdeci">const uint QUERY_SEQUENCE_LENGTH</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:37</div></div>
-<div class="ttc" id="astruct_m_l_x_scaled_dot_product_attention_params_html_a58ef2765fd681e6b35b2ba72030610e0"><div class="ttname"><a href="struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0">MLXScaledDotProductAttentionParams::KV_TILES</a></div><div class="ttdeci">const uint KV_TILES</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:40</div></div>
-<div class="ttc" id="astruct_m_l_x_scaled_dot_product_attention_params_html_a68a292b9986c20560aca88394f82e9f7"><div class="ttname"><a href="struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7">MLXScaledDotProductAttentionParams::N_KV_HEADS</a></div><div class="ttdeci">const uint N_KV_HEADS</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:39</div></div>
-<div class="ttc" id="astruct_m_l_x_scaled_dot_product_attention_params_html_a7461e0e17cdc7d3fed80bb00d58d8644"><div class="ttname"><a href="struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644">MLXScaledDotProductAttentionParams::INV_ALPHA</a></div><div class="ttdeci">const float INV_ALPHA</div><div class="ttdef"><b>Definition</b> scaled_dot_product_attention_params.h:41</div></div>
-</div><!-- fragment --></div><!-- contents -->
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
-</small></address>
-</div><!-- doc-content -->
-</body>
-</html>
diff --git a/docs/build/html/scan_8h.html b/docs/build/html/scan_8h.html
index d4dce074e..17c2b3459 100644
--- a/docs/build/html/scan_8h.html
+++ b/docs/build/html/scan_8h.html
@@ -164,7 +164,7 @@ Functions</h2></td></tr>
 <div class="line">    val = simd_scan(val);                                                \</div>
 <div class="line">    <span class="keywordflow">return</span> <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">simd_shuffle_and_fill_up</a>(val, init, 1);                       \</div>
 <div class="line">  }</div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a5862d5ea154c9b76cf56a630cf6385b4"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">simd_shuffle_and_fill_up</a></div><div class="ttdeci">uint64_t simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta)</div><div class="ttdef"><b>Definition</b> utils.h:342</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a5862d5ea154c9b76cf56a630cf6385b4"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">simd_shuffle_and_fill_up</a></div><div class="ttdeci">uint64_t simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta)</div><div class="ttdef"><b>Definition</b> utils.h:383</div></div>
 </div><!-- fragment -->
 </div>
 </div>
diff --git a/docs/build/html/scan_8h_source.html b/docs/build/html/scan_8h_source.html
index ed748db26..0aa061ea5 100644
--- a/docs/build/html/scan_8h_source.html
+++ b/docs/build/html/scan_8h_source.html
@@ -623,9 +623,9 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00487" name="l00487"></a><span class="lineno">  487</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3"><div class="ttname"><a href="backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3">simd_size</a></div><div class="ttdeci">static constant constexpr const uint8_t simd_size</div><div class="ttdef"><b>Definition</b> ops.h:22</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a5862d5ea154c9b76cf56a630cf6385b4"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">simd_shuffle_and_fill_up</a></div><div class="ttdeci">uint64_t simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta)</div><div class="ttdef"><b>Definition</b> utils.h:342</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a71986ecdd7d18f975dd22c3df7421ce2"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2">simd_shuffle</a></div><div class="ttdeci">uint64_t simd_shuffle(uint64_t data, uint16_t lane)</div><div class="ttdef"><b>Definition</b> utils.h:367</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8e5a4b0fb5d018d7b078d147efe4f1e3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a></div><div class="ttdeci">T ceildiv(T N, U M)</div><div class="ttdoc">Compute ceil((float)N/(float)M)</div><div class="ttdef"><b>Definition</b> utils.h:272</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a5862d5ea154c9b76cf56a630cf6385b4"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4">simd_shuffle_and_fill_up</a></div><div class="ttdeci">uint64_t simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta)</div><div class="ttdef"><b>Definition</b> utils.h:383</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a71986ecdd7d18f975dd22c3df7421ce2"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2">simd_shuffle</a></div><div class="ttdeci">uint64_t simd_shuffle(uint64_t data, uint16_t lane)</div><div class="ttdef"><b>Definition</b> utils.h:408</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8e5a4b0fb5d018d7b078d147efe4f1e3"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8e5a4b0fb5d018d7b078d147efe4f1e3">ceildiv</a></div><div class="ttdeci">T ceildiv(T N, U M)</div><div class="ttdoc">Compute ceil((float)N/(float)M)</div><div class="ttdef"><b>Definition</b> utils.h:313</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
 <div class="ttc" id="ascan_8h_html_a0d8d6a9b0f3a1263629380bda8eca7bc"><div class="ttname"><a href="scan_8h.html#a0d8d6a9b0f3a1263629380bda8eca7bc">DEFINE_SIMD_SCAN</a></div><div class="ttdeci">#define DEFINE_SIMD_SCAN()</div><div class="ttdef"><b>Definition</b> scan.h:5</div></div>
 <div class="ttc" id="ascan_8h_html_a185f66aac8c5317587e6abd43f3013fc"><div class="ttname"><a href="scan_8h.html#a185f66aac8c5317587e6abd43f3013fc">DEFINE_SIMD_EXCLUSIVE_SCAN</a></div><div class="ttdeci">#define DEFINE_SIMD_EXCLUSIVE_SCAN()</div><div class="ttdef"><b>Definition</b> scan.h:19</div></div>
@@ -650,7 +650,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="astruct_cum_prod_3_01bool_01_4_html_ad634be0b139d10ce6d21332eef0d936b"><div class="ttname"><a href="struct_cum_prod_3_01bool_01_4.html#ad634be0b139d10ce6d21332eef0d936b">CumProd&lt; bool &gt;::operator()</a></div><div class="ttdeci">bool operator()(bool a, T b)</div><div class="ttdef"><b>Definition</b> scan.h:78</div></div>
 <div class="ttc" id="astruct_cum_prod_html"><div class="ttname"><a href="struct_cum_prod.html">CumProd</a></div><div class="ttdef"><b>Definition</b> scan.h:53</div></div>
 <div class="ttc" id="astruct_cum_sum_html"><div class="ttname"><a href="struct_cum_sum.html">CumSum</a></div><div class="ttdef"><b>Definition</b> scan.h:32</div></div>
-<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:17</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/scatter_8h.html b/docs/build/html/scatter_8h.html
index 86d8336bd..77f5bfc46 100644
--- a/docs/build/html/scatter_8h.html
+++ b/docs/build/html/scatter_8h.html
@@ -99,18 +99,18 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
 Functions</h2></td></tr>
-<tr class="memitem:ad1ce39d0b6d733a95e739121fcc61bd1" id="r_ad1ce39d0b6d733a95e739121fcc61bd1"><td class="memTemplParams" colspan="2">template&lt;typename T , typename IdxT , typename Op , int NIDX, bool UPD_ROW_CONTIG, int NWORK&gt; </td></tr>
-<tr class="memitem:ad1ce39d0b6d733a95e739121fcc61bd1"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad1ce39d0b6d733a95e739121fcc61bd1">scatter_impl</a> (const device T *updates, device <a class="el" href="structmlx__atomic.html">mlx_atomic</a>&lt; T &gt; *out, const constant int *upd_shape, const constant size_t *upd_strides, const constant size_t &amp;upd_ndim, const constant size_t &amp;upd_size, const constant int *out_shape, const constant size_t *out_strides, const constant size_t &amp;out_ndim, const constant int *axes, const constant size_t &amp;idx_size, const thread <a class="el" href="struct_indices.html">Indices</a>&lt; IdxT, NIDX &gt; &amp;indices, uint2 gid)</td></tr>
-<tr class="separator:ad1ce39d0b6d733a95e739121fcc61bd1"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0df7206d4519defb48a6275afc12f87c" id="r_a0df7206d4519defb48a6275afc12f87c"><td class="memTemplParams" colspan="2">template&lt;typename T , typename IdxT , typename Op , int NIDX, bool UPD_ROW_CONTIG, int NWORK, typename LocT &gt; </td></tr>
+<tr class="memitem:a0df7206d4519defb48a6275afc12f87c"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a0df7206d4519defb48a6275afc12f87c">scatter_impl</a> (const device T *updates, device <a class="el" href="structmlx__atomic.html">mlx_atomic</a>&lt; T &gt; *out, const constant int *upd_shape, const constant size_t *upd_strides, const constant size_t &amp;upd_ndim, const constant size_t &amp;upd_size, const constant int *out_shape, const constant size_t *out_strides, const constant size_t &amp;out_ndim, const constant int *axes, const constant size_t &amp;idx_size, const thread <a class="el" href="struct_indices.html">Indices</a>&lt; IdxT, NIDX &gt; &amp;indices, uint2 gid)</td></tr>
+<tr class="separator:a0df7206d4519defb48a6275afc12f87c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
-<a id="ad1ce39d0b6d733a95e739121fcc61bd1" name="ad1ce39d0b6d733a95e739121fcc61bd1"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ad1ce39d0b6d733a95e739121fcc61bd1">&#9670;&#160;</a></span>scatter_impl()</h2>
+<a id="a0df7206d4519defb48a6275afc12f87c" name="a0df7206d4519defb48a6275afc12f87c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a0df7206d4519defb48a6275afc12f87c">&#9670;&#160;</a></span>scatter_impl()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename T , typename IdxT , typename Op , int NIDX, bool UPD_ROW_CONTIG, int NWORK&gt; </div>
+template&lt;typename T , typename IdxT , typename Op , int NIDX, bool UPD_ROW_CONTIG, int NWORK, typename LocT &gt; </div>
       <table class="memname">
         <tr>
           <td class="memname">METAL_FUNC void scatter_impl </td>
diff --git a/docs/build/html/scatter_8h_source.html b/docs/build/html/scatter_8h_source.html
index 2c89521d7..0a9e0f291 100644
--- a/docs/build/html/scatter_8h_source.html
+++ b/docs/build/html/scatter_8h_source.html
@@ -103,58 +103,61 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00010" name="l00010"></a><span class="lineno">   10</span>    <span class="keyword">typename</span> Op,</div>
 <div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span>    <span class="keywordtype">int</span> NIDX,</div>
 <div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span>    <span class="keywordtype">bool</span> UPD_ROW_CONTIG,</div>
-<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span>    <span class="keywordtype">int</span> NWORK&gt;</div>
-<div class="foldopen" id="foldopen00014" data-start="{" data-end="}">
-<div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1">   14</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1">scatter_impl</a>(</div>
-<div class="line"><a id="l00015" name="l00015"></a><span class="lineno">   15</span>    <span class="keyword">const</span> device T* updates,</div>
-<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>    device <a class="code hl_struct" href="structmlx__atomic.html">mlx_atomic&lt;T&gt;</a>* out,</div>
-<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* upd_shape,</div>
-<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* upd_strides,</div>
-<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; upd_ndim,</div>
-<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; upd_size,</div>
-<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* out_shape,</div>
-<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* out_strides,</div>
-<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; out_ndim,</div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* axes,</div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; idx_size,</div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>    <span class="keyword">const</span> thread <a class="code hl_struct" href="struct_indices.html">Indices&lt;IdxT, NIDX&gt;</a>&amp; indices,</div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>    uint2 gid [[thread_position_in_grid]]) {</div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span> </div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  <span class="keyword">auto</span> ind_idx = gid.y * NWORK;</div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="keywordtype">size_t</span> out_offset = 0;</div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>  <span class="keywordflow">if</span> (upd_size &gt; 1) {</div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>    out_offset =</div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>        <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(gid.x, upd_shape + indices.ndim, out_strides, out_ndim);</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  }</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span> </div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; NWORK &amp;&amp; ind_idx &lt; idx_size; ++j, ind_idx++) {</div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>    <span class="keywordtype">size_t</span> out_idx = out_offset;</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; NIDX; ++i) {</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>      <span class="keyword">auto</span> idx_loc = indices.row_contiguous[i]</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>          ? ind_idx</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>          : <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>                ind_idx,</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>                &amp;indices.shapes[indices.ndim * i],</div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>                &amp;indices.strides[indices.ndim * i],</div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>                indices.ndim);</div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>      <span class="keyword">auto</span> ax = axes[i];</div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>      <span class="keyword">auto</span> idx_val = <a class="code hl_function" href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">offset_neg_idx</a>(indices.buffers[i][idx_loc], out_shape[ax]);</div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>      out_idx += idx_val * out_strides[ax];</div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>    }</div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>    <span class="keyword">auto</span> upd_idx = ind_idx * upd_size + gid.x;</div>
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>    <span class="keywordflow">if</span> <span class="keyword">constexpr</span> (!UPD_ROW_CONTIG) {</div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>      upd_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(upd_idx, upd_shape, upd_strides, upd_ndim);</div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>    }</div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>    <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.atomic_update(out, updates[upd_idx], out_idx);</div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  }</div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>}</div>
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno">   13</span>    <span class="keywordtype">int</span> NWORK,</div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno">   14</span>    <span class="keyword">typename</span> LocT&gt;</div>
+<div class="foldopen" id="foldopen00015" data-start="{" data-end="}">
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno"><a class="line" href="scatter_8h.html#a0df7206d4519defb48a6275afc12f87c">   15</a></span>METAL_FUNC <span class="keywordtype">void</span> <a class="code hl_function" href="scatter_8h.html#a0df7206d4519defb48a6275afc12f87c">scatter_impl</a>(</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span>    <span class="keyword">const</span> device T* updates,</div>
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno">   17</span>    device <a class="code hl_struct" href="structmlx__atomic.html">mlx_atomic&lt;T&gt;</a>* out,</div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* upd_shape,</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* upd_strides,</div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; upd_ndim,</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; upd_size,</div>
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* out_shape,</div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* out_strides,</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; out_ndim,</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* axes,</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; idx_size,</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>    <span class="keyword">const</span> thread <a class="code hl_struct" href="struct_indices.html">Indices&lt;IdxT, NIDX&gt;</a>&amp; indices,</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>    uint2 gid [[thread_position_in_grid]]) {</div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  Op <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>;</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span> </div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  <span class="keyword">auto</span> ind_idx = gid.y * NWORK;</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>  LocT out_offset = 0;</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  <span class="keywordflow">if</span> (upd_size &gt; 1) {</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>    out_offset = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, LocT&gt;</a>(</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>        gid.x, upd_shape + indices.ndim, out_strides, out_ndim);</div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  }</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span> </div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 0; j &lt; NWORK &amp;&amp; ind_idx &lt; idx_size; ++j, ind_idx++) {</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>    LocT out_idx = out_offset;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; NIDX; ++i) {</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>      <span class="keyword">auto</span> idx_loc = indices.row_contiguous[i]</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>          ? ind_idx</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>          : <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, LocT&gt;</a>(</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>                ind_idx,</div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>                &amp;indices.shapes[indices.ndim * i],</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>                &amp;indices.strides[indices.ndim * i],</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>                indices.ndim);</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>      <span class="keyword">auto</span> ax = axes[i];</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>      <span class="keyword">auto</span> idx_val = <a class="code hl_function" href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">offset_neg_idx</a>(indices.buffers[i][idx_loc], out_shape[ax]);</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>      out_idx +=</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>          <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(idx_val) * <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(out_strides[ax]);</div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>    }</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>    <span class="keyword">auto</span> upd_idx = ind_idx * <span class="keyword">static_cast&lt;</span>LocT<span class="keyword">&gt;</span>(upd_size) + gid.x;</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>    <span class="keywordflow">if</span> <span class="keyword">constexpr</span> (!UPD_ROW_CONTIG) {</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>      upd_idx =</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>          <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc&lt;size_t, LocT&gt;</a>(upd_idx, upd_shape, upd_strides, upd_ndim);</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>    }</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>    <a class="code hl_variable" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>.atomic_update(out, updates[upd_idx], out_idx);</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>  }</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
 <div class="ttc" id="akernels_2indexing_8h_html"><div class="ttname"><a href="kernels_2indexing_8h.html">indexing.h</a></div></div>
-<div class="ttc" id="akernels_2indexing_8h_html_ab41167dc537c06fbdb4df100972393df"><div class="ttname"><a href="kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df">offset_neg_idx</a></div><div class="ttdeci">METAL_FUNC size_t offset_neg_idx(IdxT idx, size_t size)</div><div class="ttdef"><b>Definition</b> indexing.h:17</div></div>
-<div class="ttc" id="ascatter_8h_html_ad1ce39d0b6d733a95e739121fcc61bd1"><div class="ttname"><a href="scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1">scatter_impl</a></div><div class="ttdeci">METAL_FUNC void scatter_impl(const device T *updates, device mlx_atomic&lt; T &gt; *out, const constant int *upd_shape, const constant size_t *upd_strides, const constant size_t &amp;upd_ndim, const constant size_t &amp;upd_size, const constant int *out_shape, const constant size_t *out_strides, const constant size_t &amp;out_ndim, const constant int *axes, const constant size_t &amp;idx_size, const thread Indices&lt; IdxT, NIDX &gt; &amp;indices, uint2 gid)</div><div class="ttdef"><b>Definition</b> scatter.h:14</div></div>
+<div class="ttc" id="akernels_2indexing_8h_html_a58a65ea6215999cd4ccb4fe757cc2dc8"><div class="ttname"><a href="kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8">offset_neg_idx</a></div><div class="ttdeci">METAL_FUNC size_t offset_neg_idx(IdxT idx, int size)</div><div class="ttdef"><b>Definition</b> indexing.h:17</div></div>
+<div class="ttc" id="ascatter_8h_html_a0df7206d4519defb48a6275afc12f87c"><div class="ttname"><a href="scatter_8h.html#a0df7206d4519defb48a6275afc12f87c">scatter_impl</a></div><div class="ttdeci">METAL_FUNC void scatter_impl(const device T *updates, device mlx_atomic&lt; T &gt; *out, const constant int *upd_shape, const constant size_t *upd_strides, const constant size_t &amp;upd_ndim, const constant size_t &amp;upd_size, const constant int *out_shape, const constant size_t *out_strides, const constant size_t &amp;out_ndim, const constant int *axes, const constant size_t &amp;idx_size, const thread Indices&lt; IdxT, NIDX &gt; &amp;indices, uint2 gid)</div><div class="ttdef"><b>Definition</b> scatter.h:15</div></div>
 <div class="ttc" id="astruct_indices_html"><div class="ttname"><a href="struct_indices.html">Indices</a></div><div class="ttdef"><b>Definition</b> indexing.h:8</div></div>
 <div class="ttc" id="astructmlx__atomic_html"><div class="ttname"><a href="structmlx__atomic.html">mlx_atomic</a></div><div class="ttdef"><b>Definition</b> atomic.h:25</div></div>
 </div><!-- fragment --></div><!-- contents -->
diff --git a/docs/build/html/sdpa__vector_8h.html b/docs/build/html/sdpa__vector_8h.html
index daee8ad0a..a809f89e3 100644
--- a/docs/build/html/sdpa__vector_8h.html
+++ b/docs/build/html/sdpa__vector_8h.html
@@ -102,6 +102,12 @@ Functions</h2></td></tr>
 <tr class="memitem:a4bf36f16e16c1c62d9b243573568e5ae" id="r_a4bf36f16e16c1c62d9b243573568e5ae"><td class="memTemplParams" colspan="2">template&lt;typename T , int D&gt; </td></tr>
 <tr class="memitem:a4bf36f16e16c1c62d9b243573568e5ae"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a4bf36f16e16c1c62d9b243573568e5ae">sdpa_vector</a> (const device T *queries, const device T *keys, const device T *values, device T *out, const constant int &amp;gqa_factor, const constant int &amp;N, const constant size_t &amp;k_stride, const constant size_t &amp;v_stride, const constant float &amp;scale, uint3 tid, uint simd_gid, uint simd_lid)</td></tr>
 <tr class="separator:a4bf36f16e16c1c62d9b243573568e5ae"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae070ec482c79c5b3bd19dd03ea42ec74" id="r_ae070ec482c79c5b3bd19dd03ea42ec74"><td class="memTemplParams" colspan="2">template&lt;typename T , int D&gt; </td></tr>
+<tr class="memitem:ae070ec482c79c5b3bd19dd03ea42ec74"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ae070ec482c79c5b3bd19dd03ea42ec74">sdpa_vector_2pass_1</a> (const device T *queries, const device T *keys, const device T *values, device float *out, device float *sums, device float *maxs, const constant int &amp;gqa_factor, const constant int &amp;N, const constant size_t &amp;k_stride, const constant size_t &amp;v_stride, const constant float &amp;scale, uint3 tid, uint simd_gid, uint simd_lid)</td></tr>
+<tr class="separator:ae070ec482c79c5b3bd19dd03ea42ec74"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1368cf3618a4e03dbf743b3463205efe" id="r_a1368cf3618a4e03dbf743b3463205efe"><td class="memTemplParams" colspan="2">template&lt;typename T , int D&gt; </td></tr>
+<tr class="memitem:a1368cf3618a4e03dbf743b3463205efe"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1368cf3618a4e03dbf743b3463205efe">sdpa_vector_2pass_2</a> (const device float *partials, const device float *sums, const device float *maxs, device T *out, uint3 tid, uint simd_gid, uint simd_lid)</td></tr>
+<tr class="separator:a1368cf3618a4e03dbf743b3463205efe"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Function Documentation</h2>
 <a id="a4bf36f16e16c1c62d9b243573568e5ae" name="a4bf36f16e16c1c62d9b243573568e5ae"></a>
@@ -175,6 +181,137 @@ template&lt;typename T , int D&gt; </div>
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="ae070ec482c79c5b3bd19dd03ea42ec74" name="ae070ec482c79c5b3bd19dd03ea42ec74"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae070ec482c79c5b3bd19dd03ea42ec74">&#9670;&#160;</a></span>sdpa_vector_2pass_1()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int D&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">void sdpa_vector_2pass_1 </td>
+          <td>(</td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>queries</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>keys</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>values</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">device float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">device float *</td>          <td class="paramname"><span class="paramname"><em>sums</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">device float *</td>          <td class="paramname"><span class="paramname"><em>maxs</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant int &amp;</td>          <td class="paramname"><span class="paramname"><em>gqa_factor</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant int &amp;</td>          <td class="paramname"><span class="paramname"><em>N</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant size_t &amp;</td>          <td class="paramname"><span class="paramname"><em>k_stride</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant size_t &amp;</td>          <td class="paramname"><span class="paramname"><em>v_stride</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant float &amp;</td>          <td class="paramname"><span class="paramname"><em>scale</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>tid</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_gid</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_lid</em></span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a1368cf3618a4e03dbf743b3463205efe" name="a1368cf3618a4e03dbf743b3463205efe"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1368cf3618a4e03dbf743b3463205efe">&#9670;&#160;</a></span>sdpa_vector_2pass_2()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int D&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">void sdpa_vector_2pass_2 </td>
+          <td>(</td>
+          <td class="paramtype">const device float *</td>          <td class="paramname"><span class="paramname"><em>partials</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device float *</td>          <td class="paramname"><span class="paramname"><em>sums</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device float *</td>          <td class="paramname"><span class="paramname"><em>maxs</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">device T *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>tid</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_gid</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_lid</em></span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
 </div>
 </div>
 </div><!-- contents -->
diff --git a/docs/build/html/sdpa__vector_8h_source.html b/docs/build/html/sdpa__vector_8h_source.html
index 390193135..8785dfc94 100644
--- a/docs/build/html/sdpa__vector_8h_source.html
+++ b/docs/build/html/sdpa__vector_8h_source.html
@@ -115,106 +115,288 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN = 32;</div>
 <div class="line"><a id="l00022" name="l00022"></a><span class="lineno">   22</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BD = 32;</div>
 <div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> elem_per_thread = D / BD;</div>
-<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span> </div>
-<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> stride = BN * D;</div>
-<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span> </div>
-<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
-<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
-<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  thread U q[elem_per_thread];</div>
-<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  thread U k[elem_per_thread];</div>
-<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span>  thread U o[elem_per_thread];</div>
-<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span> </div>
-<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  threadgroup U outputs[BN * BD];</div>
-<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>  threadgroup U max_scores[BN];</div>
-<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span>  threadgroup U sum_exp_scores[BN];</div>
-<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span> </div>
-<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="comment">// Adjust positions</span></div>
-<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> head_idx = tid.y;</div>
-<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> kv_head_idx = head_idx / gqa_factor;</div>
-<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  queries += head_idx * D + simd_lid * elem_per_thread;</div>
-<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  keys += kv_head_idx * k_stride + simd_gid * D + simd_lid * elem_per_thread;</div>
-<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>  values += kv_head_idx * v_stride + simd_gid * D + simd_lid * elem_per_thread;</div>
-<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span>  out += head_idx * D + simd_gid * elem_per_thread;</div>
-<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span> </div>
-<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>  <span class="comment">// Read the query and 0 the output accumulator</span></div>
-<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
-<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>    q[i] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(scale) * queries[i];</div>
-<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>  }</div>
-<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
-<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>    o[i] = 0;</div>
-<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>  }</div>
-<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span> </div>
-<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>  U max_score = -INFINITY;</div>
-<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>  U sum_exp_score = 0;</div>
-<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span> </div>
-<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  <span class="comment">// For each key</span></div>
-<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = simd_gid; i &lt; N; i += BN) {</div>
-<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>    <span class="comment">// Read the key</span></div>
-<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
-<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>      k[i] = keys[i];</div>
-<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>    }</div>
-<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span> </div>
-<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    <span class="comment">// Compute the i-th score</span></div>
-<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    U score = 0;</div>
-<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
-<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>      score += q[i] * k[i];</div>
-<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    }</div>
-<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    score = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(score);</div>
-<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span> </div>
-<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>    <span class="comment">// Update the accumulators</span></div>
-<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>    U new_max = <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">max</a>(max_score, score);</div>
-<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>    U factor = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_score - new_max);</div>
-<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span>    U exp_score = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(score - new_max);</div>
-<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span> </div>
-<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>    max_score = new_max;</div>
-<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    sum_exp_score = sum_exp_score * factor + exp_score;</div>
-<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span> </div>
-<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>    <span class="comment">// Update the output accumulator</span></div>
-<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
-<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>      o[i] = o[i] * factor + exp_score * values[i];</div>
-<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    }</div>
-<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span> </div>
-<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    <span class="comment">// Move the pointers to the next kv</span></div>
-<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    keys += stride;</div>
-<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>    values += stride;</div>
-<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span>  }</div>
-<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno">   24</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> stride = BN * D;</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span> </div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span> </div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span>  thread U q[elem_per_thread];</div>
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno">   29</span>  thread U k[elem_per_thread];</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  thread U o[elem_per_thread];</div>
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno">   31</span> </div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>  threadgroup U outputs[BN * BD];</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  threadgroup U max_scores[BN];</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>  threadgroup U sum_exp_scores[BN];</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span> </div>
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno">   36</span>  <span class="comment">// Adjust positions</span></div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> head_idx = tid.y;</div>
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno">   38</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> kv_head_idx = head_idx / gqa_factor;</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>  queries += head_idx * D + simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  keys += kv_head_idx * k_stride + simd_gid * D + simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>  values += kv_head_idx * v_stride + simd_gid * D + simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span>  out += head_idx * D + simd_gid * elem_per_thread;</div>
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno">   43</span> </div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  <span class="comment">// Read the query and 0 the output accumulator</span></div>
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno">   45</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>    q[i] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(scale) * queries[i];</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>  }</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span>    o[i] = 0;</div>
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno">   50</span>  }</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span> </div>
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno">   52</span>  U max_score = -INFINITY;</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>  U sum_exp_score = 0;</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span> </div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>  <span class="comment">// For each key</span></div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = simd_gid; i &lt; N; i += BN) {</div>
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno">   57</span>    <span class="comment">// Read the key</span></div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno">   59</span>      k[i] = keys[i];</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>    }</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span> </div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>    <span class="comment">// Compute the i-th score</span></div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span>    U score = 0;</div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span>      score += q[i] * k[i];</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>    }</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    score = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(score);</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span> </div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>    <span class="comment">// Update the accumulators</span></div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>    U new_max = <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">max</a>(max_score, score);</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>    U factor = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_score - new_max);</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>    U exp_score = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(score - new_max);</div>
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno">   73</span> </div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>    max_score = new_max;</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>    sum_exp_score = sum_exp_score * factor + exp_score;</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span> </div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    <span class="comment">// Update the output accumulator</span></div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>      o[i] = o[i] * factor + exp_score * values[i];</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>    }</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span> </div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    <span class="comment">// Move the pointers to the next kv</span></div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span>    keys += stride;</div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>    values += stride;</div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>  }</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span> </div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>  <span class="comment">// Each thread has a partial part of the output so we need to combine them.</span></div>
 <div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span> </div>
-<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>  <span class="comment">// Each thread has a partial part of the output so we need to combine them.</span></div>
-<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span> </div>
-<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>  <span class="comment">// First let&#39;s communicate the max and sum_exp</span></div>
-<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
-<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>    max_scores[simd_gid] = max_score;</div>
-<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>    sum_exp_scores[simd_gid] = sum_exp_score;</div>
-<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>  }</div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>  max_score = max_scores[simd_lid];</div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  U new_max = <a class="code hl_function" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a>(max_score);</div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>  U factor = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_score - new_max);</div>
-<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>  sum_exp_score = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(sum_exp_scores[simd_lid] * factor);</div>
-<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span> </div>
-<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>  <span class="comment">// Now we need to aggregate all the outputs</span></div>
-<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
-<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    outputs[simd_lid * BD + simd_gid] = o[i];</div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span>  <span class="comment">// First let&#39;s communicate the max and sum_exp</span></div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>    max_scores[simd_gid] = max_score;</div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>    sum_exp_scores[simd_gid] = sum_exp_score;</div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span>  }</div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>  max_score = max_scores[simd_lid];</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>  U new_max = <a class="code hl_function" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a>(max_score);</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span>  U factor = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_score - new_max);</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  sum_exp_score = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(sum_exp_scores[simd_lid] * factor);</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span> </div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span>  <span class="comment">// Now we need to aggregate all the outputs</span></div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>    outputs[simd_lid * BD + simd_gid] = o[i];</div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>    o[i] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(outputs[simd_gid * BD + simd_lid] * factor) / sum_exp_score;</div>
 <div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>    o[i] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(outputs[simd_gid * BD + simd_lid] * factor) / sum_exp_score;</div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
-<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  }</div>
-<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span> </div>
-<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  <span class="comment">// And write the output</span></div>
-<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
-<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>      out[i] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(o[i]);</div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>    }</div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>  }</div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>}</div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>  }</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span> </div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  <span class="comment">// And write the output</span></div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>      out[i] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(o[i]);</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>    }</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>  }</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>}</div>
 </div>
-<div class="ttc" id="anamespacemetal_1_1fast_html_ad3dbd387b63373c29e3449609f763ede"><div class="ttname"><a href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">metal::fast::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:242</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a048cad0aca52cb737ebf103e76bd1c49"><div class="ttname"><a href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">metal::simd_max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_max(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:392</div></div>
-<div class="ttc" id="anamespacemetal_html_a85181e37a00cb4a4217f1bb25389bce5"><div class="ttname"><a href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">metal::simd_sum</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_sum(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:392</div></div>
-<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span> </div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> D&gt;</div>
+<div class="foldopen" id="foldopen00117" data-start="{" data-end="}">
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno"><a class="line" href="sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74">  117</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74">sdpa_vector_2pass_1</a>(</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>    <span class="keyword">const</span> device T* queries [[buffer(0)]],</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>    <span class="keyword">const</span> device T* keys [[buffer(1)]],</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>    <span class="keyword">const</span> device T* values [[buffer(2)]],</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    device <span class="keywordtype">float</span>* out [[buffer(3)]],</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    device <span class="keywordtype">float</span>* sums [[buffer(4)]],</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    device <span class="keywordtype">float</span>* maxs [[buffer(5)]],</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; gqa_factor,</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>&amp; N,</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; k_stride,</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>&amp; v_stride,</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    <span class="keyword">const</span> constant <span class="keywordtype">float</span>&amp; scale,</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN = 8;</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BD = 32;</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> elem_per_thread = D / BD;</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> stride = BN * D;</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> blocks = 32;</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span> </div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span> </div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>  thread U q[elem_per_thread];</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>  thread U k[elem_per_thread];</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>  thread U o[elem_per_thread];</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span> </div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>  threadgroup U outputs[BN * BD];</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>  threadgroup U max_scores[BN];</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>  threadgroup U sum_exp_scores[BN];</div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span> </div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>  <span class="comment">// Adjust positions</span></div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> block_idx = tid.z;</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> head_idx = tid.y;</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> kv_head_idx = head_idx / gqa_factor;</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>  queries += head_idx * D + simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>  keys += kv_head_idx * k_stride + (block_idx * BN + simd_gid) * D +</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>      simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>  values += kv_head_idx * v_stride + (block_idx * BN + simd_gid) * D +</div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>      simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>  out += head_idx * blocks * D + block_idx * D + simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>  sums += head_idx * blocks + block_idx;</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span>  maxs += head_idx * blocks + block_idx;</div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span> </div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>  <span class="comment">// Read the query and 0 the output accumulator</span></div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>    q[i] = <span class="keyword">static_cast&lt;</span>U<span class="keyword">&gt;</span>(scale) * queries[i];</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span>  }</div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>    o[i] = 0;</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>  }</div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span> </div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>  U max_score = -1e9;</div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>  U sum_exp_score = 0;</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span> </div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>  <span class="comment">// For each key</span></div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = block_idx * BN + simd_gid; i &lt; N; i += blocks * BN) {</div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>    <span class="comment">// Read the key</span></div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>      k[i] = keys[i];</div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>    }</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span> </div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span>    <span class="comment">// Compute the i-th score</span></div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>    U score = 0;</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>      score += q[i] * k[i];</div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>    }</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>    score = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(score);</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span> </div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>    <span class="comment">// Update the accumulators</span></div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>    U new_max = <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">max</a>(max_score, score);</div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>    U factor = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_score - new_max);</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>    U exp_score = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(score - new_max);</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span> </div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span>    max_score = new_max;</div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>    sum_exp_score = sum_exp_score * factor + exp_score;</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span> </div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span>    <span class="comment">// Update the output accumulator</span></div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span>      o[i] = o[i] * factor + exp_score * values[i];</div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>    }</div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span> </div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>    <span class="comment">// Move the pointers to the next kv</span></div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>    keys += blocks * stride;</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>    values += blocks * stride;</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>  }</div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span> </div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span>  <span class="comment">// Each thread has a partial part of the output so we need to combine them.</span></div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span> </div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>  <span class="comment">// First let&#39;s communicate the max and sum_exp</span></div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>    max_scores[simd_gid] = max_score;</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>    sum_exp_scores[simd_gid] = sum_exp_score;</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span>  }</div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>  max_score = (simd_lid &lt; BN) ? max_scores[simd_lid] : -1e9;</div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>  U new_max = <a class="code hl_function" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a>(max_score);</div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>  U factor = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_score - new_max);</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>  sum_exp_score = (simd_lid &lt; BN) ? sum_exp_scores[simd_lid] : 0;</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span>  sum_exp_score = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(sum_exp_score * factor);</div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span> </div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>  <span class="comment">// Write the sum and new max</span></div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>  <span class="keywordflow">if</span> (simd_gid == 0) {</div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>    sums[0] = sum_exp_score;</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>    maxs[0] = new_max;</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>  }</div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span> </div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>  <span class="comment">// Now we need to aggregate all the outputs</span></div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>    outputs[simd_lid * BN + simd_gid] =</div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>        o[i] * <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_scores[simd_gid] - new_max);</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span> </div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    <span class="comment">// And write the output</span></div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span>    <span class="keywordflow">if</span> (simd_gid == 0) {</div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>      U output = outputs[simd_lid * BN];</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>      <span class="keywordflow">for</span> (<span class="keywordtype">int</span> j = 1; j &lt; BN; j++) {</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span>        output += outputs[simd_lid * BN + j];</div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>      }</div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>      out[i] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(output);</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>    }</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span>  }</div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>}</div>
+</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span> </div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T, <span class="keywordtype">int</span> D&gt;</div>
+<div class="foldopen" id="foldopen00243" data-start="{" data-end="}">
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno"><a class="line" href="sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe">  243</a></span>[[kernel]] <span class="keywordtype">void</span> <a class="code hl_function" href="sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe">sdpa_vector_2pass_2</a>(</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span>    <span class="keyword">const</span> device <span class="keywordtype">float</span>* partials [[buffer(0)]],</div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>    <span class="keyword">const</span> device <span class="keywordtype">float</span>* sums [[buffer(1)]],</div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>    <span class="keyword">const</span> device <span class="keywordtype">float</span>* maxs [[buffer(2)]],</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>    device T* out [[buffer(3)]],</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>    uint simd_gid [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>    uint simd_lid [[thread_index_in_simdgroup]]) {</div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BN = 32;</div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> BD = 32;</div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> elem_per_thread = D / BD;</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> blocks = 32;</div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span> </div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>  <span class="keyword">typedef</span> <span class="keywordtype">float</span> U;</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span> </div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>  thread U o[elem_per_thread];</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>  threadgroup U outputs[BN * BD];</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span> </div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>  <span class="comment">// Adjust positions</span></div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>  <span class="keyword">const</span> <span class="keywordtype">int</span> head_idx = tid.y;</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>  partials += head_idx * blocks * D + simd_gid * D + simd_lid * elem_per_thread;</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>  sums += head_idx * blocks;</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>  maxs += head_idx * blocks;</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span>  out += head_idx * D + simd_gid * elem_per_thread;</div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span> </div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span>  <span class="comment">// First everybody reads the max and sum_exp</span></div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>  U max_score = maxs[simd_lid];</div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>  U new_max = <a class="code hl_function" href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">simd_max</a>(max_score);</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>  U factor = <a class="code hl_function" href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">fast::exp</a>(max_score - new_max);</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>  U sum_exp_score = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(sums[simd_lid] * factor);</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span> </div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>  <span class="comment">// Now read the block into registers and then use shared memory to transpose</span></div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span>  <span class="comment">// it</span></div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span>    o[i] = partials[i];</div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>  }</div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>    outputs[simd_lid * BD + simd_gid] = o[i];</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>    o[i] = <a class="code hl_function" href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">simd_sum</a>(outputs[simd_gid * BD + simd_lid] * factor) / sum_exp_score;</div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>  }</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span> </div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>  <span class="comment">// And write the output</span></div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>  <span class="keywordflow">if</span> (simd_lid == 0) {</div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span>    <span class="keywordflow">for</span> (<span class="keywordtype">int</span> i = 0; i &lt; elem_per_thread; i++) {</div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>      out[i] = <span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(o[i]);</div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>    }</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span>  }</div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>}</div>
+</div>
+<div class="ttc" id="anamespacemetal_1_1fast_html_ad3dbd387b63373c29e3449609f763ede"><div class="ttname"><a href="namespacemetal_1_1fast.html#ad3dbd387b63373c29e3449609f763ede">metal::fast::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:240</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a048cad0aca52cb737ebf103e76bd1c49"><div class="ttname"><a href="namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49">metal::simd_max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_max(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:378</div></div>
+<div class="ttc" id="anamespacemetal_html_a85181e37a00cb4a4217f1bb25389bce5"><div class="ttname"><a href="namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5">metal::simd_sum</a></div><div class="ttdeci">METAL_FUNC bfloat16_t simd_sum(bfloat16_t data)</div><div class="ttdef"><b>Definition</b> bf16_math.h:378</div></div>
+<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="asdpa__vector_8h_html_a1368cf3618a4e03dbf743b3463205efe"><div class="ttname"><a href="sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe">sdpa_vector_2pass_2</a></div><div class="ttdeci">void sdpa_vector_2pass_2(const device float *partials, const device float *sums, const device float *maxs, device T *out, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> sdpa_vector.h:243</div></div>
 <div class="ttc" id="asdpa__vector_8h_html_a4bf36f16e16c1c62d9b243573568e5ae"><div class="ttname"><a href="sdpa__vector_8h.html#a4bf36f16e16c1c62d9b243573568e5ae">sdpa_vector</a></div><div class="ttdeci">void sdpa_vector(const device T *queries, const device T *keys, const device T *values, device T *out, const constant int &amp;gqa_factor, const constant int &amp;N, const constant size_t &amp;k_stride, const constant size_t &amp;v_stride, const constant float &amp;scale, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> sdpa_vector.h:8</div></div>
+<div class="ttc" id="asdpa__vector_8h_html_ae070ec482c79c5b3bd19dd03ea42ec74"><div class="ttname"><a href="sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74">sdpa_vector_2pass_1</a></div><div class="ttdeci">void sdpa_vector_2pass_1(const device T *queries, const device T *keys, const device T *values, device float *out, device float *sums, device float *maxs, const constant int &amp;gqa_factor, const constant int &amp;N, const constant size_t &amp;k_stride, const constant size_t &amp;v_stride, const constant float &amp;scale, uint3 tid, uint simd_gid, uint simd_lid)</div><div class="ttdef"><b>Definition</b> sdpa_vector.h:117</div></div>
 </div><!-- fragment --></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/search.html b/docs/build/html/search.html
index db738de07..d1f29637b 100644
--- a/docs/build/html/search.html
+++ b/docs/build/html/search.html
@@ -6,7 +6,7 @@
 
   <head>
     <meta charset="utf-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><title>Search - MLX 0.20.0 documentation</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><title>Search - MLX 0.21.0 documentation</title>
   
   
   
@@ -37,7 +37,7 @@
   <link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="_static/documentation_options.js?v=174dfe6e"></script>
     <script src="_static/doctools.js?v=9a2dae69"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -50,7 +50,7 @@
     <link rel="search" title="Search" href="#" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -131,8 +131,8 @@
       
     
     
-    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -445,7 +445,6 @@
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -522,6 +521,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -551,6 +551,7 @@
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/search/all_1.js b/docs/build/html/search/all_1.js
index 2feceb6ce..0b76d81d5 100644
--- a/docs/build/html/search/all_1.js
+++ b/docs/build/html/search/all_1.js
@@ -5,7 +5,7 @@ var searchData=
   ['a_5fstr_5fm_2',['A_str_m',['../structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877',1,'mlx::steel::BlockMMA']]],
   ['abs_3',['Abs',['../struct_abs.html',1,'Abs'],['../classmlx_1_1core_1_1_abs.html',1,'mlx::core::Abs'],['../structmlx_1_1core_1_1detail_1_1_abs.html',1,'mlx::core::detail::Abs'],['../classmlx_1_1core_1_1_abs.html#a1247e72feb640fb562d036b2dd1ae4ad',1,'mlx::core::Abs::Abs()']]],
   ['abs_4',['abs',['../namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb',1,'metal::abs()'],['../namespacemetal_1_1fast.html#a90d2973f71f83180e7f02e38d11c7a8f',1,'metal::fast::abs()'],['../namespacemetal_1_1precise.html#a99f2b2746e813b9ca7b4249afbaf2a14',1,'metal::precise::abs()'],['../group__ops.html#ga5528e80f5e8bad71e106a0cf9edd8920',1,'mlx::core::abs()']]],
-  ['accum_5ftype_5',['accum_type',['../structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da',1,'mlx::steel::AccumHelper']]],
+  ['accum_5ftype_5',['accum_type',['../structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26',1,'mlx::steel::AccumHelper']]],
   ['accumhelper_6',['AccumHelper',['../structmlx_1_1steel_1_1_accum_helper.html',1,'mlx::steel']]],
   ['acos_7',['acos',['../namespacemetal.html#ad4537748b3c832b6569ff7ccb209fcb2',1,'metal::acos()'],['../namespacemetal_1_1fast.html#a805ce5c3a94b618b7349d70bbb82f0b2',1,'metal::fast::acos()'],['../namespacemetal_1_1precise.html#a8a2bcc89fc0b7e74f0453f82f89a8604',1,'metal::precise::acos()']]],
   ['acosh_8',['acosh',['../namespacemetal.html#a2d0efb92b7f61eff342d776bd6c5f3a0',1,'metal::acosh()'],['../namespacemetal_1_1fast.html#afb656fc3406649a238b6f1e0509de751',1,'metal::fast::acosh()'],['../namespacemetal_1_1precise.html#a1f489fabffab969b8677b56bb1136067',1,'metal::precise::acosh()']]],
@@ -26,18 +26,18 @@ var searchData=
   ['adjust_5fmatrix_5foffsets_23',['adjust_matrix_offsets',['../quantized_8h.html#accab1f9e17a65242347c051f98e4c0be',1,'adjust_matrix_offsets(const device T *&amp;x, const device uint32_t *&amp;w, const device T *&amp;scales, const device T *&amp;biases, device T *&amp;y, int output_stride, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid):&#160;quantized.h'],['../quantized_8h.html#a3ab400746ad77be89c30d25638e01698',1,'adjust_matrix_offsets(const device T *&amp;x, const device uint32_t *&amp;w, const device T *&amp;scales, const device T *&amp;biases, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, device T *&amp;y, int output_stride, const constant int &amp;batch_ndims, const constant int *batch_shape, const constant size_t *lhs_strides, const constant size_t *rhs_strides, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid):&#160;quantized.h']]],
   ['advance_24',['advance',['../classpocketfft_1_1detail_1_1multi__iter.html#a5ddcc0666125b3cb6c0d62b30befdd2c',1,'pocketfft::detail::multi_iter::advance()'],['../classpocketfft_1_1detail_1_1simple__iter.html#a73a9ecd3008d2bd35aaa00bf9fac074f',1,'pocketfft::detail::simple_iter::advance()'],['../classpocketfft_1_1detail_1_1rev__iter.html#ad1918c84ae963188afc7599629b29686',1,'pocketfft::detail::rev_iter::advance()']]],
   ['affine_5fdequantize_25',['affine_dequantize',['../quantized_8h.html#a6076203615038eb06816158f7b3869c6',1,'affine_dequantize():&#160;quantized.h'],['../namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328',1,'mlx::core::fast::affine_dequantize()']]],
-  ['affine_5fquantize_26',['affine_quantize',['../quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59',1,'affine_quantize():&#160;quantized.h'],['../namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080',1,'mlx::core::fast::affine_quantize(const array &amp;w, int group_size=64, int bits=4, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fast.html#a638c7e9b9ea8677f01786d8f9738baf8',1,'mlx::core::fast::affine_quantize(const array &amp;w, const array &amp;scales, const array &amp;biases, int group_size=64, int bits=4, StreamOrDevice s={})']]],
-  ['affine_5fquantize_5fscales_5fbiases_27',['affine_quantize_scales_biases',['../quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c',1,'quantized.h']]],
-  ['affinequantize_28',['AffineQuantize',['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html',1,'mlx::core::fast::AffineQuantize'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a84d5fa9e8c3de407fbcc5f38d2ed1473',1,'mlx::core::fast::AffineQuantize::AffineQuantize()']]],
-  ['align_5fk_29',['align_K',['../steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416',1,'steel_gemm_fused.h']]],
-  ['align_5fm_30',['align_M',['../steel__gemm__fused_8h.html#a55af226dc74b0026b7d4b865142a6d21',1,'steel_gemm_fused.h']]],
-  ['align_5fn_31',['align_N',['../steel__gemm__fused_8h.html#aa3b267252df2dcbfdde8c5f174d27036',1,'steel_gemm_fused.h']]],
+  ['affine_5fquantize_26',['affine_quantize',['../quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59',1,'affine_quantize():&#160;quantized.h'],['../namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080',1,'mlx::core::fast::affine_quantize(const array &amp;w, int group_size=64, int bits=4, StreamOrDevice s={})']]],
+  ['affinequantize_27',['AffineQuantize',['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html',1,'mlx::core::fast::AffineQuantize'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a84d5fa9e8c3de407fbcc5f38d2ed1473',1,'mlx::core::fast::AffineQuantize::AffineQuantize()']]],
+  ['align_5fk_28',['align_K',['../steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416',1,'align_K:&#160;steel_attention.h'],['../steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416',1,'align_K:&#160;steel_gemm_fused.h']]],
+  ['align_5fm_29',['align_M',['../steel__gemm__fused_8h.html#a55af226dc74b0026b7d4b865142a6d21',1,'steel_gemm_fused.h']]],
+  ['align_5fn_30',['align_N',['../steel__gemm__fused_8h.html#aa3b267252df2dcbfdde8c5f174d27036',1,'steel_gemm_fused.h']]],
+  ['align_5fq_31',['align_Q',['../steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982',1,'steel_attention.h']]],
   ['aligned_5falloc_32',['aligned_alloc',['../namespacepocketfft_1_1detail.html#ae397445c61400f47a8fe3f8e1b6d0b76',1,'pocketfft::detail']]],
   ['aligned_5fallocator_33',['aligned_allocator',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html',1,'pocketfft::detail::threading::aligned_allocator&lt; T &gt;'],['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a57c07047ac09c6cf48a269429de2b0fb',1,'pocketfft::detail::threading::aligned_allocator::aligned_allocator(const aligned_allocator&lt; U &gt; &amp;)'],['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a0c390851ec37c5cdc5c1e7c6232a0b94',1,'pocketfft::detail::threading::aligned_allocator::aligned_allocator()=default']]],
   ['aligned_5fdealloc_34',['aligned_dealloc',['../namespacepocketfft_1_1detail.html#aec7820e36a33e0a8bb83aa03b04b81e8',1,'pocketfft::detail']]],
   ['all_35',['all',['../group__ops.html#ga3b1b90ef1275ca17655b6d7f25d3ee68',1,'mlx::core::all(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga3689e12e8f42dadb4cbe2b07dc4099f4',1,'mlx::core::all(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gac0919c6ba53aea35a7683dea7e9a9a59',1,'mlx::core::all(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gae2d5fcc5b62d673cca76c08b7b4afbbc',1,'mlx::core::all(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
   ['all_5fgather_36',['all_gather',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#aeb5a1726358213bc75756506f7b54d04',1,'mlx::core::distributed::detail::all_gather()'],['../namespacemlx_1_1core_1_1distributed.html#a82ef5e8cc7ac62cd228e51b1c1b77cb7',1,'mlx::core::distributed::all_gather()']]],
-  ['all_5freduce_37',['all_reduce',['../reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d',1,'reduce_all.h']]],
+  ['all_5freduce_37',['all_reduce',['../reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8',1,'reduce_all.h']]],
   ['all_5freduce_5fdispatch_38',['all_reduce_dispatch',['../namespacemlx_1_1core.html#a3ab0fd997d9a35782106ff083a72e098',1,'mlx::core']]],
   ['all_5fsum_39',['all_sum',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#aa1d225b25f7b6426c48c5e35860ee960',1,'mlx::core::distributed::detail::all_sum()'],['../namespacemlx_1_1core_1_1distributed.html#a67ccb1a5445fc6f5db49dd36a15e5980',1,'mlx::core::distributed::all_sum()']]],
   ['allclose_40',['allclose',['../group__ops.html#gaf0cd4257de7542daf9faf5e605e31020',1,'mlx::core']]],
@@ -48,13 +48,13 @@ var searchData=
   ['allocator_45',['allocator',['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#abf84c726a37df68345589b897b2e35f0',1,'mlx::core::allocator::CommonAllocator::allocator'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#afa1c5a725309caff163c492b5b84491e',1,'mlx::core::metal::MetalAllocator::allocator'],['../namespacemlx_1_1core_1_1allocator.html#aa23e2f20a336d0b159c097087194634e',1,'mlx::core::allocator::allocator()'],['../namespacemlx_1_1core_1_1metal.html#a74b3558bd518aecde6b14b0ba5e1a0d5',1,'mlx::core::metal::allocator()']]],
   ['allocator_2eh_46',['allocator.h',['../allocator_8h.html',1,'(Global Namespace)'],['../backend_2metal_2allocator_8h.html',1,'(Global Namespace)']]],
   ['allreduce_47',['AllReduce',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html',1,'mlx::core::distributed::AllReduce'],['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a2d1ea56cbf72a316680ea90aa6da1c2d',1,'mlx::core::distributed::AllReduce::AllReduce()']]],
-  ['alpha_48',['alpha',['../struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477',1,'MLXFastAttentionParams::alpha'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710',1,'mlx::steel::GEMMAddMMParams::alpha'],['../structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff',1,'mlx::steel::TransformAxpby::alpha']]],
+  ['alpha_48',['alpha',['../structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff',1,'mlx::steel::TransformAxpby::alpha'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710',1,'mlx::steel::GEMMAddMMParams::alpha']]],
   ['and_49',['And',['../struct_and.html',1,'And&lt; U &gt;'],['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924aba3b7fb927f6b6c8b198a9cdc3dd9e02',1,'mlx::core::distributed::AllReduce::And'],['../classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23dab14e7d426f45ae7f029f4e00210fbae4',1,'mlx::core::BitwiseBinary::And'],['../classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a5cc3412a1f243dcb11661bca42daea93',1,'mlx::core::Reduce::And']]],
   ['any_50',['any',['../group__ops.html#ga8598dd718fb05cb28535e250372d4e6f',1,'mlx::core::any(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#gad37df97f253a963bece124198dbaf9ba',1,'mlx::core::any(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaf240618fc8b06debf5f56e97e84f18ef',1,'mlx::core::any(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gab1d56277d468a55227f4dad6bc2fc1ce',1,'mlx::core::any(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['apply_51',['apply',['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply(InT x) const'],['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply(InT x) const'],['../structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75',1,'mlx::steel::TransformNone::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90',1,'mlx::steel::TransformNone::apply(InT x, OutT)'],['../structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf',1,'mlx::steel::TransformAdd::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19',1,'mlx::steel::TransformAdd::apply(InT x, OutT c)'],['../structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87',1,'mlx::steel::TransformAxpby::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba',1,'mlx::steel::TransformAxpby::apply(InT x, OutT c) const']]],
-  ['apply_5fepilogue_52',['apply_epilogue',['../structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff',1,'mlx::steel::BlockMMA::apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae',1,'mlx::steel::BlockMMA::apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)']]],
-  ['apply_5fepilogue_5fsafe_53',['apply_epilogue_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a',1,'mlx::steel::BlockMMA']]],
-  ['apply_5finplace_5fop_54',['apply_inplace_op',['../structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf',1,'mlx::steel::BlockLoader']]],
+  ['apply_51',['apply',['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply()'],['../struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16',1,'TransformScale::apply()'],['../struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e',1,'MaxOp::apply()'],['../struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d',1,'SumOp::apply()'],['../struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756',1,'MulOp::apply()'],['../struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143',1,'SubOp::apply()'],['../struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334',1,'ExpSubOp::apply()'],['../struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221',1,'DivOp::apply()'],['../structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75',1,'mlx::steel::TransformNone::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90',1,'mlx::steel::TransformNone::apply(InT x, OutT)'],['../structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf',1,'mlx::steel::TransformAdd::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19',1,'mlx::steel::TransformAdd::apply(InT x, OutT c)'],['../structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87',1,'mlx::steel::TransformAxpby::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba',1,'mlx::steel::TransformAxpby::apply(InT x, OutT c) const'],['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply()'],['../structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75',1,'mlx::steel::TransformNone::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90',1,'mlx::steel::TransformNone::apply(InT x, OutT)'],['../structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf',1,'mlx::steel::TransformAdd::apply(InT 
x)'],['../structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19',1,'mlx::steel::TransformAdd::apply(InT x, OutT c)'],['../structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87',1,'mlx::steel::TransformAxpby::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba',1,'mlx::steel::TransformAxpby::apply(InT x, OutT c) const']]],
+  ['apply_5fepilogue_52',['apply_epilogue',['../structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff',1,'mlx::steel::BlockMMA::apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae',1,'mlx::steel::BlockMMA::apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff',1,'mlx::steel::BlockMMA::apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae',1,'mlx::steel::BlockMMA::apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)']]],
+  ['apply_5fepilogue_5fsafe_53',['apply_epilogue_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a',1,'mlx::steel::BlockMMA::apply_epilogue_safe(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a',1,'mlx::steel::BlockMMA::apply_epilogue_safe(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)']]],
+  ['apply_5finplace_5fop_54',['apply_inplace_op',['../structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf',1,'mlx::steel::BlockLoader::apply_inplace_op()'],['../structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97',1,'mlx::steel::BlockLoaderT::apply_inplace_op()'],['../structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf',1,'mlx::steel::BlockLoader::apply_inplace_op()']]],
   ['arange_55',['Arange',['../classmlx_1_1core_1_1_arange.html',1,'mlx::core::Arange'],['../classmlx_1_1core_1_1_arange.html#a1a70c3b0b9c67d5a9446c141c5b7c574',1,'mlx::core::Arange::Arange()']]],
   ['arange_56',['arange',['../namespacemlx_1_1core.html#a369aa886219b83cf219e7a7862ce260b',1,'mlx::core::arange()'],['../namespacemlx_1_1core_1_1metal.html#a272c36f0faf2570cbb2f36030e9a3f26',1,'mlx::core::metal::arange()'],['../metal_2kernels_2arange_8h.html#a1e5126ee6ae0164c2343230c4d87c03e',1,'arange():&#160;arange.h'],['../group__ops.html#ga7ca088b8090b9f84f2e08345cf3f835a',1,'mlx::core::arange(double start, double stop, double step, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga4c36b841dc5cba391dad029be5a0ad98',1,'mlx::core::arange(double start, double stop, double step, StreamOrDevice s={})'],['../group__ops.html#ga8d7cf9eb15e2daf1469058907e8abc85',1,'mlx::core::arange(double start, double stop, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga74566a14e69ba6a25f5a35e7ade5c282',1,'mlx::core::arange(double start, double stop, StreamOrDevice s={})'],['../group__ops.html#ga345aa27af3dae3646b8b4b1068e89a3e',1,'mlx::core::arange(double stop, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#gaae179075d0fe23f4bd53fdf8c41f4c70',1,'mlx::core::arange(double stop, StreamOrDevice s={})'],['../group__ops.html#ga6b945f513077c2978afc1a952c884860',1,'mlx::core::arange(int start, int stop, int step, StreamOrDevice s={})'],['../group__ops.html#ga1c39fcc6eaa1c1867735c7f849d708d6',1,'mlx::core::arange(int start, int stop, StreamOrDevice s={})'],['../group__ops.html#gafe6e4580452c873cac294f16129e633f',1,'mlx::core::arange(int stop, StreamOrDevice s={})']]],
   ['arange_2eh_57',['arange.h',['../common_2arange_8h.html',1,'(Global Namespace)'],['../metal_2jit_2arange_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2arange_8h.html',1,'(Global Namespace)']]],
@@ -104,12 +104,15 @@ var searchData=
   ['atan_101',['atan',['../namespacemetal.html#a80a771553d9a0012b93620d19c48b00f',1,'metal::atan()'],['../namespacemetal_1_1fast.html#a769503b4b7f89071d0983258c5a3ac5a',1,'metal::fast::atan()'],['../namespacemetal_1_1precise.html#aaaf4b5f4786a912089bbf0ae7619a6be',1,'metal::precise::atan()']]],
   ['atan2_102',['atan2',['../namespacemetal.html#a1d430793eaa38ccf0d07145e3fcd1e61',1,'metal::atan2()'],['../namespacemetal_1_1fast.html#a00e687ea46f5affe26e6aef8fd62b89a',1,'metal::fast::atan2()'],['../namespacemetal_1_1precise.html#a6f161b049cc6884f87b09b33c2d1cd7f',1,'metal::precise::atan2()']]],
   ['atanh_103',['atanh',['../namespacemetal.html#a57116427997ba71dd3863bfb15de33bf',1,'metal::atanh()'],['../namespacemetal_1_1fast.html#af24608fc605db9a14427d37c36dc1c53',1,'metal::fast::atanh()'],['../namespacemetal_1_1precise.html#a902994837653b90c47f4285673e712c4',1,'metal::precise::atanh()']]],
-  ['atile_104',['Atile',['../structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c',1,'mlx::steel::BlockMMA']]],
+  ['atile_104',['Atile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586',1,'mlx::steel::BlockMMA']]],
   ['atleast_5f1d_105',['atleast_1d',['../group__ops.html#gaba4d25e7a2bf87ba4feb7837ec7fa396',1,'mlx::core::atleast_1d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga08ca172ce80157c916c89dd0b45b95c5',1,'mlx::core::atleast_1d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
   ['atleast_5f2d_106',['atleast_2d',['../group__ops.html#gaeeb7f5bb88aa32a3ac2be1f39c5f8087',1,'mlx::core::atleast_2d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga9950299a80c2562f13448758f856d1f5',1,'mlx::core::atleast_2d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
   ['atleast_5f3d_107',['atleast_3d',['../group__ops.html#ga4afd919601e67782ff964465919956a0',1,'mlx::core::atleast_3d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaffdf742ad79440a60dda40062a8074fe',1,'mlx::core::atleast_3d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
   ['atomic_2eh_108',['atomic.h',['../atomic_8h.html',1,'']]],
   ['atomic_5fupdate_109',['atomic_update',['../struct_none.html#aecbce7c97e8b1d5dc4afd2e788c24e06',1,'None']]],
   ['attach_5fevent_110',['attach_event',['../classmlx_1_1core_1_1array.html#a000c3cfe13cb378bf0523b62816190da',1,'mlx::core::array']]],
-  ['available_111',['available',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078a308bd3e5bf976888b120dd36d0c2d2ae',1,'mlx::core::array']]]
+  ['attention_111',['attention',['../steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33',1,'steel_attention.h']]],
+  ['attn_2eh_112',['attn.h',['../attn_8h.html',1,'']]],
+  ['attnparams_113',['AttnParams',['../structmlx_1_1steel_1_1_attn_params.html',1,'mlx::steel']]],
+  ['available_114',['available',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078a308bd3e5bf976888b120dd36d0c2d2ae',1,'mlx::core::array']]]
 ];
diff --git a/docs/build/html/search/all_10.js b/docs/build/html/search/all_10.js
index 9ae5f957e..d5786df44 100644
--- a/docs/build/html/search/all_10.js
+++ b/docs/build/html/search/all_10.js
@@ -6,7 +6,7 @@ var searchData=
   ['pad_5fgpu_3',['pad_gpu',['../namespacemlx_1_1core.html#a6e2054d396ae487d810642dc19cdd0b0',1,'mlx::core']]],
   ['parallelfilereader_4',['ParallelFileReader',['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html',1,'mlx::core::io::ParallelFileReader'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a6cdb4547408f8cbca9e2ddd82514e697',1,'mlx::core::io::ParallelFileReader::ParallelFileReader()']]],
   ['params_5',['params',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a09b4719415c5bddb0bb70c704b1d8d02',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a4f8c792ede675d14b70dd19fcf3c5aee',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::params'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a1fed11be2e8d9d594dcdf60e32b936b1',1,'mlx::steel::Conv2DWeightBlockLoader::params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a335c573456ede3dd34bda1eec9842fe2',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::params'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#af67adf4550d69231a259e79c1aae9acc',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a09fd92c74ef57c20b48bc780153365ba',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::params'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#ae4759d18c0e5cc3530b3da8493008419',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::params']]],
-  ['params_2eh_6',['params.h',['../conv_2params_8h.html',1,'(Global Namespace)'],['../gemm_2params_8h.html',1,'(Global Namespace)']]],
+  ['params_2eh_6',['params.h',['../attn_2params_8h.html',1,'(Global Namespace)'],['../conv_2params_8h.html',1,'(Global Namespace)'],['../gemm_2params_8h.html',1,'(Global Namespace)']]],
   ['partition_7',['Partition',['../classmlx_1_1core_1_1_partition.html',1,'mlx::core::Partition'],['../classmlx_1_1core_1_1_partition.html#a7b82ca3895b6654308fac566b277ac0d',1,'mlx::core::Partition::Partition()']]],
   ['partition_8',['partition',['../group__ops.html#gac1b30830a972fb9a2601379ad2b32405',1,'mlx::core::partition(const array &amp;a, int kth, StreamOrDevice s={})'],['../group__ops.html#ga4fbea3a5f66cf81e3c119d1661119321',1,'mlx::core::partition(const array &amp;a, int kth, int axis, StreamOrDevice s={})']]],
   ['per_5fthread_5frow_5freduce_9',['per_thread_row_reduce',['../reduce__row_8h.html#a9d5e0049a2276f43702fc6907e74a35f',1,'per_thread_row_reduce(thread U totals[N_WRITES], const device T *inputs[N_WRITES], int blocks, int extra, uint lsize_x, uint lid_x):&#160;reduce_row.h'],['../reduce__row_8h.html#a045ec34228e77c79ec67d11c39ff097a',1,'per_thread_row_reduce(thread U totals[N_WRITES], const device T *in, const constant size_t &amp;reduction_size, int blocks, int extra, uint lsize_x, uint lid_x):&#160;reduce_row.h'],['../reduce__row_8h.html#a4d00c44e5f4a13be529ff8b664a0a342',1,'per_thread_row_reduce(thread U totals[N_WRITES], const device T *in, const size_t row_idx, int blocks, int extra, const constant int *shape, const constant size_t *strides, const constant int &amp;ndim, uint lsize_x, uint lid_x):&#160;reduce_row.h']]],
@@ -59,7 +59,7 @@ var searchData=
   ['primitive_5fid_56',['primitive_id',['../classmlx_1_1core_1_1array.html#af5ad83605d4eea81561246873bee1d7c',1,'mlx::core::array']]],
   ['primitive_5fptr_57',['primitive_ptr',['../classmlx_1_1core_1_1array.html#a5119cd616ec3c05d65878944b8889469',1,'mlx::core::array']]],
   ['primitives_2eh_58',['primitives.h',['../distributed_2primitives_8h.html',1,'(Global Namespace)'],['../primitives_8h.html',1,'(Global Namespace)']]],
-  ['print_59',['print',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab',1,'mlx::core::distributed::AllReduce::print()'],['../classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb',1,'mlx::core::Primitive::print()'],['../classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107',1,'mlx::core::Abs::print()'],['../classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d',1,'mlx::core::Add::print()'],['../classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9',1,'mlx::core::AddMM::print()'],['../classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e',1,'mlx::core::Arange::print()'],['../classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739',1,'mlx::core::ArcCos::print()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630',1,'mlx::core::ArcCosh::print()'],['../classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87',1,'mlx::core::ArcSin::print()'],['../classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430',1,'mlx::core::ArcSinh::print()'],['../classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05',1,'mlx::core::ArcTan::print()'],['../classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361',1,'mlx::core::ArcTan2::print()'],['../classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523',1,'mlx::core::ArcTanh::print()'],['../classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63',1,'mlx::core::ArgPartition::print()'],['../classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f',1,'mlx::core::ArgReduce::print()'],['../classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd',1,'mlx::core::ArgSort::print()'],['../classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c',1,'mlx::core::AsType::print()'],['../classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf',1,'mlx::core::AsStrided::print()'],['../classm
lx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d',1,'mlx::core::BitwiseBinary::print()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159',1,'mlx::core::BlockMaskedMM::print()'],['../classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758',1,'mlx::core::GatherMM::print()'],['../classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11',1,'mlx::core::Broadcast::print()'],['../classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee',1,'mlx::core::Ceil::print()'],['../classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b',1,'mlx::core::Compiled::print()'],['../classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33',1,'mlx::core::Concatenate::print()'],['../classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4',1,'mlx::core::Conjugate::print()'],['../classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd',1,'mlx::core::Convolution::print()'],['../classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008',1,'mlx::core::Copy::print()'],['../classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696',1,'mlx::core::Cos::print()'],['../classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2',1,'mlx::core::Cosh::print()'],['../classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298',1,'mlx::core::CustomTransforms::print()'],['../classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82',1,'mlx::core::Depends::print()'],['../classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6',1,'mlx::core::Divide::print()'],['../classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1',1,'mlx::core::DivMod::print()'],['../classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7',1,'mlx::core::Select::print()'],['../classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4',1,'mlx::core::Remainder::print()'],['../classmlx_1_1core_1_1_equal.htm
l#a0787bf32f0b405a8b2ac809d2d990774',1,'mlx::core::Equal::print()'],['../classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c',1,'mlx::core::Erf::print()'],['../classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9',1,'mlx::core::ErfInv::print()'],['../classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a',1,'mlx::core::Exp::print()'],['../classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1',1,'mlx::core::Expm1::print()'],['../classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf',1,'mlx::core::FFT::print()'],['../classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6',1,'mlx::core::Floor::print()'],['../classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013',1,'mlx::core::Full::print()'],['../classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91',1,'mlx::core::Gather::print()'],['../classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04',1,'mlx::core::Greater::print()'],['../classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef',1,'mlx::core::GreaterEqual::print()'],['../classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6',1,'mlx::core::Hadamard::print()'],['../classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d',1,'mlx::core::Imag::print()'],['../classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78',1,'mlx::core::Less::print()'],['../classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950',1,'mlx::core::LessEqual::print()'],['../classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa',1,'mlx::core::Load::print()'],['../classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d',1,'mlx::core::Log::print()'],['../classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4',1,'mlx::core::Log1p::print()'],['../classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c',1,'mlx::core::LogicalNot::print()'],['../classmlx_1_1core_1_1_logical_and.html#a9a5220
eb56e1fd94fd879394ee5ad397',1,'mlx::core::LogicalAnd::print()'],['../classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003',1,'mlx::core::LogicalOr::print()'],['../classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9',1,'mlx::core::LogAddExp::print()'],['../classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd',1,'mlx::core::Matmul::print()'],['../classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca',1,'mlx::core::Maximum::print()'],['../classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512',1,'mlx::core::Minimum::print()'],['../classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909',1,'mlx::core::Multiply::print()'],['../classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91',1,'mlx::core::Negative::print()'],['../classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09',1,'mlx::core::NotEqual::print()'],['../classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52',1,'mlx::core::NumberOfElements::print()'],['../classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a',1,'mlx::core::Pad::print()'],['../classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0',1,'mlx::core::Partition::print()'],['../classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60',1,'mlx::core::Power::print()'],['../classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db',1,'mlx::core::QuantizedMatmul::print()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0',1,'mlx::core::GatherQMM::print()'],['../classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271',1,'mlx::core::RandomBits::print()'],['../classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b',1,'mlx::core::Real::print()'],['../classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862',1,'mlx::core::Reshape::print()'],['../classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a6877068
81f03cd',1,'mlx::core::Reduce::print()'],['../classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72',1,'mlx::core::Round::print()'],['../classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22',1,'mlx::core::Scan::print()'],['../classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa',1,'mlx::core::Scatter::print()'],['../classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2',1,'mlx::core::Sigmoid::print()'],['../classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a',1,'mlx::core::Sign::print()'],['../classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4',1,'mlx::core::Sin::print()'],['../classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77',1,'mlx::core::Sinh::print()'],['../classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504',1,'mlx::core::Slice::print()'],['../classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b',1,'mlx::core::SliceUpdate::print()'],['../classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83',1,'mlx::core::Softmax::print()'],['../classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2',1,'mlx::core::Sort::print()'],['../classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2',1,'mlx::core::Split::print()'],['../classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384',1,'mlx::core::Square::print()'],['../classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f',1,'mlx::core::Sqrt::print()'],['../classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50',1,'mlx::core::StopGradient::print()'],['../classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b',1,'mlx::core::Subtract::print()'],['../classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f',1,'mlx::core::Tan::print()'],['../classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e',1,'mlx::core::Tanh::print()'],['../classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d',1,'
mlx::core::Uniform::print()'],['../classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c',1,'mlx::core::View::print()'],['../classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04',1,'mlx::core::Transpose::print()'],['../classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b',1,'mlx::core::QRF::print()'],['../classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53',1,'mlx::core::SVD::print()'],['../classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9',1,'mlx::core::Inverse::print()'],['../classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84',1,'mlx::core::Cholesky::print()'],['../classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84',1,'mlx::core::Eigh::print()'],['../structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bool val)'],['../structmlx_1_1core_1_1_print_formatter.html#a8da448a8adae671b26359341ea514316',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9d750c134a6fbfa8251c5b1d01d73287',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#adbbb9cbff767f9db73c659a0c07ba633',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a520adb07fafd911b22bc24b295e4f6cf',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ab0c702f1ae201e17cd328c9855cf522e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int64_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac59a5137ddd8b32aae057bb9826ee80d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint64_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac4b7895d1168cfc1a3d1186d8a414d2f',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, 
float16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ae21005f92bc641f2d657096f5d176a6d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bfloat16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a57af5c32561b95d6ac2a3a1dc4f5d43e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, float val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9e1dc67c9afb0a09966336504790823d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, complex64_t val)']]],
+  ['print_59',['print',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab',1,'mlx::core::distributed::AllReduce::print()'],['../classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb',1,'mlx::core::Primitive::print()'],['../classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107',1,'mlx::core::Abs::print()'],['../classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d',1,'mlx::core::Add::print()'],['../classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9',1,'mlx::core::AddMM::print()'],['../classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e',1,'mlx::core::Arange::print()'],['../classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739',1,'mlx::core::ArcCos::print()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630',1,'mlx::core::ArcCosh::print()'],['../classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87',1,'mlx::core::ArcSin::print()'],['../classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430',1,'mlx::core::ArcSinh::print()'],['../classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05',1,'mlx::core::ArcTan::print()'],['../classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361',1,'mlx::core::ArcTan2::print()'],['../classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523',1,'mlx::core::ArcTanh::print()'],['../classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63',1,'mlx::core::ArgPartition::print()'],['../classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f',1,'mlx::core::ArgReduce::print()'],['../classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd',1,'mlx::core::ArgSort::print()'],['../classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c',1,'mlx::core::AsType::print()'],['../classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf',1,'mlx::core::AsStrided::print()'],['../classm
lx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d',1,'mlx::core::BitwiseBinary::print()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159',1,'mlx::core::BlockMaskedMM::print()'],['../classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758',1,'mlx::core::GatherMM::print()'],['../classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11',1,'mlx::core::Broadcast::print()'],['../classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee',1,'mlx::core::Ceil::print()'],['../classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b',1,'mlx::core::Compiled::print()'],['../classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33',1,'mlx::core::Concatenate::print()'],['../classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4',1,'mlx::core::Conjugate::print()'],['../classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23',1,'mlx::core::Contiguous::print()'],['../classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd',1,'mlx::core::Convolution::print()'],['../classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008',1,'mlx::core::Copy::print()'],['../classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696',1,'mlx::core::Cos::print()'],['../classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2',1,'mlx::core::Cosh::print()'],['../classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298',1,'mlx::core::CustomTransforms::print()'],['../classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82',1,'mlx::core::Depends::print()'],['../classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6',1,'mlx::core::Divide::print()'],['../classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1',1,'mlx::core::DivMod::print()'],['../classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7',1,'mlx::core::Select::print()'],['../classmlx_1_1core_1_1_remaind
er.html#aeaecac5ea8e606d7ecd393d8019029e4',1,'mlx::core::Remainder::print()'],['../classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774',1,'mlx::core::Equal::print()'],['../classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c',1,'mlx::core::Erf::print()'],['../classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9',1,'mlx::core::ErfInv::print()'],['../classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a',1,'mlx::core::Exp::print()'],['../classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1',1,'mlx::core::Expm1::print()'],['../classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf',1,'mlx::core::FFT::print()'],['../classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6',1,'mlx::core::Floor::print()'],['../classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013',1,'mlx::core::Full::print()'],['../classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91',1,'mlx::core::Gather::print()'],['../classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04',1,'mlx::core::Greater::print()'],['../classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef',1,'mlx::core::GreaterEqual::print()'],['../classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6',1,'mlx::core::Hadamard::print()'],['../classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d',1,'mlx::core::Imag::print()'],['../classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78',1,'mlx::core::Less::print()'],['../classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950',1,'mlx::core::LessEqual::print()'],['../classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa',1,'mlx::core::Load::print()'],['../classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d',1,'mlx::core::Log::print()'],['../classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4',1,'mlx::core::Log1p::print()'],['../classmlx_1_1core_1_1_logical_not.html#a001ff3e
ca46440f0d8a287e0b98a8a2c',1,'mlx::core::LogicalNot::print()'],['../classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397',1,'mlx::core::LogicalAnd::print()'],['../classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003',1,'mlx::core::LogicalOr::print()'],['../classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9',1,'mlx::core::LogAddExp::print()'],['../classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd',1,'mlx::core::Matmul::print()'],['../classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca',1,'mlx::core::Maximum::print()'],['../classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512',1,'mlx::core::Minimum::print()'],['../classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909',1,'mlx::core::Multiply::print()'],['../classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91',1,'mlx::core::Negative::print()'],['../classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09',1,'mlx::core::NotEqual::print()'],['../classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52',1,'mlx::core::NumberOfElements::print()'],['../classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a',1,'mlx::core::Pad::print()'],['../classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0',1,'mlx::core::Partition::print()'],['../classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60',1,'mlx::core::Power::print()'],['../classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db',1,'mlx::core::QuantizedMatmul::print()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0',1,'mlx::core::GatherQMM::print()'],['../classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271',1,'mlx::core::RandomBits::print()'],['../classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b',1,'mlx::core::Real::print()'],['../classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb2
5ecff565b21862',1,'mlx::core::Reshape::print()'],['../classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd',1,'mlx::core::Reduce::print()'],['../classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72',1,'mlx::core::Round::print()'],['../classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22',1,'mlx::core::Scan::print()'],['../classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa',1,'mlx::core::Scatter::print()'],['../classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2',1,'mlx::core::Sigmoid::print()'],['../classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a',1,'mlx::core::Sign::print()'],['../classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4',1,'mlx::core::Sin::print()'],['../classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77',1,'mlx::core::Sinh::print()'],['../classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504',1,'mlx::core::Slice::print()'],['../classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b',1,'mlx::core::SliceUpdate::print()'],['../classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83',1,'mlx::core::Softmax::print()'],['../classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2',1,'mlx::core::Sort::print()'],['../classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2',1,'mlx::core::Split::print()'],['../classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384',1,'mlx::core::Square::print()'],['../classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f',1,'mlx::core::Sqrt::print()'],['../classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50',1,'mlx::core::StopGradient::print()'],['../classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b',1,'mlx::core::Subtract::print()'],['../classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f',1,'mlx::core::Tan::print()'],['../classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773
d78e',1,'mlx::core::Tanh::print()'],['../classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d',1,'mlx::core::Uniform::print()'],['../classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c',1,'mlx::core::View::print()'],['../classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04',1,'mlx::core::Transpose::print()'],['../classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b',1,'mlx::core::QRF::print()'],['../classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53',1,'mlx::core::SVD::print()'],['../classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9',1,'mlx::core::Inverse::print()'],['../classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84',1,'mlx::core::Cholesky::print()'],['../classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84',1,'mlx::core::Eigh::print()'],['../structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bool val)'],['../structmlx_1_1core_1_1_print_formatter.html#a8da448a8adae671b26359341ea514316',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9d750c134a6fbfa8251c5b1d01d73287',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#adbbb9cbff767f9db73c659a0c07ba633',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a520adb07fafd911b22bc24b295e4f6cf',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ab0c702f1ae201e17cd328c9855cf522e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int64_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac59a5137ddd8b32aae057bb9826ee80d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint64_t 
val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac4b7895d1168cfc1a3d1186d8a414d2f',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, float16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ae21005f92bc641f2d657096f5d176a6d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bfloat16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a57af5c32561b95d6ac2a3a1dc4f5d43e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, float val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9e1dc67c9afb0a09966336504790823d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, complex64_t val)']]],
   ['print_5fcomplex_5fconstant_60',['print_complex_constant',['../namespacemlx_1_1core.html#a2b78f270942c6eb185e8045f1c5b4286',1,'mlx::core']]],
   ['print_5fconstant_61',['print_constant',['../namespacemlx_1_1core.html#a7d11b000895d44d183260634f4192d92',1,'mlx::core']]],
   ['print_5ffloat_5fconstant_62',['print_float_constant',['../namespacemlx_1_1core.html#a93a8ac59c644b801ec8881a58368caf2',1,'mlx::core']]],
diff --git a/docs/build/html/search/all_11.js b/docs/build/html/search/all_11.js
index 1d12b31b2..4d6d6dc39 100644
--- a/docs/build/html/search/all_11.js
+++ b/docs/build/html/search/all_11.js
@@ -1,32 +1,33 @@
 var searchData=
 [
   ['q_0',['q',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#adf608e22d0c0397217472408aab52631',1,'mlx::core::scheduler::StreamThread']]],
-  ['qdot_1',['qdot',['../quantized_8h.html#ab364d58ab652e3ad87a8f80910556071',1,'quantized.h']]],
-  ['qdot_5fsafe_2',['qdot_safe',['../quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42',1,'quantized.h']]],
-  ['qmm_5fn_3',['qmm_n',['../quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7',1,'quantized.h']]],
-  ['qmm_5fn_5fimpl_4',['qmm_n_impl',['../quantized_8h.html#a0ba59096494f1001c195312571523ae9',1,'quantized.h']]],
-  ['qmm_5ft_5',['qmm_t',['../quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10',1,'quantized.h']]],
-  ['qmm_5ft_5fimpl_6',['qmm_t_impl',['../quantized_8h.html#af5750a35e8f5462218effba719f7f5b8',1,'quantized.h']]],
-  ['qmv_7',['qmv',['../quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd',1,'quantized.h']]],
-  ['qmv_5ffast_8',['qmv_fast',['../quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f',1,'quantized.h']]],
-  ['qmv_5ffast_5fimpl_9',['qmv_fast_impl',['../quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81',1,'quantized.h']]],
-  ['qmv_5fimpl_10',['qmv_impl',['../quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd',1,'quantized.h']]],
-  ['qmv_5fquad_11',['qmv_quad',['../quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad',1,'quantized.h']]],
-  ['qmv_5fquad_5fimpl_12',['qmv_quad_impl',['../quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef',1,'quantized.h']]],
-  ['qouter_13',['qouter',['../quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58',1,'quantized.h']]],
-  ['qr_14',['qr',['../namespacemlx_1_1core_1_1linalg.html#ae6d97829459353fe3b31c8a0867c0ca2',1,'mlx::core::linalg']]],
-  ['qrf_15',['QRF',['../classmlx_1_1core_1_1_q_r_f.html',1,'mlx::core::QRF'],['../classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983',1,'mlx::core::QRF::QRF()']]],
-  ['quad_5fsize_16',['QUAD_SIZE',['../quantized_8h.html#a803e4d5a1459844ba647aea5b004e133',1,'quantized.h']]],
-  ['quantize_17',['quantize',['../group__ops.html#gab43cc28690da7cdd43b43065adbd31da',1,'mlx::core']]],
-  ['quantized_18',['quantized',['../namespacemlx_1_1core_1_1metal.html#a949f029424218ab5c5588563d2e076f5',1,'mlx::core::metal']]],
-  ['quantized_2eh_19',['quantized.h',['../quantized_8h.html',1,'']]],
-  ['quantized_5fmatmul_20',['quantized_matmul',['../group__ops.html#gabfa4208fb1f9b1cdd0abc563b19175af',1,'mlx::core']]],
-  ['quantizedblockloader_21',['QuantizedBlockLoader',['../struct_quantized_block_loader.html',1,'QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;'],['../struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93',1,'QuantizedBlockLoader::QuantizedBlockLoader()']]],
-  ['quantizedmatmul_22',['QuantizedMatmul',['../classmlx_1_1core_1_1_quantized_matmul.html',1,'mlx::core::QuantizedMatmul'],['../classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c',1,'mlx::core::QuantizedMatmul::QuantizedMatmul()']]],
-  ['query_5fsequence_5flength_23',['QUERY_SEQUENCE_LENGTH',['../struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c',1,'MLXScaledDotProductAttentionParams']]],
-  ['queue_24',['queue',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d',1,'mlx::core::metal::DeviceStream']]],
-  ['quiet_5fnan_25',['quiet_NaN',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['qvm_26',['qvm',['../quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5',1,'quantized.h']]],
-  ['qvm_5fimpl_27',['qvm_impl',['../quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a',1,'quantized.h']]],
-  ['qvm_5fsplit_5fk_28',['qvm_split_k',['../quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8',1,'quantized.h']]]
+  ['q_5fstrides_1',['Q_strides',['../structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563',1,'mlx::steel::AttnParams']]],
+  ['qdot_2',['qdot',['../quantized_8h.html#ab364d58ab652e3ad87a8f80910556071',1,'quantized.h']]],
+  ['qdot_5fsafe_3',['qdot_safe',['../quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42',1,'quantized.h']]],
+  ['ql_4',['qL',['../structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f',1,'mlx::steel::AttnParams']]],
+  ['qmm_5fn_5',['qmm_n',['../quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7',1,'quantized.h']]],
+  ['qmm_5fn_5fimpl_6',['qmm_n_impl',['../quantized_8h.html#a0ba59096494f1001c195312571523ae9',1,'quantized.h']]],
+  ['qmm_5ft_7',['qmm_t',['../quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10',1,'quantized.h']]],
+  ['qmm_5ft_5fimpl_8',['qmm_t_impl',['../quantized_8h.html#af5750a35e8f5462218effba719f7f5b8',1,'quantized.h']]],
+  ['qmv_9',['qmv',['../quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd',1,'quantized.h']]],
+  ['qmv_5ffast_10',['qmv_fast',['../quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f',1,'quantized.h']]],
+  ['qmv_5ffast_5fimpl_11',['qmv_fast_impl',['../quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81',1,'quantized.h']]],
+  ['qmv_5fimpl_12',['qmv_impl',['../quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd',1,'quantized.h']]],
+  ['qmv_5fquad_13',['qmv_quad',['../quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad',1,'quantized.h']]],
+  ['qmv_5fquad_5fimpl_14',['qmv_quad_impl',['../quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef',1,'quantized.h']]],
+  ['qouter_15',['qouter',['../quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58',1,'quantized.h']]],
+  ['qr_16',['qr',['../namespacemlx_1_1core_1_1linalg.html#ae6d97829459353fe3b31c8a0867c0ca2',1,'mlx::core::linalg']]],
+  ['qrf_17',['QRF',['../classmlx_1_1core_1_1_q_r_f.html',1,'mlx::core::QRF'],['../classmlx_1_1core_1_1_q_r_f.html#a44ed2924dc574c4aeb79b1188b5c3983',1,'mlx::core::QRF::QRF()']]],
+  ['quad_5fsize_18',['QUAD_SIZE',['../quantized_8h.html#a803e4d5a1459844ba647aea5b004e133',1,'quantized.h']]],
+  ['quantize_19',['quantize',['../group__ops.html#gab43cc28690da7cdd43b43065adbd31da',1,'mlx::core']]],
+  ['quantized_20',['quantized',['../namespacemlx_1_1core_1_1metal.html#a949f029424218ab5c5588563d2e076f5',1,'mlx::core::metal']]],
+  ['quantized_2eh_21',['quantized.h',['../quantized_8h.html',1,'']]],
+  ['quantized_5fmatmul_22',['quantized_matmul',['../group__ops.html#gabfa4208fb1f9b1cdd0abc563b19175af',1,'mlx::core']]],
+  ['quantizedblockloader_23',['QuantizedBlockLoader',['../struct_quantized_block_loader.html',1,'QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;'],['../struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589',1,'QuantizedBlockLoader::QuantizedBlockLoader()']]],
+  ['quantizedmatmul_24',['QuantizedMatmul',['../classmlx_1_1core_1_1_quantized_matmul.html',1,'mlx::core::QuantizedMatmul'],['../classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c',1,'mlx::core::QuantizedMatmul::QuantizedMatmul()']]],
+  ['queue_25',['queue',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d',1,'mlx::core::metal::DeviceStream']]],
+  ['quiet_5fnan_26',['quiet_NaN',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['qvm_27',['qvm',['../quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5',1,'quantized.h']]],
+  ['qvm_5fimpl_28',['qvm_impl',['../quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a',1,'quantized.h']]],
+  ['qvm_5fsplit_5fk_29',['qvm_split_k',['../quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8',1,'quantized.h']]]
 ];
diff --git a/docs/build/html/search/all_12.js b/docs/build/html/search/all_12.js
index 2f420b984..c8a65d504 100644
--- a/docs/build/html/search/all_12.js
+++ b/docs/build/html/search/all_12.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['r_0',['r',['../structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692',1,'pocketfft::detail::cmplx']]],
+  ['r_0',['r',['../structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692',1,'pocketfft::detail::cmplx::r'],['../structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe',1,'mlx::steel::Shape2D::r']]],
   ['r2c_1',['r2c',['../namespacepocketfft_1_1detail.html#a4e46762466d399e35b79c324cfe21616',1,'pocketfft::detail::r2c(const shape_t &amp;shape_in, const stride_t &amp;stride_in, const stride_t &amp;stride_out, size_t axis, bool forward, const T *data_in, std::complex&lt; T &gt; *data_out, T fct, size_t nthreads=1)'],['../namespacepocketfft_1_1detail.html#a454179497c44714d4b7425f116468c17',1,'pocketfft::detail::r2c(const shape_t &amp;shape_in, const stride_t &amp;stride_in, const stride_t &amp;stride_out, const shape_t &amp;axes, bool forward, const T *data_in, std::complex&lt; T &gt; *data_out, T fct, size_t nthreads=1)']]],
   ['r2h_2',['r2h',['../structpocketfft_1_1detail_1_1_exec_r2_r.html#a925b398c8e1868614ce9eaf381d02b7e',1,'pocketfft::detail::ExecR2R']]],
   ['r2r_5ffftpack_3',['r2r_fftpack',['../namespacepocketfft_1_1detail.html#a1ccca4cbbc6150d65620e2f9cdff62ac',1,'pocketfft::detail']]],
@@ -120,12 +120,15 @@ var searchData=
   ['round_117',['Round',['../structmlx_1_1core_1_1detail_1_1_round.html',1,'mlx::core::detail::Round'],['../classmlx_1_1core_1_1_round.html',1,'mlx::core::Round'],['../struct_round.html',1,'Round'],['../classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde',1,'mlx::core::Round::Round()']]],
   ['round_118',['round',['../namespacemetal.html#a46c667e169ff9d51a9204a045305442f',1,'metal::round()'],['../namespacemetal_1_1fast.html#a4cb687257a004726d49e496417eaa40f',1,'metal::fast::round()'],['../namespacemetal_1_1precise.html#a5295ab08055d12534cc3775da855ac12',1,'metal::precise::round()'],['../group__ops.html#ga2d74d43f007a069384e89d8416525331',1,'mlx::core::round(const array &amp;a, int decimals, StreamOrDevice s={})'],['../group__ops.html#gaf18fb7e98bf8cf3b7fbc5e64c988a95b',1,'mlx::core::round(const array &amp;a, StreamOrDevice s={})']]],
   ['round_5ferror_119',['round_error',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['row_5fcontiguous_120',['row_contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#a3170fa381dc7a90f6eabcc029bdf9bfd',1,'mlx::core::array::Flags::row_contiguous'],['../struct_indices.html#a255e340a39c6ac28ef2c232b106f85d1',1,'Indices::row_contiguous']]],
-  ['row_5freduce_5fgeneral_5fdispatch_121',['row_reduce_general_dispatch',['../namespacemlx_1_1core.html#ab1eeca8ec6fa31819ee108fa6ed2c41b',1,'mlx::core']]],
-  ['row_5freduce_5flooped_122',['row_reduce_looped',['../reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae',1,'reduce_row.h']]],
-  ['row_5freduce_5fsimple_123',['row_reduce_simple',['../reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b',1,'reduce_row.h']]],
-  ['row_5freduce_5fsmall_124',['row_reduce_small',['../reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da',1,'reduce_row.h']]],
-  ['rsqrt_125',['Rsqrt',['../structmlx_1_1core_1_1detail_1_1_rsqrt.html',1,'mlx::core::detail::Rsqrt'],['../struct_rsqrt.html',1,'Rsqrt']]],
-  ['rsqrt_126',['rsqrt',['../namespacemetal.html#a1cf4b605c0aa7ff5bfe5e979a16f5157',1,'metal::rsqrt()'],['../namespacemetal_1_1fast.html#aa62097c750f1e4b69d09277f19976ab1',1,'metal::fast::rsqrt()'],['../namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac',1,'metal::precise::rsqrt()'],['../group__ops.html#ga102f23aa0b0c3d3296a321c694617aa1',1,'mlx::core::rsqrt()']]],
-  ['run_127',['run',['../struct_g_e_m_v_kernel.html#ac4a7b5011a0ea938ab1949bb1767fc1a',1,'GEMVKernel::run()'],['../struct_g_e_m_v_t_kernel.html#a5d68656832de892f33db939005713927',1,'GEMVTKernel::run()'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5',1,'mlx::steel::GEMMKernel::run()']]]
+  ['row_5fbin_5fop_120',['row_bin_op',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::row_bin_op()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2',1,'mlx::steel::MMATile::row_bin_op()']]],
+  ['row_5fcontiguous_121',['row_contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#a3170fa381dc7a90f6eabcc029bdf9bfd',1,'mlx::core::array::Flags::row_contiguous'],['../struct_indices.html#a255e340a39c6ac28ef2c232b106f85d1',1,'Indices::row_contiguous']]],
+  ['row_5ffrag_5ftype_122',['row_frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
+  ['row_5freduce_123',['row_reduce',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::row_reduce()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88',1,'mlx::steel::MMATile::row_reduce()']]],
+  ['row_5freduce_5fgeneral_5fdispatch_124',['row_reduce_general_dispatch',['../namespacemlx_1_1core.html#ab1eeca8ec6fa31819ee108fa6ed2c41b',1,'mlx::core']]],
+  ['row_5freduce_5flooped_125',['row_reduce_looped',['../reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49',1,'reduce_row.h']]],
+  ['row_5freduce_5fsimple_126',['row_reduce_simple',['../reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf',1,'reduce_row.h']]],
+  ['row_5freduce_5fsmall_127',['row_reduce_small',['../reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e',1,'reduce_row.h']]],
+  ['rsqrt_128',['Rsqrt',['../structmlx_1_1core_1_1detail_1_1_rsqrt.html',1,'mlx::core::detail::Rsqrt'],['../struct_rsqrt.html',1,'Rsqrt']]],
+  ['rsqrt_129',['rsqrt',['../namespacemetal.html#a1cf4b605c0aa7ff5bfe5e979a16f5157',1,'metal::rsqrt()'],['../namespacemetal_1_1fast.html#aa62097c750f1e4b69d09277f19976ab1',1,'metal::fast::rsqrt()'],['../namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac',1,'metal::precise::rsqrt()'],['../group__ops.html#ga102f23aa0b0c3d3296a321c694617aa1',1,'mlx::core::rsqrt()']]],
+  ['run_130',['run',['../struct_g_e_m_v_kernel.html#ac4a7b5011a0ea938ab1949bb1767fc1a',1,'GEMVKernel::run()'],['../struct_g_e_m_v_t_kernel.html#a5d68656832de892f33db939005713927',1,'GEMVTKernel::run()'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5',1,'mlx::steel::GEMMKernel::run(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5',1,'mlx::steel::GEMMKernel::run(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)']]]
 ];
diff --git a/docs/build/html/search/all_13.js b/docs/build/html/search/all_13.js
index 38e2a2c7f..28a0076aa 100644
--- a/docs/build/html/search/all_13.js
+++ b/docs/build/html/search/all_13.js
@@ -6,193 +6,200 @@ var searchData=
   ['save_5fgguf_3',['save_gguf',['../namespacemlx_1_1core.html#a8bcc29ca8846ec99dce333df4a34dc5f',1,'mlx::core']]],
   ['save_5fsafetensors_4',['save_safetensors',['../namespacemlx_1_1core.html#a9f158db20c2405557f3ebc397e876de8',1,'mlx::core::save_safetensors(std::shared_ptr&lt; io::Writer &gt; in_stream, std::unordered_map&lt; std::string, array &gt;, std::unordered_map&lt; std::string, std::string &gt; metadata={})'],['../namespacemlx_1_1core.html#a21e256d852d587bcdc0827831b2c5c16',1,'mlx::core::save_safetensors(std::string file, std::unordered_map&lt; std::string, array &gt;, std::unordered_map&lt; std::string, std::string &gt; metadata={})']]],
   ['scalar_5',['Scalar',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337af60357a8d17e45793298323f1b372a74',1,'mlx::core']]],
-  ['scale_6',['scale',['../struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b',1,'ScaleOp']]],
+  ['scale_6',['scale',['../struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b',1,'ScaleOp::scale'],['../struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6',1,'TransformScale::scale'],['../structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826',1,'mlx::steel::AttnParams::scale']]],
   ['scaled_5fdot_5fproduct_5fattention_7',['scaled_dot_product_attention',['../namespacemlx_1_1core_1_1fast.html#a3663b50265b0a9c0cca2b5376852e059',1,'mlx::core::fast']]],
-  ['scaled_5fdot_5fproduct_5fattention_5fparams_2eh_8',['scaled_dot_product_attention_params.h',['../scaled__dot__product__attention__params_8h.html',1,'']]],
-  ['scaleddotproductattention_9',['ScaledDotProductAttention',['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html',1,'mlx::core::fast::ScaledDotProductAttention'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a6a904c377fffc3c193102a3123f5e706',1,'mlx::core::fast::ScaledDotProductAttention::ScaledDotProductAttention()']]],
-  ['scaleop_10',['ScaleOp',['../struct_scale_op.html',1,'']]],
-  ['scales_11',['scales',['../struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf',1,'QuantizedBlockLoader']]],
-  ['scan_12',['Scan',['../classmlx_1_1core_1_1_scan.html',1,'mlx::core::Scan'],['../classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087',1,'mlx::core::Scan::Scan()']]],
-  ['scan_13',['scan',['../namespacemlx_1_1core_1_1metal.html#a81c2cf124b0803098a54a78f8f6873a6',1,'mlx::core::metal']]],
-  ['scan_2eh_14',['scan.h',['../scan_8h.html',1,'']]],
-  ['scatter_15',['Scatter',['../classmlx_1_1core_1_1_scatter.html',1,'mlx::core::Scatter'],['../classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3',1,'mlx::core::Scatter::Scatter()']]],
-  ['scatter_16',['scatter',['../namespacemlx_1_1core_1_1metal.html#a32e902c6cd6d35fcc3119ed6685a170f',1,'mlx::core::metal::scatter()'],['../group__ops.html#gad438be8f90bae9d37c6853b8f4225d61',1,'mlx::core::scatter(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gac2c2b379a3ce959dbe1c4a68f112edfe',1,'mlx::core::scatter(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
-  ['scatter_2eh_17',['scatter.h',['../scatter_8h.html',1,'']]],
-  ['scatter_5fadd_18',['scatter_add',['../group__ops.html#gacd14c2b5cfebf343fc2d672722f8d174',1,'mlx::core::scatter_add(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gac13318518e5703f1273c5366eb523a5a',1,'mlx::core::scatter_add(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
-  ['scatter_5fimpl_19',['scatter_impl',['../scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1',1,'scatter.h']]],
-  ['scatter_5fkernels_20',['scatter_kernels',['../jit_2indexing_8h.html#a768c949cd650a44c6b402fc1440c1a56',1,'indexing.h']]],
-  ['scatter_5fmax_21',['scatter_max',['../group__ops.html#ga05881a4157cd113c9392d168a79e6673',1,'mlx::core::scatter_max(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga9adda5f9202bb3486e4d9e1114e3a56f',1,'mlx::core::scatter_max(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
-  ['scatter_5fmin_22',['scatter_min',['../group__ops.html#ga0ca16b7579dfc899f3f7fd40245ba7c5',1,'mlx::core::scatter_min(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga51fa762a997c243ca7a19e1ed3e83199',1,'mlx::core::scatter_min(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
-  ['scatter_5fprod_23',['scatter_prod',['../group__ops.html#ga3708b5bcb61e2c63d213c4ce6ad0ffc0',1,'mlx::core::scatter_prod(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gaf83c53c453faa9083ba27e4b97539339',1,'mlx::core::scatter_prod(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
-  ['scheduled_24',['scheduled',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078af8a6f8eed2395ab89a758dec434393ae',1,'mlx::core::array']]],
-  ['scheduler_25',['Scheduler',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html',1,'mlx::core::scheduler::Scheduler'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a3ae42aed78a2200e9d02776fcd2316ba',1,'mlx::core::scheduler::Scheduler::Scheduler()'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a61a74e3628899e66dde600e24a750648',1,'mlx::core::scheduler::Scheduler::Scheduler(const Scheduler &amp;)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ac3f77b7c93220dadd0b3bb2e903b7059',1,'mlx::core::scheduler::Scheduler::Scheduler(Scheduler &amp;&amp;)=delete']]],
-  ['scheduler_26',['scheduler',['../namespacemlx_1_1core_1_1scheduler.html#ae856e468c2f7c8f8ec672522cc13730b',1,'mlx::core::scheduler']]],
-  ['scheduler_2eh_27',['scheduler.h',['../scheduler_8h.html',1,'']]],
-  ['sdpa_5fvector_28',['sdpa_vector',['../sdpa__vector_8h.html#a4bf36f16e16c1c62d9b243573568e5ae',1,'sdpa_vector.h']]],
-  ['sdpa_5fvector_2eh_29',['sdpa_vector.h',['../sdpa__vector_8h.html',1,'']]],
-  ['seed_30',['seed',['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a9f19c5da2031cba50d0ff996924347d8',1,'mlx::core::random::KeySequence::seed()'],['../namespacemlx_1_1core_1_1random.html#ac4ad325b613257306df74595d3d0e23b',1,'mlx::core::random::seed()']]],
-  ['seek_31',['seek',['../structmlx_1_1core_1_1_contiguous_iterator.html#a24719ee9e8667885d29c2ad74445520c',1,'mlx::core::ContiguousIterator::seek()'],['../classmlx_1_1core_1_1io_1_1_reader.html#acea55078bd39ccaa27a9a36f17a39cd1',1,'mlx::core::io::Reader::seek()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a9c1716dda53aa36faea9c8fb1a3e34d4',1,'mlx::core::io::Writer::seek()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a673c16b669f3cee13f387b7b0a1f39f7',1,'mlx::core::io::ParallelFileReader::seek()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9646f4ea048ae58719daeb588e2de433',1,'mlx::core::io::FileWriter::seek()']]],
-  ['select_32',['Select',['../structmlx_1_1core_1_1detail_1_1_select.html',1,'mlx::core::detail::Select'],['../classmlx_1_1core_1_1_select.html',1,'mlx::core::Select'],['../struct_select.html',1,'Select'],['../classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9',1,'mlx::core::Select::Select()']]],
-  ['send_33',['Send',['../classmlx_1_1core_1_1distributed_1_1_send.html',1,'mlx::core::distributed::Send'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a2481dd876b14d4a13ac466cbca9c4eac',1,'mlx::core::distributed::Send::Send()']]],
-  ['send_34',['send',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#abf33511660ac71df5fc92f2aad6c6e08',1,'mlx::core::distributed::detail::send()'],['../namespacemlx_1_1core_1_1distributed.html#a5a8360edaa3a528a3927fce4d2cf1777',1,'mlx::core::distributed::send()']]],
-  ['set_35',['Set',['../structpocketfft_1_1detail_1_1cmplx.html#a647fece372b64b13c4a7e5877d09a807',1,'pocketfft::detail::cmplx::Set(T r_, T i_)'],['../structpocketfft_1_1detail_1_1cmplx.html#a447d26b2e07f6e45f29d865e906c0a98',1,'pocketfft::detail::cmplx::Set(T r_)']]],
-  ['set_5fcache_5flimit_36',['set_cache_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#af392bced29d9e4e3f1a7cc4725d83764',1,'mlx::core::metal::MetalAllocator::set_cache_limit()'],['../namespacemlx_1_1core_1_1metal.html#ab09c9b60f1e886ab859e6a066c9a5b9d',1,'mlx::core::metal::set_cache_limit()']]],
-  ['set_5fcompile_5fmode_37',['set_compile_mode',['../namespacemlx_1_1core.html#a49445a55f976c4397f25ea18e1e92bef',1,'mlx::core']]],
-  ['set_5fdata_38',['set_data',['../classmlx_1_1core_1_1array.html#a631acd8e318189640b8338f9ae1a554d',1,'mlx::core::array::set_data(allocator::Buffer buffer, deleter_t d=allocator::free)'],['../classmlx_1_1core_1_1array.html#a2112af5fba37b3135cd2e6ac9e851606',1,'mlx::core::array::set_data(allocator::Buffer buffer, size_t data_size, std::vector&lt; size_t &gt; strides, Flags flags, deleter_t d=allocator::free)']]],
-  ['set_5fdefault_5fdevice_39',['set_default_device',['../namespacemlx_1_1core.html#a312a2de41367fe52caeaf8c0f596a120',1,'mlx::core']]],
-  ['set_5fdefault_5fstream_40',['set_default_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a6d15314ac9cf25efc9bd1278de9a66bb',1,'mlx::core::scheduler::Scheduler::set_default_stream()'],['../namespacemlx_1_1core.html#af35a2b06517d8bb7dbb469692b4f841c',1,'mlx::core::set_default_stream()']]],
-  ['set_5finput_5farray_41',['set_input_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4',1,'mlx::core::metal::CommandEncoder']]],
-  ['set_5fmemory_5flimit_42',['set_memory_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a179e3127ef9377ce54295f771c34ba1b',1,'mlx::core::metal::MetalAllocator::set_memory_limit()'],['../namespacemlx_1_1core_1_1metal.html#a3fb2c4a237fa4bfdff798156146c4937',1,'mlx::core::metal::set_memory_limit()']]],
-  ['set_5foutput_5farray_43',['set_output_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522',1,'mlx::core::metal::CommandEncoder']]],
-  ['set_5fresidency_5fset_44',['set_residency_set',['../classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f',1,'mlx::core::metal::Device']]],
-  ['set_5fsiblings_45',['set_siblings',['../classmlx_1_1core_1_1array.html#a8fccbe7a4edfd8cca168161124e263b1',1,'mlx::core::array']]],
-  ['set_5fstatus_46',['set_status',['../classmlx_1_1core_1_1array.html#a63598018999b49f1340b183cb303f05c',1,'mlx::core::array']]],
-  ['set_5ftracer_47',['set_tracer',['../classmlx_1_1core_1_1array.html#af26e6be1a9e6239471a4c24310c0c7c8',1,'mlx::core::array']]],
-  ['set_5fvalue_48',['set_value',['../classmlx_1_1core_1_1_event.html#a0d077b11f4b28f882b42440b7ac6d40d',1,'mlx::core::Event']]],
-  ['set_5fvector_5fbytes_49',['set_vector_bytes',['../namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf',1,'mlx::core::set_vector_bytes(CommandEncoder &amp;enc, const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)'],['../namespacemlx_1_1core.html#ae309cb543dfb0239cfccc53a8ad0408e',1,'mlx::core::set_vector_bytes(CommandEncoder &amp;enc, const std::vector&lt; T &gt; &amp;vec, int idx)']]],
-  ['set_5fwired_5flimit_50',['set_wired_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a84fa0347da18055bc13ba0a5c4b57253',1,'mlx::core::metal::MetalAllocator::set_wired_limit()'],['../namespacemlx_1_1core_1_1metal.html#a31eab4828d31d292bc84e07b0d961e1e',1,'mlx::core::metal::set_wired_limit()']]],
-  ['shape_51',['shape',['../structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae',1,'mlx::core::ReductionPlan::shape'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a',1,'mlx::core::fast::CustomKernelShapeInfo::shape'],['../classpocketfft_1_1detail_1_1arr__info.html#accada8146cb8d3ab7facb4c1e3413ec0',1,'pocketfft::detail::arr_info::shape() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac601c660c64a4c252aa8be4ae7dfa7a8',1,'pocketfft::detail::arr_info::shape(size_t i) const'],['../classmlx_1_1core_1_1array.html#a4a2a2c8a4a5beafd723fc13f2055d55d',1,'mlx::core::array::shape() const'],['../classmlx_1_1core_1_1array.html#a51ed0c45666264dc172d06fba159eb8f',1,'mlx::core::array::shape(int dim) const']]],
-  ['shape_5ft_52',['shape_t',['../namespacepocketfft_1_1detail.html#a885ee37fcf564a268a5c8ca9ea8603e1',1,'pocketfft::detail']]],
-  ['shapes_53',['shapes',['../struct_indices.html#a5ab170f1a77636180889ddfffd4f7d2f',1,'Indices']]],
-  ['shapes_5fwithout_5freduction_5faxes_54',['shapes_without_reduction_axes',['../namespacemlx_1_1core.html#a44c3ea6db6553c3f6552b9ba64a69494',1,'mlx::core']]],
-  ['shared_5fbuffer_5fslice_55',['shared_buffer_slice',['../namespacemlx_1_1core.html#aea2a6a4eddfd4cfac89d20786059de2a',1,'mlx::core']]],
-  ['shp_56',['shp',['../classpocketfft_1_1detail_1_1arr__info.html#a2467e9e01de1ba4d7cd28c1af783da8d',1,'pocketfft::detail::arr_info']]],
-  ['shutdown_57',['shutdown',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a0c7c29290fde806031c497f24c4ad411',1,'pocketfft::detail::threading::thread_pool']]],
-  ['siblings_58',['siblings',['../classmlx_1_1core_1_1array.html#acf80fde8f743f65ad5b4be69fcb7a74d',1,'mlx::core::array::siblings() const'],['../classmlx_1_1core_1_1array.html#a7263f23e70a580a9bc2129fbcde36e6c',1,'mlx::core::array::siblings()']]],
-  ['sigmoid_59',['Sigmoid',['../structmlx_1_1core_1_1detail_1_1_sigmoid.html',1,'mlx::core::detail::Sigmoid'],['../classmlx_1_1core_1_1_sigmoid.html',1,'mlx::core::Sigmoid'],['../struct_sigmoid.html',1,'Sigmoid'],['../classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b',1,'mlx::core::Sigmoid::Sigmoid()']]],
-  ['sigmoid_60',['sigmoid',['../group__ops.html#ga708abf8f79609cd6831db7c38cafac0e',1,'mlx::core']]],
-  ['sign_61',['Sign',['../structmlx_1_1core_1_1detail_1_1_sign.html',1,'mlx::core::detail::Sign'],['../classmlx_1_1core_1_1_sign.html',1,'mlx::core::Sign'],['../struct_sign.html',1,'Sign'],['../classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763',1,'mlx::core::Sign::Sign()']]],
-  ['sign_62',['sign',['../group__ops.html#ga20f1a1a8c0cd6206485f9363f3915faa',1,'mlx::core']]],
-  ['signal_63',['signal',['../classmlx_1_1core_1_1_event.html#a65a858445506a61be5889ae0e3651b89',1,'mlx::core::Event']]],
-  ['signaling_5fnan_64',['signaling_NaN',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['signedinteger_65',['signedinteger',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2daed58b4631ff157bec9e35ed1182d2c10',1,'mlx::core::Dtype::signedinteger'],['../namespacemlx_1_1core.html#a24e1618af591d737d73729665e868001',1,'mlx::core::signedinteger']]],
-  ['simd_5fbroadcast_66',['simd_broadcast',['../namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0',1,'metal']]],
-  ['simd_5fexclusive_5fscan_67',['simd_exclusive_scan',['../struct_cum_prod_3_01bool_01_4.html#a1a86e9398bae24182b7be0a6577bf223',1,'CumProd&lt; bool &gt;::simd_exclusive_scan()'],['../struct_cum_max.html#ae11b67aa6c998e9a01615b2a79af4403',1,'CumMax::simd_exclusive_scan()'],['../struct_cum_min.html#a83e65017ff33018b585c043fb803773b',1,'CumMin::simd_exclusive_scan()']]],
-  ['simd_5fmax_68',['simd_max',['../namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49',1,'metal']]],
-  ['simd_5fmin_69',['simd_min',['../namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b',1,'metal']]],
-  ['simd_5fprefix_5fexclusive_5fproduct_70',['simd_prefix_exclusive_product',['../namespacemetal.html#a5ca40242390b632f737e29636829b2e4',1,'metal']]],
-  ['simd_5fprefix_5fexclusive_5fsum_71',['simd_prefix_exclusive_sum',['../namespacemetal.html#abfbb70c7471f28bf7ff36a612ad014b2',1,'metal']]],
-  ['simd_5fprefix_5finclusive_5fproduct_72',['simd_prefix_inclusive_product',['../namespacemetal.html#a6ca6a7e1996228fa536e969e9e45c446',1,'metal']]],
-  ['simd_5fprefix_5finclusive_5fsum_73',['simd_prefix_inclusive_sum',['../namespacemetal.html#a567acb18199ac0107712eb8cb8aeb8e9',1,'metal']]],
-  ['simd_5fproduct_74',['simd_product',['../namespacemetal.html#ac6e883a04e2265a9790d7db76059e1b4',1,'metal']]],
-  ['simd_5fscan_75',['simd_scan',['../struct_cum_prod_3_01bool_01_4.html#abeb5ec4237b330e7219f4e881cf10d7a',1,'CumProd&lt; bool &gt;::simd_scan()'],['../struct_cum_max.html#adc9ec8bb09b4433d4c2f03022c43d781',1,'CumMax::simd_scan()'],['../struct_cum_min.html#a0a1005d91b1c90e90e2c6dbd6c296649',1,'CumMin::simd_scan()']]],
-  ['simd_5fshuffle_76',['simd_shuffle',['../namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4',1,'metal::simd_shuffle()'],['../backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2',1,'simd_shuffle(uint64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a3bdbdfeb7a1dde40cd3ce1df8d9213b5',1,'simd_shuffle(int64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab4cbcdb054f9165130da91a3334da0cf',1,'simd_shuffle(bool data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab8175b66bcc080fb89f738143568c30b',1,'simd_shuffle(complex64_t data, uint16_t lane):&#160;utils.h']]],
-  ['simd_5fshuffle_5fand_5ffill_5fdown_77',['simd_shuffle_and_fill_down',['../namespacemetal.html#ae29a06f0eac636ad7af21dea5b04938b',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a0ee6239fa29a5f9ee0201e0dc5ddc8e0',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta)']]],
-  ['simd_5fshuffle_5fand_5ffill_5fup_78',['simd_shuffle_and_fill_up',['../namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a5138d5cdc18139e135707916a243cd8e',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta)'],['../backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4',1,'simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a7bb56415c5412a6a26f70a990915f064',1,'simd_shuffle_and_fill_up(int64_t data, int64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad55bd473647f2c6c68e65e5312c132d1',1,'simd_shuffle_and_fill_up(bool data, bool filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a94e02a6ae8c39cbf4cb23aa44df9dbd5',1,'simd_shuffle_and_fill_up(complex64_t data, complex64_t filling, uint16_t delta):&#160;utils.h']]],
-  ['simd_5fshuffle_5fdown_79',['simd_shuffle_down',['../namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c',1,'metal::simd_shuffle_down()'],['../backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c',1,'simd_shuffle_down(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a0c1e4d782fcc56e1ab5565cef12430dd',1,'simd_shuffle_down(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a48ae83a8caf5c74810df60b6c6cdb062',1,'simd_shuffle_down(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad9a671a5f9aaa729ae7a77026f16bcb0',1,'simd_shuffle_down(complex64_t data, uint16_t delta):&#160;utils.h']]],
-  ['simd_5fshuffle_5frotate_5fdown_80',['simd_shuffle_rotate_down',['../namespacemetal.html#a4bb203647a421032db47e73cd649841b',1,'metal']]],
-  ['simd_5fshuffle_5frotate_5fup_81',['simd_shuffle_rotate_up',['../namespacemetal.html#a729b22077d6c944491a6027c18ea80c9',1,'metal']]],
-  ['simd_5fshuffle_5fup_82',['simd_shuffle_up',['../namespacemetal.html#afe81c5fbde3f4890458b081909242c55',1,'metal::simd_shuffle_up()'],['../backend_2metal_2kernels_2utils_8h.html#a39e436e0a942912266aae7e0bd82d7c0',1,'simd_shuffle_up(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a617f3857caf33c569afa6148135f8b7a',1,'simd_shuffle_up(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ae0f5c42020275a588234e69f1eb7a485',1,'simd_shuffle_up(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a92b455bac6a23af51c35ea83de2383eb',1,'simd_shuffle_up(complex64_t data, uint16_t delta):&#160;utils.h']]],
-  ['simd_5fshuffle_5fxor_83',['simd_shuffle_xor',['../namespacemetal.html#a5017efc9605e069cfb507137cd1a1852',1,'metal']]],
-  ['simd_5fsize_84',['SIMD_SIZE',['../quantized_8h.html#a62969a218d93680f5e35d0c61b160b99',1,'quantized.h']]],
-  ['simd_5fsize_85',['simd_size',['../backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3',1,'ops.h']]],
-  ['simd_5fsum_86',['simd_sum',['../namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5',1,'metal']]],
-  ['simd_5fxor_87',['simd_xor',['../namespacemetal.html#a1308decbf2d5c33d34d6be523ea1c30f',1,'metal']]],
-  ['simple_5fiter_88',['simple_iter',['../classpocketfft_1_1detail_1_1simple__iter.html',1,'pocketfft::detail::simple_iter'],['../classpocketfft_1_1detail_1_1simple__iter.html#a1e455c615825bebd5f1f62665027b398',1,'pocketfft::detail::simple_iter::simple_iter()']]],
-  ['simplevalueandgradfn_89',['SimpleValueAndGradFn',['../namespacemlx_1_1core.html#a2689b8f1181648cb1685204fea9f3066',1,'mlx::core']]],
-  ['sin_90',['Sin',['../structmlx_1_1core_1_1detail_1_1_sin.html',1,'mlx::core::detail::Sin'],['../classmlx_1_1core_1_1_sin.html',1,'mlx::core::Sin'],['../struct_sin.html',1,'Sin'],['../classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea',1,'mlx::core::Sin::Sin()']]],
-  ['sin_91',['sin',['../namespacepocketfft_1_1detail.html#a07745f4a069f811859308281b2982258',1,'pocketfft::detail::sin()'],['../namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c',1,'metal::sin()'],['../namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270',1,'metal::fast::sin()'],['../namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c',1,'metal::precise::sin()'],['../group__ops.html#gaebf0a73ad3732fba39df37826c235692',1,'mlx::core::sin()']]],
-  ['sincos_5f2pibyn_92',['sincos_2pibyn',['../classpocketfft_1_1detail_1_1sincos__2pibyn.html',1,'pocketfft::detail::sincos_2pibyn&lt; T &gt;'],['../classpocketfft_1_1detail_1_1sincos__2pibyn.html#a88518f2182d854c557edacd4ab8cbc40',1,'pocketfft::detail::sincos_2pibyn::sincos_2pibyn()']]],
-  ['sinh_93',['Sinh',['../structmlx_1_1core_1_1detail_1_1_sinh.html',1,'mlx::core::detail::Sinh'],['../classmlx_1_1core_1_1_sinh.html',1,'mlx::core::Sinh'],['../struct_sinh.html',1,'Sinh'],['../classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96',1,'mlx::core::Sinh::Sinh()']]],
-  ['sinh_94',['sinh',['../namespacemetal.html#a83ba4235ae350ab8880a9df09158620b',1,'metal::sinh()'],['../namespacemetal_1_1fast.html#a990d90b3440e38d1fb4ff5065c6c189b',1,'metal::fast::sinh()'],['../namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c',1,'metal::precise::sinh()'],['../group__ops.html#gaf532375c6563dbd6e329bdedf0224dd7',1,'mlx::core::sinh()']]],
-  ['sinpi_95',['sinpi',['../namespacemetal.html#ae9655f7fa2ba6c0625ca25fbb278e269',1,'metal::sinpi()'],['../namespacemetal_1_1fast.html#ab07a32fe544aa304577d29e0251e87b2',1,'metal::fast::sinpi()'],['../namespacemetal_1_1precise.html#a78b17dab93519d9c82c2575dafec49c9',1,'metal::precise::sinpi()']]],
-  ['size_96',['size',['../classpocketfft_1_1detail_1_1arr.html#a95bca00060957f540ff25b69632c6952',1,'pocketfft::detail::arr::size()'],['../classpocketfft_1_1detail_1_1arr__info.html#a003a7106f7fa59a3c55ac1f0116313a5',1,'pocketfft::detail::arr_info::size()'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a2adf9a9c968f113dde830cc0dc27dcc6',1,'mlx::core::allocator::Allocator::size()'],['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#aafa92e8310db089b1ac72b840777e26b',1,'mlx::core::allocator::CommonAllocator::size()'],['../classmlx_1_1core_1_1array.html#a598f87161926d9e0b516860f0ea2c8f6',1,'mlx::core::array::size()'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a51f6587e8065be16f0418ca42a796e05',1,'mlx::core::metal::MetalAllocator::size()'],['../structmlx_1_1core_1_1distributed_1_1_group.html#abd96a09217e3d1bcc522888257d22cef',1,'mlx::core::distributed::Group::size()'],['../structmlx_1_1core_1_1_dtype.html#ab54051563d85212c7f0f049166bc9971',1,'mlx::core::Dtype::size()']]],
-  ['size_5fof_97',['size_of',['../namespacemlx_1_1core.html#add4794cc0ffe5d717fc146084a235d95',1,'mlx::core']]],
-  ['slice_98',['Slice',['../classmlx_1_1core_1_1_slice.html',1,'mlx::core::Slice'],['../classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f',1,'mlx::core::Slice::Slice()']]],
-  ['slice_99',['slice',['../group__ops.html#gad66135407dbb41b3c5d2cdfd51226c21',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#gaa97ce866c5e38b92b093e9321affcc57',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
-  ['slice_5fgpu_100',['slice_gpu',['../namespacemlx_1_1core.html#a59048c5ff114c101a496bf33f62e3de9',1,'mlx::core']]],
-  ['slice_5fupdate_101',['slice_update',['../group__ops.html#ga3875660e4ce2c8add8bfcf8144078708',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#ga03ffbbb4d989a463ef43f41ebf7eabef',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
-  ['sliceupdate_102',['SliceUpdate',['../classmlx_1_1core_1_1_slice_update.html',1,'mlx::core::SliceUpdate'],['../classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990',1,'mlx::core::SliceUpdate::SliceUpdate()']]],
-  ['slicing_2eh_103',['slicing.h',['../common_2slicing_8h.html',1,'(Global Namespace)'],['../metal_2slicing_8h.html',1,'(Global Namespace)']]],
-  ['sm_104',['sm',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3',1,'mlx::steel::BlockMMA']]],
-  ['sn_105',['sn',['../structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a',1,'mlx::steel::BlockMMA']]],
-  ['softmax_106',['Softmax',['../classmlx_1_1core_1_1_softmax.html',1,'mlx::core::Softmax'],['../classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb',1,'mlx::core::Softmax::Softmax()']]],
-  ['softmax_107',['softmax',['../namespacemlx_1_1core_1_1metal.html#a4fe937c2c584fd646926057f31d54ca6',1,'mlx::core::metal::softmax()'],['../group__ops.html#ga7e9bb08b43c8fd0444b7d3c9e09dc1c6',1,'mlx::core::softmax(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga1ae3614d07d873892a530d14c3857d0b',1,'mlx::core::softmax(const array &amp;a, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga06f570d73716a24303e6de3aaba4457b',1,'mlx::core::softmax(const array &amp;a, int axis, bool precise=false, StreamOrDevice s={})']]],
-  ['softmax_2eh_108',['softmax.h',['../jit_2softmax_8h.html',1,'(Global Namespace)'],['../kernels_2softmax_8h.html',1,'(Global Namespace)']]],
-  ['softmax_5fexp_109',['softmax_exp',['../kernels_2softmax_8h.html#a440d4031ee5e86159a4dd715e44a438b',1,'softmax.h']]],
-  ['softmax_5fkernels_110',['softmax_kernels',['../jit_2softmax_8h.html#a1cbfb210a9a765c6620e9f1247ccef12',1,'softmax.h']]],
-  ['softmax_5flooped_111',['softmax_looped',['../kernels_2softmax_8h.html#a8c47b0924ebfeebcca25f3dd17373276',1,'softmax.h']]],
-  ['softmax_5fn_5freads_112',['SOFTMAX_N_READS',['../defines_8h.html#a722995df24286b27b7da3d74b73f768d',1,'defines.h']]],
-  ['softmax_5fsingle_5frow_113',['softmax_single_row',['../kernels_2softmax_8h.html#a815fe70f879f318e5d6e99acf043f52b',1,'softmax.h']]],
-  ['sort_114',['Sort',['../classmlx_1_1core_1_1_sort.html',1,'mlx::core::Sort'],['../classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44',1,'mlx::core::Sort::Sort()']]],
-  ['sort_115',['sort',['../struct_thread_sort.html#ad9ab3e6b47f7e9b91c0f3b773596986d',1,'ThreadSort::sort()'],['../struct_block_merge_sort.html#acc970f5eb963f7f2010f5ae5ea8b8bc0',1,'BlockMergeSort::sort()'],['../namespacemlx_1_1core_1_1metal.html#ab77c9a9ecaeeab8c66b712862777c24b',1,'mlx::core::metal::sort()'],['../group__ops.html#ga7fb616054665b3c2d61fa234f501f079',1,'mlx::core::sort(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaae1bc47aa737f705d0e5884270063fea',1,'mlx::core::sort(const array &amp;a, int axis, StreamOrDevice s={})']]],
-  ['sort_2eh_116',['sort.h',['../sort_8h.html',1,'']]],
-  ['special_5fmul_117',['special_mul',['../structpocketfft_1_1detail_1_1cmplx.html#a2e79f5c73c1d926361ad126cf57c8874',1,'pocketfft::detail::cmplx::special_mul()'],['../namespacepocketfft_1_1detail.html#a8da1f3d4a0b712a0285529f24187fe76',1,'pocketfft::detail::special_mul()']]],
-  ['split_118',['Split',['../classmlx_1_1core_1_1_split.html',1,'mlx::core::Split'],['../classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385',1,'mlx::core::Split::Split()']]],
-  ['split_119',['split',['../structmlx_1_1core_1_1distributed_1_1_group.html#abbf40f8979488806bc5bca9ecc4130e9',1,'mlx::core::distributed::Group::split()'],['../group__ops.html#ga7534290bceab5fb3831a05d67bebce7d',1,'mlx::core::split(const array &amp;a, int num_splits, int axis, StreamOrDevice s={})'],['../group__ops.html#ga56882d24e5fde59c266774624c892d41',1,'mlx::core::split(const array &amp;a, int num_splits, StreamOrDevice s={})'],['../group__ops.html#ga2cfcb1a53924882e30476c9016c5de74',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, int axis, StreamOrDevice s={})'],['../group__ops.html#gac324dfa3e26d3a14a35ab7962e36f0e1',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a42847b435d037a977592e355eed072af',1,'mlx::core::random::split(const array &amp;key, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a7ec057064c7326c41b536f08178861e5',1,'mlx::core::random::split(const array &amp;key, int num, StreamOrDevice s={})']]],
-  ['split_5fk_5fpartition_5fsize_120',['split_k_partition_size',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a9f5a67b2343645b570e109c3837d4042',1,'mlx::steel::GEMMSpiltKParams']]],
-  ['split_5fk_5fpartition_5fstride_121',['split_k_partition_stride',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a12144ce89d404812cd862611d770b9fb',1,'mlx::steel::GEMMSpiltKParams']]],
-  ['split_5fk_5fpartitions_122',['split_k_partitions',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#ae06c27116905d4ff3b9b436e588a93fd',1,'mlx::steel::GEMMSpiltKParams']]],
-  ['sqrt_123',['Sqrt',['../structmlx_1_1core_1_1detail_1_1_sqrt.html',1,'mlx::core::detail::Sqrt'],['../classmlx_1_1core_1_1_sqrt.html',1,'mlx::core::Sqrt'],['../struct_sqrt.html',1,'Sqrt'],['../classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29',1,'mlx::core::Sqrt::Sqrt()']]],
-  ['sqrt_124',['sqrt',['../namespacepocketfft_1_1detail.html#a774f8b73f28259d4276bd188b540a3e3',1,'pocketfft::detail::sqrt()'],['../namespacemetal.html#ab3f4d4852ca0e591104fbd8e5b50d31b',1,'metal::sqrt()'],['../namespacemetal_1_1fast.html#a4218a85c7d8a74cb8055b4755205627e',1,'metal::fast::sqrt()'],['../namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd',1,'metal::precise::sqrt()'],['../group__ops.html#ga297f853b3d90ec8ae81263977ba2ddb1',1,'mlx::core::sqrt()']]],
-  ['square_125',['Square',['../structmlx_1_1core_1_1detail_1_1_square.html',1,'mlx::core::detail::Square'],['../classmlx_1_1core_1_1_square.html',1,'mlx::core::Square'],['../struct_square.html',1,'Square'],['../classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4',1,'mlx::core::Square::Square()']]],
-  ['square_126',['square',['../group__ops.html#ga1234e4c39cfa79f19d4bdb5b8ea4d45e',1,'mlx::core']]],
-  ['squeeze_127',['squeeze',['../group__ops.html#ga710daa7ec721bd4d3f326082cb195576',1,'mlx::core::squeeze(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga700dd51b77379a3d2260a55783e8ebf3',1,'mlx::core::squeeze(const array &amp;a, int axis, StreamOrDevice s={})'],['../group__ops.html#ga58bad3c61fd85b95927a987ba1cf5dad',1,'mlx::core::squeeze(const array &amp;a, StreamOrDevice s={})']]],
-  ['src_128',['src',['../struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b',1,'QuantizedBlockLoader::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f',1,'mlx::steel::Conv2DWeightBlockLoader::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src'],['../structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd',1,'mlx::steel::BlockLoader::src']]],
-  ['src_5fld_129',['src_ld',['../struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e',1,'QuantizedBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f',1,'mlx::steel::Conv2DWeightBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src_ld'],['../structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d',1,'mlx::steel::BlockLoader::src_ld']]],
-  ['stack_130',['stack',['../group__ops.html#gaf8f2ec2b98a4b59eca73d7471df6e032',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#ga82216209dce901296fc737fe8efa5c94',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
-  ['start_5fcapture_131',['start_capture',['../namespacemlx_1_1core_1_1metal.html#aa47cb5651bf3b65c46ab216b7e504d77',1,'mlx::core::metal']]],
-  ['start_5fconcurrent_132',['start_concurrent',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034',1,'mlx::core::metal::CommandEncoder']]],
-  ['start_5frow_133',['start_row',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a220e033b689c8d6a6f319dae02b38334',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral']]],
-  ['status_134',['Status',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078',1,'mlx::core::array']]],
-  ['status_135',['status',['../classmlx_1_1core_1_1array.html#a7102659be87e9ef62966696ab9b07dad',1,'mlx::core::array']]],
-  ['std_136',['std',['../group__ops.html#ga2a466024f8061febc0a64be557644cb0',1,'mlx::core::std(const array &amp;a, bool keepdims, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#gafdcb04d77c64405a3990078a77dd984c',1,'mlx::core::std(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga7f649970bf38b987b6ef847054f3c2f8',1,'mlx::core::std(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga62721a206df671ef5797449eea97af9f',1,'mlx::core::std(const array &amp;a, int axis, bool keepdims=false, int ddof=0, StreamOrDevice s={})']]],
-  ['steel_5fconst_137',['STEEL_CONST',['../steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b',1,'defines.h']]],
-  ['steel_5fconv_138',['steel_conv',['../namespacemlx_1_1core_1_1metal.html#a92f1e559b1121d545746f81ff86eaca1',1,'mlx::core::metal']]],
-  ['steel_5fconv_2eh_139',['steel_conv.h',['../jit_2steel__conv_8h.html',1,'(Global Namespace)'],['../kernels_2steel_2conv_2kernels_2steel__conv_8h.html',1,'(Global Namespace)']]],
-  ['steel_5fconv_5fgeneral_140',['steel_conv_general',['../namespacemlx_1_1core_1_1metal.html#a02edb6a90bdf30f4c9f0d6c25b0267b5',1,'mlx::core::metal']]],
-  ['steel_5fconv_5fgeneral_2eh_141',['steel_conv_general.h',['../steel__conv__general_8h.html',1,'']]],
-  ['steel_5fconv_5fgeneral_5fkernels_142',['steel_conv_general_kernels',['../jit_2steel__conv_8h.html#ae4ca1720029316b08ea92b7662347d47',1,'steel_conv.h']]],
-  ['steel_5fconv_5fkernels_143',['steel_conv_kernels',['../jit_2steel__conv_8h.html#a386d79077465df56659416fd84adb513',1,'steel_conv.h']]],
-  ['steel_5fgemm_2eh_144',['steel_gemm.h',['../steel__gemm_8h.html',1,'']]],
-  ['steel_5fgemm_5ffused_145',['steel_gemm_fused',['../namespacemlx_1_1core_1_1metal.html#a17764366deed71c160fb26091400a803',1,'mlx::core::metal']]],
-  ['steel_5fgemm_5ffused_2eh_146',['steel_gemm_fused.h',['../steel__gemm__fused_8h.html',1,'']]],
-  ['steel_5fgemm_5ffused_5fkernels_147',['steel_gemm_fused_kernels',['../steel__gemm_8h.html#a4c6009fd5357b730805f2fd4ba6e093e',1,'steel_gemm.h']]],
-  ['steel_5fgemm_5fmasked_148',['steel_gemm_masked',['../namespacemlx_1_1core_1_1metal.html#a962272ca73d26c08f76f706a128fd71f',1,'mlx::core::metal']]],
-  ['steel_5fgemm_5fmasked_2eh_149',['steel_gemm_masked.h',['../steel__gemm__masked_8h.html',1,'']]],
-  ['steel_5fgemm_5fmasked_5fkernels_150',['steel_gemm_masked_kernels',['../steel__gemm_8h.html#a62a358fd3ec5365081920d07aceb581c',1,'steel_gemm.h']]],
-  ['steel_5fgemm_5fsplitk_151',['steel_gemm_splitk',['../namespacemlx_1_1core_1_1metal.html#ad0dfd40ba7c09755711ceb731e57a5ac',1,'mlx::core::metal']]],
-  ['steel_5fgemm_5fsplitk_2eh_152',['steel_gemm_splitk.h',['../steel__gemm__splitk_8h.html',1,'']]],
-  ['steel_5fgemm_5fsplitk_5faccum_5faxbpy_5fkernels_153',['steel_gemm_splitk_accum_axbpy_kernels',['../steel__gemm_8h.html#a40a86e1381c241aba8511e51a981a4bf',1,'steel_gemm.h']]],
-  ['steel_5fgemm_5fsplitk_5faccum_5fkernels_154',['steel_gemm_splitk_accum_kernels',['../steel__gemm_8h.html#a144a64b8d94f0371fb144e2cc308fcf9',1,'steel_gemm.h']]],
-  ['steel_5fgemm_5fsplitk_5fkernels_155',['steel_gemm_splitk_kernels',['../steel__gemm_8h.html#a92108ab01d826e38bca83d8569b947d9',1,'steel_gemm.h']]],
-  ['steel_5fmatmul_156',['steel_matmul',['../namespacemlx_1_1core.html#ab43a7633794498e1c6775cca829eb886',1,'mlx::core']]],
-  ['steel_5fmatmul_5fregular_157',['steel_matmul_regular',['../namespacemlx_1_1core.html#a227588758ccc9ee869dba147e830bb74',1,'mlx::core']]],
-  ['steel_5fpragma_5funroll_158',['STEEL_PRAGMA_UNROLL',['../steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6',1,'defines.h']]],
-  ['step_159',['step',['../structmlx_1_1core_1_1_contiguous_iterator.html#ae230bd52b70a0bbdf560090f8a6589ef',1,'mlx::core::ContiguousIterator']]],
-  ['stop_160',['stop',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a456ad1c0c9e731833a2f8411c4ed51aa',1,'mlx::core::scheduler::StreamThread']]],
-  ['stop_5fcapture_161',['stop_capture',['../namespacemlx_1_1core_1_1metal.html#ac90714424e36fb01e04550de69b8314f',1,'mlx::core::metal']]],
-  ['stop_5fgradient_162',['stop_gradient',['../group__ops.html#ga36bc28f1deb2fe668ca9ae1e447b6b1f',1,'mlx::core']]],
-  ['stopgradient_163',['StopGradient',['../classmlx_1_1core_1_1_stop_gradient.html',1,'mlx::core::StopGradient'],['../classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f',1,'mlx::core::StopGradient::StopGradient()']]],
-  ['store_164',['store',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98',1,'mlx::steel::MMATile::store(threadgroup U *dst) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f',1,'mlx::steel::MMATile::store(device U *dst, const int ld) const']]],
-  ['store_5fresult_165',['store_result',['../structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const']]],
-  ['store_5fresult_5fsafe_166',['store_result_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const']]],
-  ['store_5fsafe_167',['store_safe',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba',1,'mlx::steel::MMATile::store_safe()']]],
-  ['str_168',['str',['../classpocketfft_1_1detail_1_1arr__info.html#abe1f7b92501b4e0e5a38fd26294ac5a4',1,'pocketfft::detail::arr_info::str'],['../struct_m_l_x_conv_params.html#a862191e8ab1bc8a47aa1396b36d46058',1,'MLXConvParams::str']]],
-  ['stream_169',['Stream',['../structmlx_1_1core_1_1_stream.html',1,'mlx::core::Stream'],['../structmlx_1_1core_1_1_stream.html#a7f0815ff4886da74cbbff5f93d82dd3e',1,'mlx::core::Stream::Stream()']]],
-  ['stream_170',['stream',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a8462e4acffcd385c6248bd7102e6bcb1',1,'mlx::core::scheduler::StreamThread::stream'],['../classmlx_1_1core_1_1_event.html#a193143bad31b68c699fa27f135b45614',1,'mlx::core::Event::stream()'],['../classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a',1,'mlx::core::Primitive::stream()']]],
-  ['stream_2eh_171',['stream.h',['../stream_8h.html',1,'']]],
-  ['streamcontext_172',['StreamContext',['../structmlx_1_1core_1_1_stream_context.html',1,'mlx::core::StreamContext'],['../structmlx_1_1core_1_1_stream_context.html#a89d803151e9d7dce29382aa83d5c6ef1',1,'mlx::core::StreamContext::StreamContext()']]],
-  ['streamordevice_173',['StreamOrDevice',['../namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58',1,'mlx::core']]],
-  ['streamthread_174',['StreamThread',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html',1,'mlx::core::scheduler::StreamThread'],['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#ac528109a11abcb82e6e221c5efa4493c',1,'mlx::core::scheduler::StreamThread::StreamThread()']]],
-  ['stride_175',['stride',['../classpocketfft_1_1detail_1_1arr__info.html#a9d10aa83a1117e75d36f7396b8c2a093',1,'pocketfft::detail::arr_info::stride() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac1f6a9bd6703eceef6003f5f6315d39b',1,'pocketfft::detail::arr_info::stride(size_t i) const']]],
-  ['stride_5fin_176',['stride_in',['../classpocketfft_1_1detail_1_1multi__iter.html#ac947f03b1cfcb63436a7e61ff020a88c',1,'pocketfft::detail::multi_iter']]],
-  ['stride_5fout_177',['stride_out',['../classpocketfft_1_1detail_1_1multi__iter.html#a81d71a13bf0b85e556fbb9834167ecc7',1,'pocketfft::detail::multi_iter']]],
-  ['stride_5ft_178',['stride_t',['../namespacepocketfft_1_1detail.html#afb987c919e9424a996d0fc8b3c23cc84',1,'pocketfft::detail']]],
-  ['strided_5fdevice_5fidx_179',['strided_device_idx',['../struct_read_writer.html#a4c0b12484aac4fd6759d67c190391989',1,'ReadWriter']]],
-  ['strided_5freduce_5fgeneral_5fdispatch_180',['strided_reduce_general_dispatch',['../namespacemlx_1_1core.html#aa0332c64ee9965f05026c30a0b778000',1,'mlx::core']]],
-  ['strided_5fscan_181',['strided_scan',['../scan_8h.html#a7abb6ffb6c3b96b88c2a63cd4cc2f7ae',1,'scan.h']]],
-  ['strided_5fshared_5fidx_182',['strided_shared_idx',['../struct_read_writer.html#ace40adb02cfb33d89c98353327c251fc',1,'ReadWriter']]],
-  ['strides_183',['strides',['../structmlx_1_1core_1_1_reduction_plan.html#a9bf7cae845ab633247c1811613ece8bd',1,'mlx::core::ReductionPlan::strides'],['../struct_indices.html#a7f73d7652f0f751e6a06c2663e329a4a',1,'Indices::strides'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2',1,'mlx::core::fast::CustomKernelShapeInfo::strides'],['../classmlx_1_1core_1_1array.html#a186cf2648da92584d5c1c8b24e69629b',1,'mlx::core::array::strides() const'],['../classmlx_1_1core_1_1array.html#a919f850ca087d1c40aa68f854cb30be2',1,'mlx::core::array::strides(int dim) const']]],
-  ['submit_184',['submit',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a8698d49e8f406cdb88006aac6a91f9a4',1,'pocketfft::detail::threading::thread_pool']]],
-  ['subtract_185',['Subtract',['../structmlx_1_1core_1_1detail_1_1_subtract.html',1,'mlx::core::detail::Subtract'],['../classmlx_1_1core_1_1_subtract.html',1,'mlx::core::Subtract'],['../struct_subtract.html',1,'Subtract'],['../classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c',1,'mlx::core::Subtract::Subtract()']]],
-  ['subtract_186',['subtract',['../group__ops.html#ga196c240d3d0fcbb4713802c485e15133',1,'mlx::core']]],
-  ['sum_187',['Sum',['../struct_sum.html',1,'Sum&lt; U &gt;'],['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924a1fc7c1f09c80650ab0497e2d6781d65f',1,'mlx::core::distributed::AllReduce::Sum'],['../classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a8582875544f1d3d396a1a376473ef1dd',1,'mlx::core::Reduce::Sum'],['../classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ade23893033e4849f5596e7ce76a5fc36',1,'mlx::core::Scan::Sum'],['../classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca14abe2d8818efa71726be4e156813d6f',1,'mlx::core::Scatter::Sum']]],
-  ['sum_188',['sum',['../namespacemlx_1_1steel.html#ab4a6ddea4beb7c447cf5b69b9d46cc3b',1,'mlx::steel::sum(T x)'],['../namespacemlx_1_1steel.html#acd6e194d37b617d7a5818bc384a97fe4',1,'mlx::steel::sum(T x, Us... us)'],['../group__ops.html#gade905ee92eb6ab7edfc312aeddfbaeb6',1,'mlx::core::sum(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga3627754d7868487bdab1bd83f05d9c81',1,'mlx::core::sum(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaccd0a6be2c5b5128fdc2d87b5c8e67f4',1,'mlx::core::sum(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gafcd39b0bf39a56c26a967981c7ab8a8d',1,'mlx::core::sum(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['svd_189',['SVD',['../classmlx_1_1core_1_1_s_v_d.html',1,'mlx::core::SVD'],['../classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1',1,'mlx::core::SVD::SVD()']]],
-  ['svd_190',['svd',['../namespacemlx_1_1core_1_1linalg.html#a64364b880e99914cf47bf756fa8dbaf0',1,'mlx::core::linalg']]],
-  ['swapaxes_191',['swapaxes',['../group__ops.html#gabc46eed81ab6c6247903e4ec0c4ec1fb',1,'mlx::core']]],
-  ['swizzle_192',['swizzle',['../structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760',1,'mlx::steel::BlockSwizzle']]],
-  ['swizzle_5flog_193',['swizzle_log',['../struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2',1,'MLXFastAttentionParams::swizzle_log'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840',1,'mlx::steel::ImplicitGemmConv2DParams::swizzle_log'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7',1,'mlx::steel::GEMMParams::swizzle_log']]],
-  ['synchronize_194',['synchronize',['../namespacemlx_1_1core.html#a14287949d82ffefad0306cef5eb5f9e4',1,'mlx::core::synchronize()'],['../namespacemlx_1_1core.html#a6648a71937b055e5ff513d98056c2fb5',1,'mlx::core::synchronize(Stream)']]]
+  ['scaleddotproductattention_8',['ScaledDotProductAttention',['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html',1,'mlx::core::fast::ScaledDotProductAttention'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a6a904c377fffc3c193102a3123f5e706',1,'mlx::core::fast::ScaledDotProductAttention::ScaledDotProductAttention()']]],
+  ['scaleop_9',['ScaleOp',['../struct_scale_op.html',1,'']]],
+  ['scales_10',['scales',['../struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf',1,'QuantizedBlockLoader']]],
+  ['scan_11',['Scan',['../classmlx_1_1core_1_1_scan.html',1,'mlx::core::Scan'],['../classmlx_1_1core_1_1_scan.html#ac93e8f9c6771de825d2186ef34fa7087',1,'mlx::core::Scan::Scan()']]],
+  ['scan_12',['scan',['../namespacemlx_1_1core_1_1metal.html#a81c2cf124b0803098a54a78f8f6873a6',1,'mlx::core::metal']]],
+  ['scan_2eh_13',['scan.h',['../scan_8h.html',1,'']]],
+  ['scatter_14',['Scatter',['../classmlx_1_1core_1_1_scatter.html',1,'mlx::core::Scatter'],['../classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3',1,'mlx::core::Scatter::Scatter()']]],
+  ['scatter_15',['scatter',['../namespacemlx_1_1core_1_1metal.html#a32e902c6cd6d35fcc3119ed6685a170f',1,'mlx::core::metal::scatter()'],['../group__ops.html#gad438be8f90bae9d37c6853b8f4225d61',1,'mlx::core::scatter(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gac2c2b379a3ce959dbe1c4a68f112edfe',1,'mlx::core::scatter(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
+  ['scatter_2eh_16',['scatter.h',['../scatter_8h.html',1,'']]],
+  ['scatter_5fadd_17',['scatter_add',['../group__ops.html#gacd14c2b5cfebf343fc2d672722f8d174',1,'mlx::core::scatter_add(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gac13318518e5703f1273c5366eb523a5a',1,'mlx::core::scatter_add(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
+  ['scatter_5fimpl_18',['scatter_impl',['../scatter_8h.html#a0df7206d4519defb48a6275afc12f87c',1,'scatter.h']]],
+  ['scatter_5fkernels_19',['scatter_kernels',['../jit_2indexing_8h.html#a768c949cd650a44c6b402fc1440c1a56',1,'indexing.h']]],
+  ['scatter_5fmax_20',['scatter_max',['../group__ops.html#ga05881a4157cd113c9392d168a79e6673',1,'mlx::core::scatter_max(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga9adda5f9202bb3486e4d9e1114e3a56f',1,'mlx::core::scatter_max(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
+  ['scatter_5fmin_21',['scatter_min',['../group__ops.html#ga0ca16b7579dfc899f3f7fd40245ba7c5',1,'mlx::core::scatter_min(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga51fa762a997c243ca7a19e1ed3e83199',1,'mlx::core::scatter_min(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
+  ['scatter_5fprod_22',['scatter_prod',['../group__ops.html#ga3708b5bcb61e2c63d213c4ce6ad0ffc0',1,'mlx::core::scatter_prod(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gaf83c53c453faa9083ba27e4b97539339',1,'mlx::core::scatter_prod(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
+  ['scheduled_23',['scheduled',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078af8a6f8eed2395ab89a758dec434393ae',1,'mlx::core::array']]],
+  ['scheduler_24',['Scheduler',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html',1,'mlx::core::scheduler::Scheduler'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a3ae42aed78a2200e9d02776fcd2316ba',1,'mlx::core::scheduler::Scheduler::Scheduler()'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a61a74e3628899e66dde600e24a750648',1,'mlx::core::scheduler::Scheduler::Scheduler(const Scheduler &amp;)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ac3f77b7c93220dadd0b3bb2e903b7059',1,'mlx::core::scheduler::Scheduler::Scheduler(Scheduler &amp;&amp;)=delete']]],
+  ['scheduler_25',['scheduler',['../namespacemlx_1_1core_1_1scheduler.html#ae856e468c2f7c8f8ec672522cc13730b',1,'mlx::core::scheduler']]],
+  ['scheduler_2eh_26',['scheduler.h',['../scheduler_8h.html',1,'']]],
+  ['sdpa_5fvector_27',['sdpa_vector',['../sdpa__vector_8h.html#a4bf36f16e16c1c62d9b243573568e5ae',1,'sdpa_vector.h']]],
+  ['sdpa_5fvector_2eh_28',['sdpa_vector.h',['../sdpa__vector_8h.html',1,'']]],
+  ['sdpa_5fvector_5f2pass_5f1_29',['sdpa_vector_2pass_1',['../sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74',1,'sdpa_vector.h']]],
+  ['sdpa_5fvector_5f2pass_5f2_30',['sdpa_vector_2pass_2',['../sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe',1,'sdpa_vector.h']]],
+  ['seed_31',['seed',['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a9f19c5da2031cba50d0ff996924347d8',1,'mlx::core::random::KeySequence::seed()'],['../namespacemlx_1_1core_1_1random.html#ac4ad325b613257306df74595d3d0e23b',1,'mlx::core::random::seed()']]],
+  ['seek_32',['seek',['../structmlx_1_1core_1_1_contiguous_iterator.html#a24719ee9e8667885d29c2ad74445520c',1,'mlx::core::ContiguousIterator::seek()'],['../classmlx_1_1core_1_1io_1_1_reader.html#acea55078bd39ccaa27a9a36f17a39cd1',1,'mlx::core::io::Reader::seek()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a9c1716dda53aa36faea9c8fb1a3e34d4',1,'mlx::core::io::Writer::seek()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a673c16b669f3cee13f387b7b0a1f39f7',1,'mlx::core::io::ParallelFileReader::seek()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9646f4ea048ae58719daeb588e2de433',1,'mlx::core::io::FileWriter::seek()']]],
+  ['select_33',['Select',['../structmlx_1_1core_1_1detail_1_1_select.html',1,'mlx::core::detail::Select'],['../classmlx_1_1core_1_1_select.html',1,'mlx::core::Select'],['../struct_select.html',1,'Select'],['../classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9',1,'mlx::core::Select::Select()']]],
+  ['send_34',['Send',['../classmlx_1_1core_1_1distributed_1_1_send.html',1,'mlx::core::distributed::Send'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a2481dd876b14d4a13ac466cbca9c4eac',1,'mlx::core::distributed::Send::Send()']]],
+  ['send_35',['send',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#abf33511660ac71df5fc92f2aad6c6e08',1,'mlx::core::distributed::detail::send()'],['../namespacemlx_1_1core_1_1distributed.html#a5a8360edaa3a528a3927fce4d2cf1777',1,'mlx::core::distributed::send()']]],
+  ['set_36',['Set',['../structpocketfft_1_1detail_1_1cmplx.html#a647fece372b64b13c4a7e5877d09a807',1,'pocketfft::detail::cmplx::Set(T r_, T i_)'],['../structpocketfft_1_1detail_1_1cmplx.html#a447d26b2e07f6e45f29d865e906c0a98',1,'pocketfft::detail::cmplx::Set(T r_)']]],
+  ['set_5fbytes_37',['set_bytes',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5',1,'mlx::core::metal::CommandEncoder::set_bytes(const T *v, int n, int idx)'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#abc52d18ea87d213c47fd26062c829849',1,'mlx::core::metal::CommandEncoder::set_bytes(const T &amp;v, int idx)']]],
+  ['set_5fcache_5flimit_38',['set_cache_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#af392bced29d9e4e3f1a7cc4725d83764',1,'mlx::core::metal::MetalAllocator::set_cache_limit()'],['../namespacemlx_1_1core_1_1metal.html#ab09c9b60f1e886ab859e6a066c9a5b9d',1,'mlx::core::metal::set_cache_limit()']]],
+  ['set_5fcompile_5fmode_39',['set_compile_mode',['../namespacemlx_1_1core.html#a49445a55f976c4397f25ea18e1e92bef',1,'mlx::core']]],
+  ['set_5fcompute_5fpipeline_5fstate_40',['set_compute_pipeline_state',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef',1,'mlx::core::metal::CommandEncoder']]],
+  ['set_5fdata_41',['set_data',['../classmlx_1_1core_1_1array.html#a631acd8e318189640b8338f9ae1a554d',1,'mlx::core::array::set_data(allocator::Buffer buffer, deleter_t d=allocator::free)'],['../classmlx_1_1core_1_1array.html#a2112af5fba37b3135cd2e6ac9e851606',1,'mlx::core::array::set_data(allocator::Buffer buffer, size_t data_size, std::vector&lt; size_t &gt; strides, Flags flags, deleter_t d=allocator::free)']]],
+  ['set_5fdefault_5fdevice_42',['set_default_device',['../namespacemlx_1_1core.html#a312a2de41367fe52caeaf8c0f596a120',1,'mlx::core']]],
+  ['set_5fdefault_5fstream_43',['set_default_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a6d15314ac9cf25efc9bd1278de9a66bb',1,'mlx::core::scheduler::Scheduler::set_default_stream()'],['../namespacemlx_1_1core.html#af35a2b06517d8bb7dbb469692b4f841c',1,'mlx::core::set_default_stream()']]],
+  ['set_5finput_5farray_44',['set_input_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4',1,'mlx::core::metal::CommandEncoder']]],
+  ['set_5fmemory_5flimit_45',['set_memory_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a179e3127ef9377ce54295f771c34ba1b',1,'mlx::core::metal::MetalAllocator::set_memory_limit()'],['../namespacemlx_1_1core_1_1metal.html#a3fb2c4a237fa4bfdff798156146c4937',1,'mlx::core::metal::set_memory_limit()']]],
+  ['set_5foutput_5farray_46',['set_output_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522',1,'mlx::core::metal::CommandEncoder']]],
+  ['set_5fresidency_5fset_47',['set_residency_set',['../classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f',1,'mlx::core::metal::Device']]],
+  ['set_5fsiblings_48',['set_siblings',['../classmlx_1_1core_1_1array.html#a8fccbe7a4edfd8cca168161124e263b1',1,'mlx::core::array']]],
+  ['set_5fstatus_49',['set_status',['../classmlx_1_1core_1_1array.html#a63598018999b49f1340b183cb303f05c',1,'mlx::core::array']]],
+  ['set_5ftracer_50',['set_tracer',['../classmlx_1_1core_1_1array.html#af26e6be1a9e6239471a4c24310c0c7c8',1,'mlx::core::array']]],
+  ['set_5fvalue_51',['set_value',['../classmlx_1_1core_1_1_event.html#a0d077b11f4b28f882b42440b7ac6d40d',1,'mlx::core::Event']]],
+  ['set_5fvector_5fbytes_52',['set_vector_bytes',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b',1,'mlx::core::metal::CommandEncoder::set_vector_bytes(const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a7375adf9ee5355bcf4b7f5f210efd115',1,'mlx::core::metal::CommandEncoder::set_vector_bytes(const std::vector&lt; T &gt; &amp;vec, int idx)']]],
+  ['set_5fwired_5flimit_53',['set_wired_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a84fa0347da18055bc13ba0a5c4b57253',1,'mlx::core::metal::MetalAllocator::set_wired_limit()'],['../namespacemlx_1_1core_1_1metal.html#a31eab4828d31d292bc84e07b0d961e1e',1,'mlx::core::metal::set_wired_limit()']]],
+  ['shape_54',['shape',['../structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae',1,'mlx::core::ReductionPlan::shape'],['../structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd',1,'mlx::steel::Layout2D::shape'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a',1,'mlx::core::fast::CustomKernelShapeInfo::shape'],['../classpocketfft_1_1detail_1_1arr__info.html#accada8146cb8d3ab7facb4c1e3413ec0',1,'pocketfft::detail::arr_info::shape() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac601c660c64a4c252aa8be4ae7dfa7a8',1,'pocketfft::detail::arr_info::shape(size_t i) const'],['../classmlx_1_1core_1_1array.html#a4a2a2c8a4a5beafd723fc13f2055d55d',1,'mlx::core::array::shape() const'],['../classmlx_1_1core_1_1array.html#a51ed0c45666264dc172d06fba159eb8f',1,'mlx::core::array::shape(int dim) const']]],
+  ['shape2d_55',['Shape2D',['../structmlx_1_1steel_1_1_shape2_d.html',1,'mlx::steel::Shape2D&lt; RInt, CInt &gt;'],['../structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c',1,'mlx::steel::Shape2D::Shape2D()']]],
+  ['shape_5ft_56',['shape_t',['../namespacepocketfft_1_1detail.html#a885ee37fcf564a268a5c8ca9ea8603e1',1,'pocketfft::detail']]],
+  ['shapes_57',['shapes',['../struct_indices.html#a5ab170f1a77636180889ddfffd4f7d2f',1,'Indices']]],
+  ['shapes_5fwithout_5freduction_5faxes_58',['shapes_without_reduction_axes',['../namespacemlx_1_1core.html#a44c3ea6db6553c3f6552b9ba64a69494',1,'mlx::core']]],
+  ['shared_5fbuffer_5fslice_59',['shared_buffer_slice',['../namespacemlx_1_1core.html#aea2a6a4eddfd4cfac89d20786059de2a',1,'mlx::core']]],
+  ['shp_60',['shp',['../classpocketfft_1_1detail_1_1arr__info.html#a2467e9e01de1ba4d7cd28c1af783da8d',1,'pocketfft::detail::arr_info']]],
+  ['shutdown_61',['shutdown',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a0c7c29290fde806031c497f24c4ad411',1,'pocketfft::detail::threading::thread_pool']]],
+  ['siblings_62',['siblings',['../classmlx_1_1core_1_1array.html#acf80fde8f743f65ad5b4be69fcb7a74d',1,'mlx::core::array::siblings() const'],['../classmlx_1_1core_1_1array.html#a7263f23e70a580a9bc2129fbcde36e6c',1,'mlx::core::array::siblings()']]],
+  ['sigmoid_63',['Sigmoid',['../structmlx_1_1core_1_1detail_1_1_sigmoid.html',1,'mlx::core::detail::Sigmoid'],['../classmlx_1_1core_1_1_sigmoid.html',1,'mlx::core::Sigmoid'],['../struct_sigmoid.html',1,'Sigmoid'],['../classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b',1,'mlx::core::Sigmoid::Sigmoid()']]],
+  ['sigmoid_64',['sigmoid',['../group__ops.html#ga708abf8f79609cd6831db7c38cafac0e',1,'mlx::core']]],
+  ['sign_65',['Sign',['../structmlx_1_1core_1_1detail_1_1_sign.html',1,'mlx::core::detail::Sign'],['../classmlx_1_1core_1_1_sign.html',1,'mlx::core::Sign'],['../struct_sign.html',1,'Sign'],['../classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763',1,'mlx::core::Sign::Sign()']]],
+  ['sign_66',['sign',['../group__ops.html#ga20f1a1a8c0cd6206485f9363f3915faa',1,'mlx::core']]],
+  ['signal_67',['signal',['../classmlx_1_1core_1_1_event.html#a65a858445506a61be5889ae0e3651b89',1,'mlx::core::Event']]],
+  ['signaling_5fnan_68',['signaling_NaN',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['signedinteger_69',['signedinteger',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2daed58b4631ff157bec9e35ed1182d2c10',1,'mlx::core::Dtype::signedinteger'],['../namespacemlx_1_1core.html#a24e1618af591d737d73729665e868001',1,'mlx::core::signedinteger']]],
+  ['simd_5fbroadcast_70',['simd_broadcast',['../namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0',1,'metal']]],
+  ['simd_5fexclusive_5fscan_71',['simd_exclusive_scan',['../struct_cum_prod_3_01bool_01_4.html#a1a86e9398bae24182b7be0a6577bf223',1,'CumProd&lt; bool &gt;::simd_exclusive_scan()'],['../struct_cum_max.html#ae11b67aa6c998e9a01615b2a79af4403',1,'CumMax::simd_exclusive_scan()'],['../struct_cum_min.html#a83e65017ff33018b585c043fb803773b',1,'CumMin::simd_exclusive_scan()']]],
+  ['simd_5fmax_72',['simd_max',['../namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49',1,'metal']]],
+  ['simd_5fmin_73',['simd_min',['../namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b',1,'metal']]],
+  ['simd_5fprefix_5fexclusive_5fproduct_74',['simd_prefix_exclusive_product',['../namespacemetal.html#a5ca40242390b632f737e29636829b2e4',1,'metal']]],
+  ['simd_5fprefix_5fexclusive_5fsum_75',['simd_prefix_exclusive_sum',['../namespacemetal.html#abfbb70c7471f28bf7ff36a612ad014b2',1,'metal']]],
+  ['simd_5fprefix_5finclusive_5fproduct_76',['simd_prefix_inclusive_product',['../namespacemetal.html#a6ca6a7e1996228fa536e969e9e45c446',1,'metal']]],
+  ['simd_5fprefix_5finclusive_5fsum_77',['simd_prefix_inclusive_sum',['../namespacemetal.html#a567acb18199ac0107712eb8cb8aeb8e9',1,'metal']]],
+  ['simd_5fproduct_78',['simd_product',['../namespacemetal.html#ac6e883a04e2265a9790d7db76059e1b4',1,'metal']]],
+  ['simd_5fscan_79',['simd_scan',['../struct_cum_prod_3_01bool_01_4.html#abeb5ec4237b330e7219f4e881cf10d7a',1,'CumProd&lt; bool &gt;::simd_scan()'],['../struct_cum_max.html#adc9ec8bb09b4433d4c2f03022c43d781',1,'CumMax::simd_scan()'],['../struct_cum_min.html#a0a1005d91b1c90e90e2c6dbd6c296649',1,'CumMin::simd_scan()']]],
+  ['simd_5fshuffle_80',['simd_shuffle',['../namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4',1,'metal::simd_shuffle()'],['../backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2',1,'simd_shuffle(uint64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a3bdbdfeb7a1dde40cd3ce1df8d9213b5',1,'simd_shuffle(int64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab4cbcdb054f9165130da91a3334da0cf',1,'simd_shuffle(bool data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab8175b66bcc080fb89f738143568c30b',1,'simd_shuffle(complex64_t data, uint16_t lane):&#160;utils.h']]],
+  ['simd_5fshuffle_5fand_5ffill_5fdown_81',['simd_shuffle_and_fill_down',['../namespacemetal.html#ae29a06f0eac636ad7af21dea5b04938b',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a0ee6239fa29a5f9ee0201e0dc5ddc8e0',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta)']]],
+  ['simd_5fshuffle_5fand_5ffill_5fup_82',['simd_shuffle_and_fill_up',['../namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a5138d5cdc18139e135707916a243cd8e',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta)'],['../backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4',1,'simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a7bb56415c5412a6a26f70a990915f064',1,'simd_shuffle_and_fill_up(int64_t data, int64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad55bd473647f2c6c68e65e5312c132d1',1,'simd_shuffle_and_fill_up(bool data, bool filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a94e02a6ae8c39cbf4cb23aa44df9dbd5',1,'simd_shuffle_and_fill_up(complex64_t data, complex64_t filling, uint16_t delta):&#160;utils.h']]],
+  ['simd_5fshuffle_5fdown_83',['simd_shuffle_down',['../namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c',1,'metal::simd_shuffle_down()'],['../backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c',1,'simd_shuffle_down(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a0c1e4d782fcc56e1ab5565cef12430dd',1,'simd_shuffle_down(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a48ae83a8caf5c74810df60b6c6cdb062',1,'simd_shuffle_down(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad9a671a5f9aaa729ae7a77026f16bcb0',1,'simd_shuffle_down(complex64_t data, uint16_t delta):&#160;utils.h']]],
+  ['simd_5fshuffle_5frotate_5fdown_84',['simd_shuffle_rotate_down',['../namespacemetal.html#a4bb203647a421032db47e73cd649841b',1,'metal']]],
+  ['simd_5fshuffle_5frotate_5fup_85',['simd_shuffle_rotate_up',['../namespacemetal.html#a729b22077d6c944491a6027c18ea80c9',1,'metal']]],
+  ['simd_5fshuffle_5fup_86',['simd_shuffle_up',['../namespacemetal.html#afe81c5fbde3f4890458b081909242c55',1,'metal::simd_shuffle_up()'],['../backend_2metal_2kernels_2utils_8h.html#a39e436e0a942912266aae7e0bd82d7c0',1,'simd_shuffle_up(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a617f3857caf33c569afa6148135f8b7a',1,'simd_shuffle_up(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ae0f5c42020275a588234e69f1eb7a485',1,'simd_shuffle_up(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a92b455bac6a23af51c35ea83de2383eb',1,'simd_shuffle_up(complex64_t data, uint16_t delta):&#160;utils.h']]],
+  ['simd_5fshuffle_5fxor_87',['simd_shuffle_xor',['../namespacemetal.html#a5017efc9605e069cfb507137cd1a1852',1,'metal']]],
+  ['simd_5fsize_88',['SIMD_SIZE',['../quantized_8h.html#a62969a218d93680f5e35d0c61b160b99',1,'quantized.h']]],
+  ['simd_5fsize_89',['simd_size',['../backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3',1,'ops.h']]],
+  ['simd_5fsum_90',['simd_sum',['../namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5',1,'metal']]],
+  ['simd_5fxor_91',['simd_xor',['../namespacemetal.html#a1308decbf2d5c33d34d6be523ea1c30f',1,'metal']]],
+  ['simple_5fiter_92',['simple_iter',['../classpocketfft_1_1detail_1_1simple__iter.html',1,'pocketfft::detail::simple_iter'],['../classpocketfft_1_1detail_1_1simple__iter.html#a1e455c615825bebd5f1f62665027b398',1,'pocketfft::detail::simple_iter::simple_iter()']]],
+  ['simplevalueandgradfn_93',['SimpleValueAndGradFn',['../namespacemlx_1_1core.html#a2689b8f1181648cb1685204fea9f3066',1,'mlx::core']]],
+  ['sin_94',['Sin',['../structmlx_1_1core_1_1detail_1_1_sin.html',1,'mlx::core::detail::Sin'],['../classmlx_1_1core_1_1_sin.html',1,'mlx::core::Sin'],['../struct_sin.html',1,'Sin'],['../classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea',1,'mlx::core::Sin::Sin()']]],
+  ['sin_95',['sin',['../namespacepocketfft_1_1detail.html#a07745f4a069f811859308281b2982258',1,'pocketfft::detail::sin()'],['../namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c',1,'metal::sin()'],['../namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270',1,'metal::fast::sin()'],['../namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c',1,'metal::precise::sin()'],['../group__ops.html#gaebf0a73ad3732fba39df37826c235692',1,'mlx::core::sin()']]],
+  ['sincos_5f2pibyn_96',['sincos_2pibyn',['../classpocketfft_1_1detail_1_1sincos__2pibyn.html',1,'pocketfft::detail::sincos_2pibyn&lt; T &gt;'],['../classpocketfft_1_1detail_1_1sincos__2pibyn.html#a88518f2182d854c557edacd4ab8cbc40',1,'pocketfft::detail::sincos_2pibyn::sincos_2pibyn()']]],
+  ['sinh_97',['Sinh',['../structmlx_1_1core_1_1detail_1_1_sinh.html',1,'mlx::core::detail::Sinh'],['../classmlx_1_1core_1_1_sinh.html',1,'mlx::core::Sinh'],['../struct_sinh.html',1,'Sinh'],['../classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96',1,'mlx::core::Sinh::Sinh()']]],
+  ['sinh_98',['sinh',['../namespacemetal.html#a83ba4235ae350ab8880a9df09158620b',1,'metal::sinh()'],['../namespacemetal_1_1fast.html#a990d90b3440e38d1fb4ff5065c6c189b',1,'metal::fast::sinh()'],['../namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c',1,'metal::precise::sinh()'],['../group__ops.html#gaf532375c6563dbd6e329bdedf0224dd7',1,'mlx::core::sinh()']]],
+  ['sinpi_99',['sinpi',['../namespacemetal.html#ae9655f7fa2ba6c0625ca25fbb278e269',1,'metal::sinpi()'],['../namespacemetal_1_1fast.html#ab07a32fe544aa304577d29e0251e87b2',1,'metal::fast::sinpi()'],['../namespacemetal_1_1precise.html#a78b17dab93519d9c82c2575dafec49c9',1,'metal::precise::sinpi()']]],
+  ['size_100',['size',['../classpocketfft_1_1detail_1_1arr.html#a95bca00060957f540ff25b69632c6952',1,'pocketfft::detail::arr::size()'],['../classpocketfft_1_1detail_1_1arr__info.html#a003a7106f7fa59a3c55ac1f0116313a5',1,'pocketfft::detail::arr_info::size()'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a2adf9a9c968f113dde830cc0dc27dcc6',1,'mlx::core::allocator::Allocator::size()'],['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#aafa92e8310db089b1ac72b840777e26b',1,'mlx::core::allocator::CommonAllocator::size()'],['../classmlx_1_1core_1_1array.html#a598f87161926d9e0b516860f0ea2c8f6',1,'mlx::core::array::size()'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a51f6587e8065be16f0418ca42a796e05',1,'mlx::core::metal::MetalAllocator::size()'],['../structmlx_1_1core_1_1distributed_1_1_group.html#abd96a09217e3d1bcc522888257d22cef',1,'mlx::core::distributed::Group::size()'],['../structmlx_1_1core_1_1_dtype.html#ab54051563d85212c7f0f049166bc9971',1,'mlx::core::Dtype::size()']]],
+  ['size_5fof_101',['size_of',['../namespacemlx_1_1core.html#add4794cc0ffe5d717fc146084a235d95',1,'mlx::core']]],
+  ['slice_102',['Slice',['../classmlx_1_1core_1_1_slice.html',1,'mlx::core::Slice'],['../classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f',1,'mlx::core::Slice::Slice()']]],
+  ['slice_103',['slice',['../group__ops.html#gad66135407dbb41b3c5d2cdfd51226c21',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#gaa97ce866c5e38b92b093e9321affcc57',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
+  ['slice_5fgpu_104',['slice_gpu',['../namespacemlx_1_1core.html#a59048c5ff114c101a496bf33f62e3de9',1,'mlx::core']]],
+  ['slice_5fupdate_105',['slice_update',['../group__ops.html#ga3875660e4ce2c8add8bfcf8144078708',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#ga03ffbbb4d989a463ef43f41ebf7eabef',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
+  ['sliceupdate_106',['SliceUpdate',['../classmlx_1_1core_1_1_slice_update.html',1,'mlx::core::SliceUpdate'],['../classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990',1,'mlx::core::SliceUpdate::SliceUpdate()']]],
+  ['slicing_2eh_107',['slicing.h',['../common_2slicing_8h.html',1,'(Global Namespace)'],['../metal_2slicing_8h.html',1,'(Global Namespace)']]],
+  ['sm_108',['sm',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3',1,'mlx::steel::BlockMMA']]],
+  ['sn_109',['sn',['../structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a',1,'mlx::steel::BlockMMA']]],
+  ['softmax_110',['Softmax',['../classmlx_1_1core_1_1_softmax.html',1,'mlx::core::Softmax'],['../classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb',1,'mlx::core::Softmax::Softmax()']]],
+  ['softmax_111',['softmax',['../namespacemlx_1_1core_1_1metal.html#a4fe937c2c584fd646926057f31d54ca6',1,'mlx::core::metal::softmax()'],['../group__ops.html#ga7e9bb08b43c8fd0444b7d3c9e09dc1c6',1,'mlx::core::softmax(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga1ae3614d07d873892a530d14c3857d0b',1,'mlx::core::softmax(const array &amp;a, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga06f570d73716a24303e6de3aaba4457b',1,'mlx::core::softmax(const array &amp;a, int axis, bool precise=false, StreamOrDevice s={})']]],
+  ['softmax_2eh_112',['softmax.h',['../jit_2softmax_8h.html',1,'(Global Namespace)'],['../kernels_2softmax_8h.html',1,'(Global Namespace)']]],
+  ['softmax_5fexp_113',['softmax_exp',['../kernels_2softmax_8h.html#a440d4031ee5e86159a4dd715e44a438b',1,'softmax.h']]],
+  ['softmax_5fkernels_114',['softmax_kernels',['../jit_2softmax_8h.html#a1cbfb210a9a765c6620e9f1247ccef12',1,'softmax.h']]],
+  ['softmax_5flooped_115',['softmax_looped',['../kernels_2softmax_8h.html#a8c47b0924ebfeebcca25f3dd17373276',1,'softmax.h']]],
+  ['softmax_5fn_5freads_116',['SOFTMAX_N_READS',['../defines_8h.html#a722995df24286b27b7da3d74b73f768d',1,'defines.h']]],
+  ['softmax_5fsingle_5frow_117',['softmax_single_row',['../kernels_2softmax_8h.html#a815fe70f879f318e5d6e99acf043f52b',1,'softmax.h']]],
+  ['sort_118',['Sort',['../classmlx_1_1core_1_1_sort.html',1,'mlx::core::Sort'],['../classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44',1,'mlx::core::Sort::Sort()']]],
+  ['sort_119',['sort',['../struct_thread_sort.html#ad9ab3e6b47f7e9b91c0f3b773596986d',1,'ThreadSort::sort()'],['../struct_block_merge_sort.html#acc970f5eb963f7f2010f5ae5ea8b8bc0',1,'BlockMergeSort::sort()'],['../namespacemlx_1_1core_1_1metal.html#ab77c9a9ecaeeab8c66b712862777c24b',1,'mlx::core::metal::sort()'],['../group__ops.html#ga7fb616054665b3c2d61fa234f501f079',1,'mlx::core::sort(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaae1bc47aa737f705d0e5884270063fea',1,'mlx::core::sort(const array &amp;a, int axis, StreamOrDevice s={})']]],
+  ['sort_2eh_120',['sort.h',['../sort_8h.html',1,'']]],
+  ['special_5fmul_121',['special_mul',['../structpocketfft_1_1detail_1_1cmplx.html#a2e79f5c73c1d926361ad126cf57c8874',1,'pocketfft::detail::cmplx::special_mul()'],['../namespacepocketfft_1_1detail.html#a8da1f3d4a0b712a0285529f24187fe76',1,'pocketfft::detail::special_mul()']]],
+  ['split_122',['Split',['../classmlx_1_1core_1_1_split.html',1,'mlx::core::Split'],['../classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385',1,'mlx::core::Split::Split()']]],
+  ['split_123',['split',['../structmlx_1_1core_1_1distributed_1_1_group.html#abbf40f8979488806bc5bca9ecc4130e9',1,'mlx::core::distributed::Group::split()'],['../group__ops.html#ga7534290bceab5fb3831a05d67bebce7d',1,'mlx::core::split(const array &amp;a, int num_splits, int axis, StreamOrDevice s={})'],['../group__ops.html#ga56882d24e5fde59c266774624c892d41',1,'mlx::core::split(const array &amp;a, int num_splits, StreamOrDevice s={})'],['../group__ops.html#ga2cfcb1a53924882e30476c9016c5de74',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, int axis, StreamOrDevice s={})'],['../group__ops.html#gac324dfa3e26d3a14a35ab7962e36f0e1',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a42847b435d037a977592e355eed072af',1,'mlx::core::random::split(const array &amp;key, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a7ec057064c7326c41b536f08178861e5',1,'mlx::core::random::split(const array &amp;key, int num, StreamOrDevice s={})']]],
+  ['split_5fk_5fpartition_5fsize_124',['split_k_partition_size',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a9f5a67b2343645b570e109c3837d4042',1,'mlx::steel::GEMMSpiltKParams']]],
+  ['split_5fk_5fpartition_5fstride_125',['split_k_partition_stride',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a12144ce89d404812cd862611d770b9fb',1,'mlx::steel::GEMMSpiltKParams']]],
+  ['split_5fk_5fpartitions_126',['split_k_partitions',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#ae06c27116905d4ff3b9b436e588a93fd',1,'mlx::steel::GEMMSpiltKParams']]],
+  ['sqrt_127',['Sqrt',['../structmlx_1_1core_1_1detail_1_1_sqrt.html',1,'mlx::core::detail::Sqrt'],['../classmlx_1_1core_1_1_sqrt.html',1,'mlx::core::Sqrt'],['../struct_sqrt.html',1,'Sqrt'],['../classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29',1,'mlx::core::Sqrt::Sqrt()']]],
+  ['sqrt_128',['sqrt',['../namespacepocketfft_1_1detail.html#a774f8b73f28259d4276bd188b540a3e3',1,'pocketfft::detail::sqrt()'],['../namespacemetal.html#ab3f4d4852ca0e591104fbd8e5b50d31b',1,'metal::sqrt()'],['../namespacemetal_1_1fast.html#a4218a85c7d8a74cb8055b4755205627e',1,'metal::fast::sqrt()'],['../namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd',1,'metal::precise::sqrt()'],['../group__ops.html#ga297f853b3d90ec8ae81263977ba2ddb1',1,'mlx::core::sqrt()']]],
+  ['square_129',['Square',['../structmlx_1_1core_1_1detail_1_1_square.html',1,'mlx::core::detail::Square'],['../classmlx_1_1core_1_1_square.html',1,'mlx::core::Square'],['../struct_square.html',1,'Square'],['../classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4',1,'mlx::core::Square::Square()']]],
+  ['square_130',['square',['../group__ops.html#ga1234e4c39cfa79f19d4bdb5b8ea4d45e',1,'mlx::core']]],
+  ['squeeze_131',['squeeze',['../group__ops.html#ga710daa7ec721bd4d3f326082cb195576',1,'mlx::core::squeeze(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga700dd51b77379a3d2260a55783e8ebf3',1,'mlx::core::squeeze(const array &amp;a, int axis, StreamOrDevice s={})'],['../group__ops.html#ga58bad3c61fd85b95927a987ba1cf5dad',1,'mlx::core::squeeze(const array &amp;a, StreamOrDevice s={})']]],
+  ['src_132',['src',['../struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76',1,'QuantizedBlockLoader::src'],['../structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa',1,'mlx::steel::BlockLoader::src'],['../structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777',1,'mlx::steel::BlockLoaderT::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f',1,'mlx::steel::Conv2DWeightBlockLoader::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src']]],
+  ['src_5fld_133',['src_ld',['../struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e',1,'QuantizedBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d',1,'mlx::steel::BlockLoader::src_ld'],['../structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321',1,'mlx::steel::BlockLoaderT::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f',1,'mlx::steel::Conv2DWeightBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src_ld']]],
+  ['stack_134',['stack',['../group__ops.html#gaf8f2ec2b98a4b59eca73d7471df6e032',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#ga82216209dce901296fc737fe8efa5c94',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
+  ['start_5fcapture_135',['start_capture',['../namespacemlx_1_1core_1_1metal.html#aa47cb5651bf3b65c46ab216b7e504d77',1,'mlx::core::metal']]],
+  ['start_5fconcurrent_136',['start_concurrent',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034',1,'mlx::core::metal::CommandEncoder']]],
+  ['start_5frow_137',['start_row',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a220e033b689c8d6a6f319dae02b38334',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral']]],
+  ['status_138',['Status',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078',1,'mlx::core::array']]],
+  ['status_139',['status',['../classmlx_1_1core_1_1array.html#a7102659be87e9ef62966696ab9b07dad',1,'mlx::core::array']]],
+  ['std_140',['std',['../group__ops.html#ga2a466024f8061febc0a64be557644cb0',1,'mlx::core::std(const array &amp;a, bool keepdims, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#gafdcb04d77c64405a3990078a77dd984c',1,'mlx::core::std(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga7f649970bf38b987b6ef847054f3c2f8',1,'mlx::core::std(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga62721a206df671ef5797449eea97af9f',1,'mlx::core::std(const array &amp;a, int axis, bool keepdims=false, int ddof=0, StreamOrDevice s={})']]],
+  ['steel_5fattention_2eh_141',['steel_attention.h',['../steel__attention_8h.html',1,'']]],
+  ['steel_5fconst_142',['STEEL_CONST',['../steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b',1,'defines.h']]],
+  ['steel_5fconv_143',['steel_conv',['../namespacemlx_1_1core_1_1metal.html#a92f1e559b1121d545746f81ff86eaca1',1,'mlx::core::metal']]],
+  ['steel_5fconv_2eh_144',['steel_conv.h',['../jit_2steel__conv_8h.html',1,'(Global Namespace)'],['../kernels_2steel_2conv_2kernels_2steel__conv_8h.html',1,'(Global Namespace)']]],
+  ['steel_5fconv_5fgeneral_145',['steel_conv_general',['../namespacemlx_1_1core_1_1metal.html#a02edb6a90bdf30f4c9f0d6c25b0267b5',1,'mlx::core::metal']]],
+  ['steel_5fconv_5fgeneral_2eh_146',['steel_conv_general.h',['../steel__conv__general_8h.html',1,'']]],
+  ['steel_5fconv_5fgeneral_5fkernels_147',['steel_conv_general_kernels',['../jit_2steel__conv_8h.html#ae4ca1720029316b08ea92b7662347d47',1,'steel_conv.h']]],
+  ['steel_5fconv_5fkernels_148',['steel_conv_kernels',['../jit_2steel__conv_8h.html#a386d79077465df56659416fd84adb513',1,'steel_conv.h']]],
+  ['steel_5fgemm_2eh_149',['steel_gemm.h',['../steel__gemm_8h.html',1,'']]],
+  ['steel_5fgemm_5ffused_150',['steel_gemm_fused',['../namespacemlx_1_1core_1_1metal.html#a17764366deed71c160fb26091400a803',1,'mlx::core::metal']]],
+  ['steel_5fgemm_5ffused_2eh_151',['steel_gemm_fused.h',['../steel__gemm__fused_8h.html',1,'']]],
+  ['steel_5fgemm_5ffused_5fkernels_152',['steel_gemm_fused_kernels',['../steel__gemm_8h.html#a4c6009fd5357b730805f2fd4ba6e093e',1,'steel_gemm.h']]],
+  ['steel_5fgemm_5fmasked_153',['steel_gemm_masked',['../namespacemlx_1_1core_1_1metal.html#a962272ca73d26c08f76f706a128fd71f',1,'mlx::core::metal']]],
+  ['steel_5fgemm_5fmasked_2eh_154',['steel_gemm_masked.h',['../steel__gemm__masked_8h.html',1,'']]],
+  ['steel_5fgemm_5fmasked_5fkernels_155',['steel_gemm_masked_kernels',['../steel__gemm_8h.html#a62a358fd3ec5365081920d07aceb581c',1,'steel_gemm.h']]],
+  ['steel_5fgemm_5fsplitk_156',['steel_gemm_splitk',['../namespacemlx_1_1core_1_1metal.html#ad0dfd40ba7c09755711ceb731e57a5ac',1,'mlx::core::metal']]],
+  ['steel_5fgemm_5fsplitk_2eh_157',['steel_gemm_splitk.h',['../steel__gemm__splitk_8h.html',1,'']]],
+  ['steel_5fgemm_5fsplitk_5faccum_5faxbpy_5fkernels_158',['steel_gemm_splitk_accum_axbpy_kernels',['../steel__gemm_8h.html#a40a86e1381c241aba8511e51a981a4bf',1,'steel_gemm.h']]],
+  ['steel_5fgemm_5fsplitk_5faccum_5fkernels_159',['steel_gemm_splitk_accum_kernels',['../steel__gemm_8h.html#a144a64b8d94f0371fb144e2cc308fcf9',1,'steel_gemm.h']]],
+  ['steel_5fgemm_5fsplitk_5fkernels_160',['steel_gemm_splitk_kernels',['../steel__gemm_8h.html#a92108ab01d826e38bca83d8569b947d9',1,'steel_gemm.h']]],
+  ['steel_5fmatmul_161',['steel_matmul',['../namespacemlx_1_1core.html#ab43a7633794498e1c6775cca829eb886',1,'mlx::core']]],
+  ['steel_5fmatmul_5fregular_162',['steel_matmul_regular',['../namespacemlx_1_1core.html#a227588758ccc9ee869dba147e830bb74',1,'mlx::core']]],
+  ['steel_5fpragma_5funroll_163',['STEEL_PRAGMA_UNROLL',['../steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6',1,'defines.h']]],
+  ['step_164',['step',['../structmlx_1_1core_1_1_contiguous_iterator.html#ae230bd52b70a0bbdf560090f8a6589ef',1,'mlx::core::ContiguousIterator']]],
+  ['stop_165',['stop',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a456ad1c0c9e731833a2f8411c4ed51aa',1,'mlx::core::scheduler::StreamThread']]],
+  ['stop_5fcapture_166',['stop_capture',['../namespacemlx_1_1core_1_1metal.html#ac90714424e36fb01e04550de69b8314f',1,'mlx::core::metal']]],
+  ['stop_5fgradient_167',['stop_gradient',['../group__ops.html#ga36bc28f1deb2fe668ca9ae1e447b6b1f',1,'mlx::core']]],
+  ['stopgradient_168',['StopGradient',['../classmlx_1_1core_1_1_stop_gradient.html',1,'mlx::core::StopGradient'],['../classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f',1,'mlx::core::StopGradient::StopGradient()']]],
+  ['store_169',['store',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98',1,'mlx::steel::MMATile::store(threadgroup U *dst) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f',1,'mlx::steel::MMATile::store(device U *dst, const int ld) const'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98',1,'mlx::steel::MMATile::store(threadgroup U *dst) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f',1,'mlx::steel::MMATile::store(device U *dst, const int ld) const']]],
+  ['store_5fresult_170',['store_result',['../structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const']]],
+  ['store_5fresult_5fsafe_171',['store_result_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const']]],
+  ['store_5fsafe_172',['store_safe',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba',1,'mlx::steel::MMATile::store_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba',1,'mlx::steel::MMATile::store_safe()']]],
+  ['str_173',['str',['../classpocketfft_1_1detail_1_1arr__info.html#abe1f7b92501b4e0e5a38fd26294ac5a4',1,'pocketfft::detail::arr_info::str'],['../struct_m_l_x_conv_params.html#a862191e8ab1bc8a47aa1396b36d46058',1,'MLXConvParams::str']]],
+  ['stream_174',['Stream',['../structmlx_1_1core_1_1_stream.html',1,'mlx::core::Stream'],['../structmlx_1_1core_1_1_stream.html#a7f0815ff4886da74cbbff5f93d82dd3e',1,'mlx::core::Stream::Stream()']]],
+  ['stream_175',['stream',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a8462e4acffcd385c6248bd7102e6bcb1',1,'mlx::core::scheduler::StreamThread::stream'],['../classmlx_1_1core_1_1_event.html#a193143bad31b68c699fa27f135b45614',1,'mlx::core::Event::stream()'],['../classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a',1,'mlx::core::Primitive::stream()']]],
+  ['stream_2eh_176',['stream.h',['../stream_8h.html',1,'']]],
+  ['streamcontext_177',['StreamContext',['../structmlx_1_1core_1_1_stream_context.html',1,'mlx::core::StreamContext'],['../structmlx_1_1core_1_1_stream_context.html#a89d803151e9d7dce29382aa83d5c6ef1',1,'mlx::core::StreamContext::StreamContext()']]],
+  ['streamordevice_178',['StreamOrDevice',['../namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58',1,'mlx::core']]],
+  ['streamthread_179',['StreamThread',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html',1,'mlx::core::scheduler::StreamThread'],['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#ac528109a11abcb82e6e221c5efa4493c',1,'mlx::core::scheduler::StreamThread::StreamThread()']]],
+  ['stride_180',['stride',['../classpocketfft_1_1detail_1_1arr__info.html#a9d10aa83a1117e75d36f7396b8c2a093',1,'pocketfft::detail::arr_info::stride() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac1f6a9bd6703eceef6003f5f6315d39b',1,'pocketfft::detail::arr_info::stride(size_t i) const']]],
+  ['stride_5fin_181',['stride_in',['../classpocketfft_1_1detail_1_1multi__iter.html#ac947f03b1cfcb63436a7e61ff020a88c',1,'pocketfft::detail::multi_iter']]],
+  ['stride_5fout_182',['stride_out',['../classpocketfft_1_1detail_1_1multi__iter.html#a81d71a13bf0b85e556fbb9834167ecc7',1,'pocketfft::detail::multi_iter']]],
+  ['stride_5ft_183',['stride_t',['../namespacepocketfft_1_1detail.html#afb987c919e9424a996d0fc8b3c23cc84',1,'pocketfft::detail']]],
+  ['strided_5fdevice_5fidx_184',['strided_device_idx',['../struct_read_writer.html#a4c0b12484aac4fd6759d67c190391989',1,'ReadWriter']]],
+  ['strided_5freduce_5fgeneral_5fdispatch_185',['strided_reduce_general_dispatch',['../namespacemlx_1_1core.html#aa0332c64ee9965f05026c30a0b778000',1,'mlx::core']]],
+  ['strided_5fscan_186',['strided_scan',['../scan_8h.html#a7abb6ffb6c3b96b88c2a63cd4cc2f7ae',1,'scan.h']]],
+  ['strided_5fshared_5fidx_187',['strided_shared_idx',['../struct_read_writer.html#ace40adb02cfb33d89c98353327c251fc',1,'ReadWriter']]],
+  ['strides_188',['strides',['../structmlx_1_1core_1_1_reduction_plan.html#a9bf7cae845ab633247c1811613ece8bd',1,'mlx::core::ReductionPlan::strides'],['../struct_indices.html#a7f73d7652f0f751e6a06c2663e329a4a',1,'Indices::strides'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2',1,'mlx::core::fast::CustomKernelShapeInfo::strides'],['../classmlx_1_1core_1_1array.html#a186cf2648da92584d5c1c8b24e69629b',1,'mlx::core::array::strides() const'],['../classmlx_1_1core_1_1array.html#a919f850ca087d1c40aa68f854cb30be2',1,'mlx::core::array::strides(int dim) const']]],
+  ['submit_189',['submit',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a8698d49e8f406cdb88006aac6a91f9a4',1,'pocketfft::detail::threading::thread_pool']]],
+  ['subop_190',['SubOp',['../struct_sub_op.html',1,'']]],
+  ['subtract_191',['Subtract',['../structmlx_1_1core_1_1detail_1_1_subtract.html',1,'mlx::core::detail::Subtract'],['../classmlx_1_1core_1_1_subtract.html',1,'mlx::core::Subtract'],['../struct_subtract.html',1,'Subtract'],['../classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c',1,'mlx::core::Subtract::Subtract()']]],
+  ['subtract_192',['subtract',['../group__ops.html#ga196c240d3d0fcbb4713802c485e15133',1,'mlx::core']]],
+  ['sum_193',['Sum',['../struct_sum.html',1,'Sum&lt; U &gt;'],['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924a1fc7c1f09c80650ab0497e2d6781d65f',1,'mlx::core::distributed::AllReduce::Sum'],['../classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a8582875544f1d3d396a1a376473ef1dd',1,'mlx::core::Reduce::Sum'],['../classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1ade23893033e4849f5596e7ce76a5fc36',1,'mlx::core::Scan::Sum'],['../classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613ca14abe2d8818efa71726be4e156813d6f',1,'mlx::core::Scatter::Sum']]],
+  ['sum_194',['sum',['../namespacemlx_1_1steel.html#ab4a6ddea4beb7c447cf5b69b9d46cc3b',1,'mlx::steel::sum(T x)'],['../namespacemlx_1_1steel.html#acd6e194d37b617d7a5818bc384a97fe4',1,'mlx::steel::sum(T x, Us... us)'],['../group__ops.html#gade905ee92eb6ab7edfc312aeddfbaeb6',1,'mlx::core::sum(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga3627754d7868487bdab1bd83f05d9c81',1,'mlx::core::sum(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaccd0a6be2c5b5128fdc2d87b5c8e67f4',1,'mlx::core::sum(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gafcd39b0bf39a56c26a967981c7ab8a8d',1,'mlx::core::sum(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['sumop_195',['SumOp',['../struct_sum_op.html',1,'']]],
+  ['svd_196',['SVD',['../classmlx_1_1core_1_1_s_v_d.html',1,'mlx::core::SVD'],['../classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1',1,'mlx::core::SVD::SVD()']]],
+  ['svd_197',['svd',['../namespacemlx_1_1core_1_1linalg.html#a64364b880e99914cf47bf756fa8dbaf0',1,'mlx::core::linalg']]],
+  ['swapaxes_198',['swapaxes',['../group__ops.html#gabc46eed81ab6c6247903e4ec0c4ec1fb',1,'mlx::core']]],
+  ['swizzle_199',['swizzle',['../structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760',1,'mlx::steel::BlockSwizzle::swizzle(uint3 tid, const int swizzle_log)'],['../structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760',1,'mlx::steel::BlockSwizzle::swizzle(uint3 tid, const int swizzle_log)']]],
+  ['swizzle_5flog_200',['swizzle_log',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840',1,'mlx::steel::ImplicitGemmConv2DParams::swizzle_log'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7',1,'mlx::steel::GEMMParams::swizzle_log']]],
+  ['synchronize_201',['synchronize',['../namespacemlx_1_1core.html#a14287949d82ffefad0306cef5eb5f9e4',1,'mlx::core::synchronize()'],['../namespacemlx_1_1core.html#a6648a71937b055e5ff513d98056c2fb5',1,'mlx::core::synchronize(Stream)']]]
 ];
diff --git a/docs/build/html/search/all_14.js b/docs/build/html/search/all_14.js
index aadbb7343..b9e7c8c6e 100644
--- a/docs/build/html/search/all_14.js
+++ b/docs/build/html/search/all_14.js
@@ -19,10 +19,10 @@ var searchData=
   ['tensordot_16',['tensordot',['../group__ops.html#gaf5c9735f4690327e1500e04e728fae70',1,'mlx::core::tensordot(const array &amp;a, const array &amp;b, const int axis=2, StreamOrDevice s={})'],['../group__ops.html#gad7fe00b566f89d607639c1a497cabbc6',1,'mlx::core::tensordot(const array &amp;a, const array &amp;b, const std::vector&lt; int &gt; &amp;axes_a, const std::vector&lt; int &gt; &amp;axes_b, StreamOrDevice s={})']]],
   ['ternary_17',['ternary',['../namespacemlx_1_1core_1_1metal.html#a2d1c92ba6897c0a7a428fed63279b61f',1,'mlx::core::metal']]],
   ['ternary_2eh_18',['ternary.h',['../common_2ternary_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2ternary_8h.html',1,'(Global Namespace)'],['../metal_2ternary_8h.html',1,'(Global Namespace)']]],
-  ['ternary_5fg_19',['ternary_g',['../metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb',1,'ternary.h']]],
+  ['ternary_5fg_19',['ternary_g',['../metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72',1,'ternary.h']]],
   ['ternary_5fg_5fnd1_20',['ternary_g_nd1',['../metal_2kernels_2ternary_8h.html#a1bd5918559850f3f80e3adee2391fe6a',1,'ternary.h']]],
-  ['ternary_5fg_5fnd2_21',['ternary_g_nd2',['../metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8',1,'ternary.h']]],
-  ['ternary_5fg_5fnd3_22',['ternary_g_nd3',['../metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047',1,'ternary.h']]],
+  ['ternary_5fg_5fnd2_21',['ternary_g_nd2',['../metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d',1,'ternary.h']]],
+  ['ternary_5fg_5fnd3_22',['ternary_g_nd3',['../metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4',1,'ternary.h']]],
   ['ternary_5fop_5fgpu_23',['ternary_op_gpu',['../namespacemlx_1_1core.html#aa63e62b6d3906e4cac871d498515a1cd',1,'mlx::core']]],
   ['ternary_5fop_5fgpu_5finplace_24',['ternary_op_gpu_inplace',['../namespacemlx_1_1core.html#a37645c0adccb3eb46844115def1a68d7',1,'mlx::core']]],
   ['ternary_5fops_25',['ternary_ops',['../namespacemlx_1_1core_1_1metal.html#a11b593b07e9a33e5f78fe4695fb99ec9',1,'mlx::core::metal']]],
@@ -39,7 +39,7 @@ var searchData=
   ['thread_5fcount_36',['thread_count',['../structpocketfft_1_1detail_1_1util.html#a3b012d5a19215bcd32cf6e228556fa87',1,'pocketfft::detail::util']]],
   ['thread_5ffn_37',['thread_fn',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a06a62c21c1174e4eb4d242e50aad7adf',1,'mlx::core::scheduler::StreamThread']]],
   ['thread_5fid_38',['thread_id',['../namespacepocketfft_1_1detail_1_1threading.html#aebe85d6273d92c7d3728e2c621ccc82b',1,'pocketfft::detail::threading']]],
-  ['thread_5fidx_39',['thread_idx',['../struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475',1,'QuantizedBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11',1,'mlx::steel::Conv2DWeightBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::thread_idx'],['../structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b',1,'mlx::steel::BlockLoader::thread_idx']]],
+  ['thread_5fidx_39',['thread_idx',['../struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475',1,'QuantizedBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b',1,'mlx::steel::BlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da',1,'mlx::steel::BlockLoaderT::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11',1,'mlx::steel::Conv2DWeightBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::thread_idx']]],
   ['thread_5fmap_40',['thread_map',['../namespacepocketfft_1_1detail_1_1threading.html#a4fcf674db39f0e2c1c59d48491daed6e',1,'pocketfft::detail::threading']]],
   ['thread_5fpool_41',['thread_pool',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html',1,'pocketfft::detail::threading::thread_pool'],['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a37a8121a99dd06a9d44b3e80ba0ea560',1,'pocketfft::detail::threading::thread_pool::thread_pool(size_t nthreads)'],['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#aefaadaa60c0183b862ad96338177a5e0',1,'pocketfft::detail::threading::thread_pool::thread_pool()'],['../namespacemlx_1_1core_1_1io.html#a05f27b765443a178a972abae772e863d',1,'mlx::core::io::thread_pool()']]],
   ['thread_5freduce_42',['thread_reduce',['../reduce__row_8h.html#afd80a25fa84e6cc884dcc8698859ade1',1,'reduce_row.h']]],
@@ -56,11 +56,11 @@ var searchData=
   ['threefry2x32_5fhash_53',['threefry2x32_hash',['../namespacemlx_1_1core_1_1random.html#ac7e92c89a2bac1b0bed922a3d4c3c66b',1,'mlx::core::random']]],
   ['tile_54',['tile',['../group__ops.html#gab105a57b9a4d84496fe1e4d60e13d361',1,'mlx::core']]],
   ['tile_5fmatmad_55',['tile_matmad',['../namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad',1,'mlx::steel']]],
-  ['tile_5fstride_56',['tile_stride',['../struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320',1,'QuantizedBlockLoader::tile_stride'],['../structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d',1,'mlx::steel::BlockLoader::tile_stride']]],
+  ['tile_5fstride_56',['tile_stride',['../struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320',1,'QuantizedBlockLoader::tile_stride'],['../structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d',1,'mlx::steel::BlockLoader::tile_stride'],['../structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f',1,'mlx::steel::BlockLoaderT::tile_stride']]],
   ['tile_5fstride_5fa_57',['tile_stride_a',['../structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330',1,'mlx::steel::BlockMMA']]],
   ['tile_5fstride_5fb_58',['tile_stride_b',['../structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4',1,'mlx::steel::BlockMMA']]],
-  ['tiles_5fm_59',['tiles_m',['../struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad',1,'MLXFastAttentionParams::tiles_m'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca',1,'mlx::steel::GEMMParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b',1,'mlx::steel::GEMMSpiltKParams::tiles_m']]],
-  ['tiles_5fn_60',['tiles_n',['../struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029',1,'MLXFastAttentionParams::tiles_n'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed',1,'mlx::steel::GEMMParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6',1,'mlx::steel::GEMMSpiltKParams::tiles_n']]],
+  ['tiles_5fm_59',['tiles_m',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca',1,'mlx::steel::GEMMParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b',1,'mlx::steel::GEMMSpiltKParams::tiles_m']]],
+  ['tiles_5fn_60',['tiles_n',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed',1,'mlx::steel::GEMMParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6',1,'mlx::steel::GEMMSpiltKParams::tiles_n']]],
   ['tm_61',['TM',['../structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591',1,'mlx::steel::BlockMMA']]],
   ['tm_5fstride_62',['TM_stride',['../structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307',1,'mlx::steel::BlockMMA']]],
   ['tn_63',['TN',['../structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424',1,'mlx::steel::BlockMMA']]],
@@ -69,25 +69,26 @@ var searchData=
   ['to_5fstream_66',['to_stream',['../namespacemlx_1_1core.html#a4734a596e57434492ddfe79f2cb9dbf9',1,'mlx::core']]],
   ['topk_67',['topk',['../group__ops.html#ga5487dd887c43e5341f3e68ffe47f0f5a',1,'mlx::core::topk(const array &amp;a, int k, StreamOrDevice s={})'],['../group__ops.html#ga35b8436c79ff953f6c809598b646f498',1,'mlx::core::topk(const array &amp;a, int k, int axis, StreamOrDevice s={})']]],
   ['trace_68',['trace',['../group__ops.html#gabf786129c7660ed8d5acb5499bc6fefd',1,'mlx::core::trace(const array &amp;a, int offset, int axis1, int axis2, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga5ed43c2dbf7d6cbddbaa2fd682deaafd',1,'mlx::core::trace(const array &amp;a, int offset, int axis1, int axis2, StreamOrDevice s={})'],['../group__ops.html#gaf25c00108feaafaa6350a4434cb0062e',1,'mlx::core::trace(const array &amp;a, StreamOrDevice s={})']]],
-  ['transformadd_69',['TransformAdd',['../structmlx_1_1steel_1_1_transform_add.html',1,'mlx::steel::TransformAdd&lt; OutT, InT &gt;'],['../structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae',1,'mlx::steel::TransformAdd::TransformAdd()']]],
-  ['transformaxpby_70',['TransformAxpby',['../structmlx_1_1steel_1_1_transform_axpby.html',1,'mlx::steel::TransformAxpby&lt; OutT, InT &gt;'],['../structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9',1,'mlx::steel::TransformAxpby::TransformAxpby()']]],
+  ['transformadd_69',['TransformAdd',['../structmlx_1_1steel_1_1_transform_add.html',1,'mlx::steel::TransformAdd&lt; OutT, InT &gt;'],['../structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae',1,'mlx::steel::TransformAdd::TransformAdd(const float, const float)'],['../structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae',1,'mlx::steel::TransformAdd::TransformAdd(const float, const float)']]],
+  ['transformaxpby_70',['TransformAxpby',['../structmlx_1_1steel_1_1_transform_axpby.html',1,'mlx::steel::TransformAxpby&lt; OutT, InT &gt;'],['../structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9',1,'mlx::steel::TransformAxpby::TransformAxpby(const float alpha_, const float beta_)'],['../structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9',1,'mlx::steel::TransformAxpby::TransformAxpby(const float alpha_, const float beta_)']]],
   ['transformnone_71',['TransformNone',['../structmlx_1_1steel_1_1_transform_none.html',1,'mlx::steel']]],
-  ['transforms_2eh_72',['transforms.h',['../backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html',1,'(Global Namespace)'],['../transforms_8h.html',1,'(Global Namespace)']]],
+  ['transforms_2eh_72',['transforms.h',['../backend_2metal_2kernels_2steel_2attn_2transforms_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html',1,'(Global Namespace)'],['../transforms_8h.html',1,'(Global Namespace)']]],
   ['transforms_5fimpl_2eh_73',['transforms_impl.h',['../transforms__impl_8h.html',1,'']]],
-  ['transpose_74',['Transpose',['../classmlx_1_1core_1_1_transpose.html',1,'mlx::core::Transpose'],['../classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a',1,'mlx::core::Transpose::Transpose()']]],
-  ['transpose_75',['transpose',['../group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b',1,'mlx::core::transpose(const array &amp;a, std::vector&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga260ac332956f3a6bf1dfdb9095c84dc5',1,'mlx::core::transpose(const array &amp;a, std::initializer_list&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga68da0176fefbe0c0096783c6fd926c6a',1,'mlx::core::transpose(const array &amp;a, StreamOrDevice s={})']]],
-  ['tri_76',['tri',['../group__ops.html#ga4f3389e5b89e70e862e7d2b40d6c7f78',1,'mlx::core::tri(int n, int m, int k, Dtype type, StreamOrDevice s={})'],['../group__ops.html#gac19a1bd6ed6d5c7bc9d258820189dbb5',1,'mlx::core::tri(int n, Dtype type, StreamOrDevice s={})']]],
-  ['tri_5finv_77',['tri_inv',['../namespacemlx_1_1core_1_1linalg.html#aba1994571326326717b5b5e38c2e0661',1,'mlx::core::linalg']]],
-  ['tril_78',['tril',['../group__ops.html#ga83e0bb45dc770cf014531d873b78c5a2',1,'mlx::core']]],
-  ['triu_79',['triu',['../group__ops.html#gaa9df5917876eeb0cb28b7fa81f880412',1,'mlx::core']]],
-  ['trows_80',['TROWS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a2528ff5ed472e4ed35415ada42276b07',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3957fb263fe040fe70683fd1d7b06487',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a94f0ce5bb7d87bc1fb6a7c2ba2b892d4',1,'mlx::steel::Conv2DWeightBlockLoader::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a789683f9ac9d9309d07c05f3bdedd2fd',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a5cefb1285ed13ad3490198e9303453de',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a2aff22af70f685f858adea73f5575cf7',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acbc28f364381166faaeec2783dc88e10',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::TROWS']]],
-  ['true_5ftype_81',['true_type',['../namespacemlx_1_1steel.html#a594a6ccb75b38b5ae4ddd0d9ad047b3a',1,'mlx::steel']]],
-  ['trunc_82',['trunc',['../namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887',1,'metal::trunc()'],['../namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415',1,'metal::fast::trunc()'],['../namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27',1,'metal::precise::trunc()']]],
-  ['truncated_5fnormal_83',['truncated_normal',['../namespacemlx_1_1core_1_1random.html#a00aa5746bac6d729d2ba9465153bb279',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a39663eda0fd7b274d01499a7b1c9035f',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
-  ['try_5fpop_84',['try_pop',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#aa3807d46a126d229f9054c779105ea43',1,'pocketfft::detail::threading::concurrent_queue']]],
-  ['two_85',['two',['../classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421a41877eab6fa3db7d7ed2cda9eba14251',1,'mlx::core::Log']]],
-  ['type_86',['type',['../structpocketfft_1_1detail_1_1add__vec.html#a7568dc83136c1b41eb71dcb78527227e',1,'pocketfft::detail::add_vec::type'],['../structpocketfft_1_1detail_1_1add__vec_3_01cmplx_3_01_t_01_4_01_4.html#a257b1c81fb9f559c48ee90497013494e',1,'pocketfft::detail::add_vec&lt; cmplx&lt; T &gt; &gt;::type'],['../structmlx_1_1steel_1_1integral__constant.html#a6492c15b37d160d3a33e1cbe770aa3f1',1,'mlx::steel::integral_constant::type'],['../structmetal_1_1make__void.html#aee74916713465374928c5379ab0d9b75',1,'metal::make_void::type'],['../structmetal_1_1pointer__element_3_01thread_01_t_01_5_01_4.html#a98fbc2aa99dd26bb35aa9cd1826318d8',1,'metal::pointer_element&lt; thread T * &gt;::type'],['../structmetal_1_1pointer__element_3_01device_01_t_01_5_01_4.html#ab36a7c5a64c0693dd3d8ccb322c163d4',1,'metal::pointer_element&lt; device T * &gt;::type'],['../structmetal_1_1pointer__element_3_01constant_01_t_01_5_01_4.html#ad154b55b9e450a6376016488c8e68c53',1,'metal::pointer_element&lt; constant T * &gt;::type'],['../structmetal_1_1pointer__element_3_01threadgroup_01_t_01_5_01_4.html#a78c718d6da9d393c139a385f42472362',1,'metal::pointer_element&lt; threadgroup T * &gt;::type'],['../structpocketfft_1_1detail_1_1_exec_dcst.html#a9b170cbd74a9c6f45ac014ce349219ea',1,'pocketfft::detail::ExecDcst::type'],['../structmlx_1_1core_1_1_reduction_plan.html#a24e407f13d4d02156380ecc1a6748a76',1,'mlx::core::ReductionPlan::type'],['../structmlx_1_1core_1_1_device.html#a763264ec90f7f23c5dced36c3f0db2e5',1,'mlx::core::Device::type']]],
-  ['type_5fto_5fname_87',['type_to_name',['../namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae',1,'mlx::core']]],
-  ['type_5ftraits_2eh_88',['type_traits.h',['../type__traits_8h.html',1,'']]],
-  ['typetodtype_89',['TypeToDtype',['../structmlx_1_1core_1_1_type_to_dtype.html',1,'mlx::core']]]
+  ['transformscale_74',['TransformScale',['../struct_transform_scale.html',1,'TransformScale&lt; T &gt;'],['../struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70',1,'TransformScale::TransformScale()']]],
+  ['transpose_75',['Transpose',['../classmlx_1_1core_1_1_transpose.html',1,'mlx::core::Transpose'],['../classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a',1,'mlx::core::Transpose::Transpose()']]],
+  ['transpose_76',['transpose',['../group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b',1,'mlx::core::transpose(const array &amp;a, std::vector&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga260ac332956f3a6bf1dfdb9095c84dc5',1,'mlx::core::transpose(const array &amp;a, std::initializer_list&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga68da0176fefbe0c0096783c6fd926c6a',1,'mlx::core::transpose(const array &amp;a, StreamOrDevice s={})']]],
+  ['tri_77',['tri',['../group__ops.html#ga4f3389e5b89e70e862e7d2b40d6c7f78',1,'mlx::core::tri(int n, int m, int k, Dtype type, StreamOrDevice s={})'],['../group__ops.html#gac19a1bd6ed6d5c7bc9d258820189dbb5',1,'mlx::core::tri(int n, Dtype type, StreamOrDevice s={})']]],
+  ['tri_5finv_78',['tri_inv',['../namespacemlx_1_1core_1_1linalg.html#aba1994571326326717b5b5e38c2e0661',1,'mlx::core::linalg']]],
+  ['tril_79',['tril',['../group__ops.html#ga83e0bb45dc770cf014531d873b78c5a2',1,'mlx::core']]],
+  ['triu_80',['triu',['../group__ops.html#gaa9df5917876eeb0cb28b7fa81f880412',1,'mlx::core']]],
+  ['trows_81',['TROWS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a2528ff5ed472e4ed35415ada42276b07',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3957fb263fe040fe70683fd1d7b06487',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a94f0ce5bb7d87bc1fb6a7c2ba2b892d4',1,'mlx::steel::Conv2DWeightBlockLoader::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a789683f9ac9d9309d07c05f3bdedd2fd',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a5cefb1285ed13ad3490198e9303453de',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a2aff22af70f685f858adea73f5575cf7',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::TROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acbc28f364381166faaeec2783dc88e10',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::TROWS']]],
+  ['true_5ftype_82',['true_type',['../namespacemlx_1_1steel.html#a594a6ccb75b38b5ae4ddd0d9ad047b3a',1,'mlx::steel']]],
+  ['trunc_83',['trunc',['../namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887',1,'metal::trunc()'],['../namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415',1,'metal::fast::trunc()'],['../namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27',1,'metal::precise::trunc()']]],
+  ['truncated_5fnormal_84',['truncated_normal',['../namespacemlx_1_1core_1_1random.html#a00aa5746bac6d729d2ba9465153bb279',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a39663eda0fd7b274d01499a7b1c9035f',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
+  ['try_5fpop_85',['try_pop',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#aa3807d46a126d229f9054c779105ea43',1,'pocketfft::detail::threading::concurrent_queue']]],
+  ['two_86',['two',['../classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421a41877eab6fa3db7d7ed2cda9eba14251',1,'mlx::core::Log']]],
+  ['type_87',['type',['../structpocketfft_1_1detail_1_1add__vec.html#a7568dc83136c1b41eb71dcb78527227e',1,'pocketfft::detail::add_vec::type'],['../structpocketfft_1_1detail_1_1add__vec_3_01cmplx_3_01_t_01_4_01_4.html#a257b1c81fb9f559c48ee90497013494e',1,'pocketfft::detail::add_vec&lt; cmplx&lt; T &gt; &gt;::type'],['../structmlx_1_1steel_1_1integral__constant.html#a6492c15b37d160d3a33e1cbe770aa3f1',1,'mlx::steel::integral_constant::type'],['../structmetal_1_1make__void.html#aee74916713465374928c5379ab0d9b75',1,'metal::make_void::type'],['../structmetal_1_1pointer__element_3_01thread_01_t_01_5_01_4.html#a98fbc2aa99dd26bb35aa9cd1826318d8',1,'metal::pointer_element&lt; thread T * &gt;::type'],['../structmetal_1_1pointer__element_3_01device_01_t_01_5_01_4.html#ab36a7c5a64c0693dd3d8ccb322c163d4',1,'metal::pointer_element&lt; device T * &gt;::type'],['../structmetal_1_1pointer__element_3_01constant_01_t_01_5_01_4.html#ad154b55b9e450a6376016488c8e68c53',1,'metal::pointer_element&lt; constant T * &gt;::type'],['../structmetal_1_1pointer__element_3_01threadgroup_01_t_01_5_01_4.html#a78c718d6da9d393c139a385f42472362',1,'metal::pointer_element&lt; threadgroup T * &gt;::type'],['../structpocketfft_1_1detail_1_1_exec_dcst.html#a9b170cbd74a9c6f45ac014ce349219ea',1,'pocketfft::detail::ExecDcst::type'],['../structmlx_1_1core_1_1_reduction_plan.html#a24e407f13d4d02156380ecc1a6748a76',1,'mlx::core::ReductionPlan::type'],['../structmlx_1_1core_1_1_device.html#a763264ec90f7f23c5dced36c3f0db2e5',1,'mlx::core::Device::type']]],
+  ['type_5fto_5fname_88',['type_to_name',['../namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164',1,'mlx::core::type_to_name(const Dtype &amp;t)'],['../namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae',1,'mlx::core::type_to_name(const array &amp;a)']]],
+  ['type_5ftraits_2eh_89',['type_traits.h',['../type__traits_8h.html',1,'']]],
+  ['typetodtype_90',['TypeToDtype',['../structmlx_1_1core_1_1_type_to_dtype.html',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/all_15.js b/docs/build/html/search/all_15.js
index 2b9eaeeb7..a19220bcd 100644
--- a/docs/build/html/search/all_15.js
+++ b/docs/build/html/search/all_15.js
@@ -2,13 +2,13 @@ var searchData=
 [
   ['u_0',['u',['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715a7b774effe4a349c6dd82ad4f4f21d34c',1,'mlx::core::Dtype::u'],['../types_2bf16_8h.html#aa21e554721eddcf127b7fcfa7fdc56bd',1,'u:&#160;bf16.h'],['../fp16_8h.html#aa21e554721eddcf127b7fcfa7fdc56bd',1,'u:&#160;fp16.h']]],
   ['uint16_1',['uint16',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daaa00ef2ef85ff67b7b39339886f19044f',1,'mlx::core::Dtype::uint16'],['../namespacemlx_1_1core.html#a312a70c487366968af5e6cbf5038c812',1,'mlx::core::uint16']]],
-  ['uint16_5fto_5fbfloat16_2',['uint16_to_bfloat16',['../bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0',1,'bf16_math.h']]],
+  ['uint16_5fto_5fbfloat16_2',['uint16_to_bfloat16',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4',1,'uint16_to_bfloat16(const uint16_t x):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4',1,'uint16_to_bfloat16(const uint16_t x):&#160;bf16.h']]],
   ['uint32_3',['uint32',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa3de84ad0700f2a1571f633d399e1900e',1,'mlx::core::Dtype::uint32'],['../namespacemlx_1_1core.html#ac63820d6fe10545907c33faf466a929e',1,'mlx::core::uint32']]],
   ['uint64_4',['uint64',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa2e8d31865e5d4b9d8611e1b991baed07',1,'mlx::core::Dtype::uint64'],['../namespacemlx_1_1core.html#a1f42e3dd4787d2ecec7114a12daefec8',1,'mlx::core::uint64']]],
   ['uint8_5',['uint8',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa5f423e669d0a8f4ab7c4c3e6da27161a',1,'mlx::core::Dtype::uint8'],['../namespacemlx_1_1core.html#a9778d50afbf456b0bd738751243b3b68',1,'mlx::core::uint8']]],
   ['unary_6',['unary',['../namespacemlx_1_1core_1_1metal.html#afac64fd56ac492d6baf6de7e8a00b039',1,'mlx::core::metal']]],
   ['unary_2eh_7',['unary.h',['../common_2unary_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2unary_8h.html',1,'(Global Namespace)'],['../metal_2unary_8h.html',1,'(Global Namespace)']]],
-  ['unary_5fg_8',['unary_g',['../metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5',1,'unary.h']]],
+  ['unary_5fg_8',['unary_g',['../metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d',1,'unary.h']]],
   ['unary_5fop_5fgpu_9',['unary_op_gpu',['../namespacemlx_1_1core.html#aba2b4accc059f30d4dca88db9f7a6e13',1,'mlx::core']]],
   ['unary_5fop_5fgpu_5finplace_10',['unary_op_gpu_inplace',['../namespacemlx_1_1core.html#a668fde2bd280a88f63a68b68a343d375',1,'mlx::core']]],
   ['unary_5fops_11',['unary_ops',['../namespacemlx_1_1core_1_1metal.html#a17b471fa52ea5f24ee63e081f46528f5',1,'mlx::core::metal']]],
@@ -20,8 +20,9 @@ var searchData=
   ['uniform_17',['uniform',['../namespacemlx_1_1core_1_1random.html#adaa626cf75ab891978954bd1eb79a38b',1,'mlx::core::random::uniform(const array &amp;low, const array &amp;high, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#ac0dd00f7e37377d621f9f5bfb5a3f8e4',1,'mlx::core::random::uniform(T low, U high, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a2922e133d9f82dcf925bae0a784cc4a7',1,'mlx::core::random::uniform(const std::vector&lt; int &gt; &amp;shape, Dtype dtype, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a133d2855ff4d8daf41029cffdf43cdf9',1,'mlx::core::random::uniform(const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
   ['unscheduled_18',['unscheduled',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078ae8a9988458b0355001674020a45656fb',1,'mlx::core::array']]],
   ['unsignedinteger_19',['unsignedinteger',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2da9c035d4e66b2c72f583cde964cf3a0d3',1,'mlx::core::Dtype::unsignedinteger'],['../namespacemlx_1_1core.html#a42e9706a5521bb25eaf12ccad94bfc81',1,'mlx::core::unsignedinteger']]],
-  ['use_5fout_5fsource_20',['use_out_source',['../steel__gemm__fused_8h.html#a3fe4e4382bda8a419557a5e6f77bc084',1,'steel_gemm_fused.h']]],
-  ['util_21',['util',['../structpocketfft_1_1detail_1_1util.html',1,'pocketfft::detail']]],
-  ['utils_22',['utils',['../namespacemlx_1_1core_1_1metal.html#a529dc6c2d4a37ba544b66b2c3cd792cc',1,'mlx::core::metal']]],
-  ['utils_2eh_23',['utils.h',['../backend_2accelerate_2utils_8h.html',1,'(Global Namespace)'],['../backend_2common_2utils_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2steel_2utils_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2utils_8h.html',1,'(Global Namespace)'],['../backend_2metal_2utils_8h.html',1,'(Global Namespace)'],['../utils_8h.html',1,'(Global Namespace)']]]
+  ['update_5ffence_20',['update_fence',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2',1,'mlx::core::metal::CommandEncoder']]],
+  ['use_5fout_5fsource_21',['use_out_source',['../steel__gemm__fused_8h.html#a3fe4e4382bda8a419557a5e6f77bc084',1,'steel_gemm_fused.h']]],
+  ['util_22',['util',['../structpocketfft_1_1detail_1_1util.html',1,'pocketfft::detail']]],
+  ['utils_23',['utils',['../namespacemlx_1_1core_1_1metal.html#a529dc6c2d4a37ba544b66b2c3cd792cc',1,'mlx::core::metal']]],
+  ['utils_2eh_24',['utils.h',['../backend_2accelerate_2utils_8h.html',1,'(Global Namespace)'],['../backend_2common_2utils_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2steel_2utils_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2utils_8h.html',1,'(Global Namespace)'],['../backend_2metal_2utils_8h.html',1,'(Global Namespace)'],['../utils_8h.html',1,'(Global Namespace)']]]
 ];
diff --git a/docs/build/html/search/all_16.js b/docs/build/html/search/all_16.js
index 7c6cf43a6..ffbd5899f 100644
--- a/docs/build/html/search/all_16.js
+++ b/docs/build/html/search/all_16.js
@@ -1,27 +1,28 @@
 var searchData=
 [
   ['v_0',['V',['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715a5206560a306a2e085a437fd258eb57ce',1,'mlx::core::Dtype']]],
-  ['v_1',['v',['../structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347',1,'mlx::steel::BlockLoader::ReadVector']]],
-  ['val_2',['Val',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1da',1,'mlx::core::Dtype']]],
-  ['val_3',['val',['../structpocketfft_1_1detail_1_1_v_l_e_n.html#ab1fdc340dedde723e636746c828a4534',1,'pocketfft::detail::VLEN::val'],['../structmlx__atomic.html#a6f6651b8dd8149917c50cd99b13c6747',1,'mlx_atomic::val'],['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html#a8dbf729fcd8c4a16e41b546c7405543d',1,'mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;::val'],['../structmlx_1_1core_1_1_dtype.html#a7a99656f121c8922ab82e72c8e9bd7f1',1,'mlx::core::Dtype::val()']]],
-  ['val_5ffrags_4',['val_frags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44',1,'mlx::steel::MMATile']]],
-  ['val_5ft_5',['val_t',['../struct_kernel_merge_sort.html#a4e3f09896275956fc4c23e1f157dca3b',1,'KernelMergeSort']]],
-  ['valid_6',['valid',['../classmlx_1_1core_1_1_event.html#aa77afd9669e2ef9d5e9ae1c2c6fd24fa',1,'mlx::core::Event']]],
-  ['value_7',['value',['../structmlx_1_1steel_1_1integral__constant.html#a4efa69cb3fd42ac0dcad46578600d637',1,'mlx::steel::integral_constant::value'],['../classmlx_1_1core_1_1_event.html#ab71c7baee3d1d02ad6a2001bbf90b970',1,'mlx::core::Event::value()']]],
-  ['value_5fand_5fgrad_8',['value_and_grad',['../namespacemlx_1_1core.html#abf49b337a00997231c0f7fd389efa8f3',1,'mlx::core::value_and_grad(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;argnums)'],['../namespacemlx_1_1core.html#a7b987f404b8699de00f9e0099ab6b1b0',1,'mlx::core::value_and_grad(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, int argnum=0)'],['../namespacemlx_1_1core.html#a5a64dc878b29403d27e50bd7a288cc04',1,'mlx::core::value_and_grad(const std::function&lt; array(const array &amp;)&gt; &amp;fun)'],['../namespacemlx_1_1core.html#a7620f1ae298127cb6181db9162f012a7',1,'mlx::core::value_and_grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;argnums)'],['../namespacemlx_1_1core.html#a2f69ffc30d66b1fca8f24b65be161a51',1,'mlx::core::value_and_grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, int argnum=0)']]],
-  ['value_5ftype_9',['value_type',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#ab86a2740ed9ce3199135372ff1d88c76',1,'pocketfft::detail::threading::aligned_allocator::value_type'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae24fe304397e961687d0d4c7012b8ae4',1,'mlx::core::array::ArrayIterator::value_type'],['../structmlx_1_1steel_1_1integral__constant.html#a0569cc1334e0bc4f474304b33d365759',1,'mlx::steel::integral_constant::value_type']]],
-  ['valueandgradfn_10',['ValueAndGradFn',['../namespacemlx_1_1core.html#ab79d66ddf1ec38b2f2c01234892a2230',1,'mlx::core']]],
-  ['var_11',['var',['../group__ops.html#ga7e133df686439588a8cd1fb10ce0c6e9',1,'mlx::core::var(const array &amp;a, bool keepdims, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga7d7b38d118fa2613214078ef0f7d5a42',1,'mlx::core::var(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga78ddeb966cbe7a5b0aa17e1de43025f2',1,'mlx::core::var(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga4fbf3e3f98f2e4956faf87af320aa9d0',1,'mlx::core::var(const array &amp;a, int axis, bool keepdims=false, int ddof=0, StreamOrDevice s={})']]],
-  ['vec_5fsize_12',['vec_size',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2',1,'mlx::steel::Conv2DWeightBlockLoader::vec_size'],['../structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925',1,'mlx::steel::ChannelHelper::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75',1,'mlx::steel::ChannelHelper&lt; 1 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af',1,'mlx::steel::ChannelHelper&lt; 2 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f',1,'mlx::steel::ChannelHelper&lt; 3 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca',1,'mlx::steel::ChannelHelper&lt; 4 
&gt;::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::vec_size'],['../structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092',1,'mlx::steel::BlockLoader::vec_size']]],
-  ['vector_13',['Vector',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337a57dea6f5039281b7fee517fc43bf3110',1,'mlx::core']]],
-  ['view_14',['View',['../classmlx_1_1core_1_1_view.html',1,'mlx::core::View'],['../classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e',1,'mlx::core::View::View()']]],
-  ['view_15',['view',['../group__ops.html#ga3602aa91b7b124a0b41ec1b2137a1b02',1,'mlx::core']]],
-  ['vjp_16',['vjp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225',1,'mlx::core::distributed::AllReduce::vjp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb',1,'mlx::core::distributed::AllGather::vjp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91',1,'mlx::core::fast::Custom::vjp()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb',1,'mlx::core::fast::RMSNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6',1,'mlx::core::fast::LayerNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533',1,'mlx::core::fast::RoPE::vjp()'],['../classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42',1,'mlx::core::Primitive::vjp()'],['../classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592',1,'mlx::core::Abs::vjp()'],['../classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607',1,'mlx::core::Add::vjp()'],['../classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6',1,'mlx::core::AddMM::vjp()'],['../classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92',1,'mlx::core::ArcCos::vjp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26',1,'mlx::core::ArcCosh::vjp()'],['../classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1',1,'mlx::core::ArcSin::vjp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e',1,'mlx::core::ArcSinh::vjp()'],['../classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2',1,'mlx::core::ArcTan::vjp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2',1,'mlx::core::ArcTan2::vjp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72',1,'mlx::core::ArcTanh::vjp()'],['../classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0',1,'m
lx::core::ArgPartition::vjp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a',1,'mlx::core::ArgReduce::vjp()'],['../classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18',1,'mlx::core::AsType::vjp()'],['../classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062',1,'mlx::core::AsStrided::vjp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61',1,'mlx::core::BitwiseBinary::vjp()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120',1,'mlx::core::BlockMaskedMM::vjp()'],['../classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda',1,'mlx::core::GatherMM::vjp()'],['../classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18',1,'mlx::core::Broadcast::vjp()'],['../classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb',1,'mlx::core::Ceil::vjp()'],['../classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133',1,'mlx::core::Compiled::vjp()'],['../classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0',1,'mlx::core::Concatenate::vjp()'],['../classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690',1,'mlx::core::Convolution::vjp()'],['../classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd',1,'mlx::core::Copy::vjp()'],['../classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00',1,'mlx::core::Cos::vjp()'],['../classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4',1,'mlx::core::Cosh::vjp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209',1,'mlx::core::CustomTransforms::vjp()'],['../classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0',1,'mlx::core::Depends::vjp()'],['../classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6',1,'mlx::core::Divide::vjp()'],['../classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1',1,'mlx::core::DivMod::vjp()'],['../classmlx_1_1core_1_1
_select.html#a9b522487b78fceeca7f827cd1c29a9a3',1,'mlx::core::Select::vjp()'],['../classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6',1,'mlx::core::Remainder::vjp()'],['../classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736',1,'mlx::core::Equal::vjp()'],['../classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909',1,'mlx::core::Erf::vjp()'],['../classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189',1,'mlx::core::ErfInv::vjp()'],['../classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8',1,'mlx::core::Exp::vjp()'],['../classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43',1,'mlx::core::Expm1::vjp()'],['../classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090',1,'mlx::core::FFT::vjp()'],['../classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e',1,'mlx::core::Floor::vjp()'],['../classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969',1,'mlx::core::Full::vjp()'],['../classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426',1,'mlx::core::Gather::vjp()'],['../classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679',1,'mlx::core::Greater::vjp()'],['../classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee',1,'mlx::core::GreaterEqual::vjp()'],['../classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656',1,'mlx::core::Hadamard::vjp()'],['../classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b',1,'mlx::core::Imag::vjp()'],['../classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50',1,'mlx::core::Less::vjp()'],['../classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028',1,'mlx::core::LessEqual::vjp()'],['../classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280',1,'mlx::core::Log::vjp()'],['../classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880',1,'mlx::core::Log1p::vjp()'],['../classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50'
,1,'mlx::core::LogicalNot::vjp()'],['../classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54',1,'mlx::core::LogicalAnd::vjp()'],['../classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847',1,'mlx::core::LogicalOr::vjp()'],['../classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4',1,'mlx::core::LogAddExp::vjp()'],['../classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0',1,'mlx::core::Matmul::vjp()'],['../classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3',1,'mlx::core::Maximum::vjp()'],['../classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204',1,'mlx::core::Minimum::vjp()'],['../classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1',1,'mlx::core::Multiply::vjp()'],['../classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a',1,'mlx::core::Negative::vjp()'],['../classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b',1,'mlx::core::NotEqual::vjp()'],['../classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038',1,'mlx::core::Pad::vjp()'],['../classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9',1,'mlx::core::Partition::vjp()'],['../classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082',1,'mlx::core::Power::vjp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26',1,'mlx::core::QuantizedMatmul::vjp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab',1,'mlx::core::GatherQMM::vjp()'],['../classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe',1,'mlx::core::Real::vjp()'],['../classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365',1,'mlx::core::Reshape::vjp()'],['../classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e',1,'mlx::core::Reduce::vjp()'],['../classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce',1,'mlx::core::Round::vjp()'],['../classmlx_1_1core_1_1_scan.html#aaf13f72620b
4b5d6a20e1228930e848e',1,'mlx::core::Scan::vjp()'],['../classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a',1,'mlx::core::Scatter::vjp()'],['../classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf',1,'mlx::core::Sigmoid::vjp()'],['../classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce',1,'mlx::core::Sign::vjp()'],['../classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0',1,'mlx::core::Sin::vjp()'],['../classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0',1,'mlx::core::Sinh::vjp()'],['../classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f',1,'mlx::core::Slice::vjp()'],['../classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77',1,'mlx::core::SliceUpdate::vjp()'],['../classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b',1,'mlx::core::Softmax::vjp()'],['../classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358',1,'mlx::core::Sort::vjp()'],['../classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674',1,'mlx::core::Split::vjp()'],['../classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263',1,'mlx::core::Square::vjp()'],['../classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3',1,'mlx::core::Sqrt::vjp()'],['../classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b',1,'mlx::core::Subtract::vjp()'],['../classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7',1,'mlx::core::Tan::vjp()'],['../classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95',1,'mlx::core::Tanh::vjp()'],['../classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80',1,'mlx::core::Transpose::vjp()'],['../namespacemlx_1_1core.html#a1b33e2c2e3471420490cf0be2de6de18',1,'mlx::core::vjp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; 
&amp;cotangents)'],['../namespacemlx_1_1core.html#a2065a11249c3f4356ffd69b7a8c487ff',1,'mlx::core::vjp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;cotangent)']]],
-  ['vlen_17',['VLEN',['../structpocketfft_1_1detail_1_1_v_l_e_n.html',1,'pocketfft::detail']]],
-  ['vmap_18',['vmap',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a',1,'mlx::core::distributed::AllReduce::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031',1,'mlx::core::distributed::AllGather::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93',1,'mlx::core::distributed::Send::vmap()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d',1,'mlx::core::fast::Custom::vmap()'],['../classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103',1,'mlx::core::Primitive::vmap()'],['../classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f',1,'mlx::core::Abs::vmap()'],['../classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646',1,'mlx::core::Add::vmap()'],['../classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81',1,'mlx::core::AddMM::vmap()'],['../classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83',1,'mlx::core::ArcCos::vmap()'],['../classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461',1,'mlx::core::ArcCosh::vmap()'],['../classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82',1,'mlx::core::ArcSin::vmap()'],['../classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d',1,'mlx::core::ArcSinh::vmap()'],['../classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556',1,'mlx::core::ArcTan::vmap()'],['../classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634',1,'mlx::core::ArcTan2::vmap()'],['../classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040',1,'mlx::core::ArcTanh::vmap()'],['../classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a',1,'mlx::core::ArgPartition::vmap()'],['../classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba',1,'mlx::core::ArgReduce::vmap()'],['../classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e
',1,'mlx::core::ArgSort::vmap()'],['../classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc',1,'mlx::core::AsType::vmap()'],['../classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965',1,'mlx::core::BitwiseBinary::vmap()'],['../classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f',1,'mlx::core::Broadcast::vmap()'],['../classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4',1,'mlx::core::Ceil::vmap()'],['../classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d',1,'mlx::core::Compiled::vmap()'],['../classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1',1,'mlx::core::Concatenate::vmap()'],['../classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60',1,'mlx::core::Conjugate::vmap()'],['../classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61',1,'mlx::core::Copy::vmap()'],['../classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6',1,'mlx::core::Cos::vmap()'],['../classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406',1,'mlx::core::Cosh::vmap()'],['../classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b',1,'mlx::core::CustomTransforms::vmap()'],['../classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242',1,'mlx::core::Divide::vmap()'],['../classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942',1,'mlx::core::DivMod::vmap()'],['../classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f',1,'mlx::core::Select::vmap()'],['../classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d',1,'mlx::core::Remainder::vmap()'],['../classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca',1,'mlx::core::Equal::vmap()'],['../classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa',1,'mlx::core::Erf::vmap()'],['../classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9',1,'mlx::core::ErfInv::vmap()'],['../classmlx_1_1core_1_1_exp.html#a0fcd579fe148b
4c3dbc72e514b81bb37',1,'mlx::core::Exp::vmap()'],['../classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296',1,'mlx::core::Expm1::vmap()'],['../classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1',1,'mlx::core::FFT::vmap()'],['../classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10',1,'mlx::core::Floor::vmap()'],['../classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95',1,'mlx::core::Full::vmap()'],['../classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275',1,'mlx::core::Gather::vmap()'],['../classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0',1,'mlx::core::Greater::vmap()'],['../classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d',1,'mlx::core::GreaterEqual::vmap()'],['../classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c',1,'mlx::core::Hadamard::vmap()'],['../classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3',1,'mlx::core::Imag::vmap()'],['../classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e',1,'mlx::core::Less::vmap()'],['../classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480',1,'mlx::core::LessEqual::vmap()'],['../classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49',1,'mlx::core::Log::vmap()'],['../classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71',1,'mlx::core::Log1p::vmap()'],['../classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d',1,'mlx::core::LogicalNot::vmap()'],['../classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5',1,'mlx::core::LogicalAnd::vmap()'],['../classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3',1,'mlx::core::LogicalOr::vmap()'],['../classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78',1,'mlx::core::LogAddExp::vmap()'],['../classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2',1,'mlx::core::Matmul::vmap()'],['../classmlx_1_1core_1_1_maximum.html#ab664918e
0d71cfec1318a9879e78c5d3',1,'mlx::core::Maximum::vmap()'],['../classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980',1,'mlx::core::Minimum::vmap()'],['../classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf',1,'mlx::core::Multiply::vmap()'],['../classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0',1,'mlx::core::Negative::vmap()'],['../classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5',1,'mlx::core::NotEqual::vmap()'],['../classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2',1,'mlx::core::NumberOfElements::vmap()'],['../classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf',1,'mlx::core::Pad::vmap()'],['../classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c',1,'mlx::core::Partition::vmap()'],['../classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f',1,'mlx::core::Power::vmap()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763',1,'mlx::core::QuantizedMatmul::vmap()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f',1,'mlx::core::GatherQMM::vmap()'],['../classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415',1,'mlx::core::RandomBits::vmap()'],['../classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6',1,'mlx::core::Real::vmap()'],['../classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d',1,'mlx::core::Reshape::vmap()'],['../classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38',1,'mlx::core::Reduce::vmap()'],['../classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd',1,'mlx::core::Round::vmap()'],['../classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804',1,'mlx::core::Scan::vmap()'],['../classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322',1,'mlx::core::Scatter::vmap()'],['../classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85',1,'mlx::core::Sigmoid::vmap()'],['
../classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295',1,'mlx::core::Sign::vmap()'],['../classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba',1,'mlx::core::Sin::vmap()'],['../classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788',1,'mlx::core::Sinh::vmap()'],['../classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2',1,'mlx::core::Slice::vmap()'],['../classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3',1,'mlx::core::SliceUpdate::vmap()'],['../classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19',1,'mlx::core::Softmax::vmap()'],['../classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c',1,'mlx::core::Sort::vmap()'],['../classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6',1,'mlx::core::Split::vmap()'],['../classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5',1,'mlx::core::Square::vmap()'],['../classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e',1,'mlx::core::Sqrt::vmap()'],['../classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0',1,'mlx::core::StopGradient::vmap()'],['../classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098',1,'mlx::core::Subtract::vmap()'],['../classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7',1,'mlx::core::Tan::vmap()'],['../classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f',1,'mlx::core::Tanh::vmap()'],['../classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926',1,'mlx::core::Uniform::vmap()'],['../classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121',1,'mlx::core::View::vmap()'],['../classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe',1,'mlx::core::Transpose::vmap()'],['../classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8',1,'mlx::core::SVD::vmap()'],['../classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2',1,'mlx::core::Inverse::vmap()'],['../classmlx_1_1core_1_1_chol
esky.html#ab5c3f6199ec3b399c91243a05d116aa5',1,'mlx::core::Cholesky::vmap()'],['../classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f',1,'mlx::core::Eigh::vmap()'],['../namespacemlx_1_1core.html#ac3caec2fa65375ed4c3bf1206177b84c',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;)&gt; &amp;fun, int in_axis=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a8481a3bb4c12c2b7dc6ba576c2be3d0d',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;, const array &amp;)&gt; &amp;fun, int in_axis_a=0, int in_axis_b=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a95a7757e8d18fced38acfc6a3e8d686a',1,'mlx::core::vmap(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;in_axes={}, const std::vector&lt; int &gt; &amp;out_axes={})']]],
-  ['vmap_5freplace_19',['vmap_replace',['../namespacemlx_1_1core_1_1detail.html#a31a5582530faea230eb8acafc0f7e154',1,'mlx::core::detail']]],
-  ['vmap_5ftrace_20',['vmap_trace',['../namespacemlx_1_1core_1_1detail.html#a5ba794afe1a557e0505887cfb481c515',1,'mlx::core::detail']]],
-  ['void_5ft_21',['void_t',['../namespacemetal.html#a192322c772aa8b168d59edc55fb806f1',1,'metal']]],
-  ['vtype_22',['VTYPE',['../structpocketfft_1_1detail_1_1_v_t_y_p_e.html',1,'pocketfft::detail']]],
-  ['vtype_5ft_23',['vtype_t',['../namespacepocketfft_1_1detail.html#a3edfb93aeed2f8258183d463ea291d62',1,'pocketfft::detail']]]
+  ['v_1',['v',['../structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d',1,'mlx::steel::BlockLoader::ReadVector']]],
+  ['v_5fstrides_2',['V_strides',['../structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c',1,'mlx::steel::AttnParams']]],
+  ['val_3',['Val',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1da',1,'mlx::core::Dtype']]],
+  ['val_4',['val',['../structpocketfft_1_1detail_1_1_v_l_e_n.html#ab1fdc340dedde723e636746c828a4534',1,'pocketfft::detail::VLEN::val'],['../structmlx__atomic.html#a6f6651b8dd8149917c50cd99b13c6747',1,'mlx_atomic::val'],['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html#a8dbf729fcd8c4a16e41b546c7405543d',1,'mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;::val'],['../structmlx_1_1core_1_1_dtype.html#a7a99656f121c8922ab82e72c8e9bd7f1',1,'mlx::core::Dtype::val()']]],
+  ['val_5ffrags_5',['val_frags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62',1,'mlx::steel::MMATile']]],
+  ['val_5ft_6',['val_t',['../struct_kernel_merge_sort.html#a4e3f09896275956fc4c23e1f157dca3b',1,'KernelMergeSort']]],
+  ['valid_7',['valid',['../classmlx_1_1core_1_1_event.html#aa77afd9669e2ef9d5e9ae1c2c6fd24fa',1,'mlx::core::Event']]],
+  ['value_8',['value',['../structmlx_1_1steel_1_1integral__constant.html#a4efa69cb3fd42ac0dcad46578600d637',1,'mlx::steel::integral_constant::value'],['../classmlx_1_1core_1_1_event.html#ab71c7baee3d1d02ad6a2001bbf90b970',1,'mlx::core::Event::value()']]],
+  ['value_5fand_5fgrad_9',['value_and_grad',['../namespacemlx_1_1core.html#abf49b337a00997231c0f7fd389efa8f3',1,'mlx::core::value_and_grad(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;argnums)'],['../namespacemlx_1_1core.html#a7b987f404b8699de00f9e0099ab6b1b0',1,'mlx::core::value_and_grad(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, int argnum=0)'],['../namespacemlx_1_1core.html#a5a64dc878b29403d27e50bd7a288cc04',1,'mlx::core::value_and_grad(const std::function&lt; array(const array &amp;)&gt; &amp;fun)'],['../namespacemlx_1_1core.html#a7620f1ae298127cb6181db9162f012a7',1,'mlx::core::value_and_grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;argnums)'],['../namespacemlx_1_1core.html#a2f69ffc30d66b1fca8f24b65be161a51',1,'mlx::core::value_and_grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, int argnum=0)']]],
+  ['value_5ftype_10',['value_type',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#ab86a2740ed9ce3199135372ff1d88c76',1,'pocketfft::detail::threading::aligned_allocator::value_type'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae24fe304397e961687d0d4c7012b8ae4',1,'mlx::core::array::ArrayIterator::value_type'],['../structmlx_1_1steel_1_1integral__constant.html#a0569cc1334e0bc4f474304b33d365759',1,'mlx::steel::integral_constant::value_type']]],
+  ['valueandgradfn_11',['ValueAndGradFn',['../namespacemlx_1_1core.html#ab79d66ddf1ec38b2f2c01234892a2230',1,'mlx::core']]],
+  ['var_12',['var',['../group__ops.html#ga7e133df686439588a8cd1fb10ce0c6e9',1,'mlx::core::var(const array &amp;a, bool keepdims, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga7d7b38d118fa2613214078ef0f7d5a42',1,'mlx::core::var(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga78ddeb966cbe7a5b0aa17e1de43025f2',1,'mlx::core::var(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga4fbf3e3f98f2e4956faf87af320aa9d0',1,'mlx::core::var(const array &amp;a, int axis, bool keepdims=false, int ddof=0, StreamOrDevice s={})']]],
+  ['vec_5fsize_13',['vec_size',['../structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092',1,'mlx::steel::BlockLoader::vec_size'],['../structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5',1,'mlx::steel::BlockLoaderT::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2',1,'mlx::steel::Conv2DWeightBlockLoader::vec_size'],['../structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925',1,'mlx::steel::ChannelHelper::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75',1,'mlx::steel::ChannelHelper&lt; 1 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af',1,'mlx::steel::ChannelHelper&lt; 2 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f',1,'mlx::steel::ChannelHelper&lt; 3 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca',1,'mlx::steel::ChannelHelper&lt; 4 
&gt;::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::vec_size']]],
+  ['vector_14',['Vector',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337a57dea6f5039281b7fee517fc43bf3110',1,'mlx::core']]],
+  ['view_15',['View',['../classmlx_1_1core_1_1_view.html',1,'mlx::core::View'],['../classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e',1,'mlx::core::View::View()']]],
+  ['view_16',['view',['../group__ops.html#ga3602aa91b7b124a0b41ec1b2137a1b02',1,'mlx::core']]],
+  ['vjp_17',['vjp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225',1,'mlx::core::distributed::AllReduce::vjp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb',1,'mlx::core::distributed::AllGather::vjp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91',1,'mlx::core::fast::Custom::vjp()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb',1,'mlx::core::fast::RMSNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6',1,'mlx::core::fast::LayerNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533',1,'mlx::core::fast::RoPE::vjp()'],['../classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42',1,'mlx::core::Primitive::vjp()'],['../classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592',1,'mlx::core::Abs::vjp()'],['../classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607',1,'mlx::core::Add::vjp()'],['../classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6',1,'mlx::core::AddMM::vjp()'],['../classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92',1,'mlx::core::ArcCos::vjp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26',1,'mlx::core::ArcCosh::vjp()'],['../classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1',1,'mlx::core::ArcSin::vjp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e',1,'mlx::core::ArcSinh::vjp()'],['../classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2',1,'mlx::core::ArcTan::vjp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2',1,'mlx::core::ArcTan2::vjp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72',1,'mlx::core::ArcTanh::vjp()'],['../classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0',1,'m
lx::core::ArgPartition::vjp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a',1,'mlx::core::ArgReduce::vjp()'],['../classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18',1,'mlx::core::AsType::vjp()'],['../classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062',1,'mlx::core::AsStrided::vjp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61',1,'mlx::core::BitwiseBinary::vjp()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120',1,'mlx::core::BlockMaskedMM::vjp()'],['../classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda',1,'mlx::core::GatherMM::vjp()'],['../classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18',1,'mlx::core::Broadcast::vjp()'],['../classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb',1,'mlx::core::Ceil::vjp()'],['../classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133',1,'mlx::core::Compiled::vjp()'],['../classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0',1,'mlx::core::Concatenate::vjp()'],['../classmlx_1_1core_1_1_contiguous.html#abf488f02057fd5852f38b2e8a600ad2a',1,'mlx::core::Contiguous::vjp()'],['../classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690',1,'mlx::core::Convolution::vjp()'],['../classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd',1,'mlx::core::Copy::vjp()'],['../classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00',1,'mlx::core::Cos::vjp()'],['../classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4',1,'mlx::core::Cosh::vjp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209',1,'mlx::core::CustomTransforms::vjp()'],['../classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0',1,'mlx::core::Depends::vjp()'],['../classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6',1,'mlx::core::Divide::vjp()'],['../classmlx_1_1c
ore_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1',1,'mlx::core::DivMod::vjp()'],['../classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3',1,'mlx::core::Select::vjp()'],['../classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6',1,'mlx::core::Remainder::vjp()'],['../classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736',1,'mlx::core::Equal::vjp()'],['../classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909',1,'mlx::core::Erf::vjp()'],['../classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189',1,'mlx::core::ErfInv::vjp()'],['../classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8',1,'mlx::core::Exp::vjp()'],['../classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43',1,'mlx::core::Expm1::vjp()'],['../classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090',1,'mlx::core::FFT::vjp()'],['../classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e',1,'mlx::core::Floor::vjp()'],['../classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969',1,'mlx::core::Full::vjp()'],['../classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426',1,'mlx::core::Gather::vjp()'],['../classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679',1,'mlx::core::Greater::vjp()'],['../classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee',1,'mlx::core::GreaterEqual::vjp()'],['../classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656',1,'mlx::core::Hadamard::vjp()'],['../classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b',1,'mlx::core::Imag::vjp()'],['../classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50',1,'mlx::core::Less::vjp()'],['../classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028',1,'mlx::core::LessEqual::vjp()'],['../classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280',1,'mlx::core::Log::vjp()'],['../classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a
880',1,'mlx::core::Log1p::vjp()'],['../classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50',1,'mlx::core::LogicalNot::vjp()'],['../classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54',1,'mlx::core::LogicalAnd::vjp()'],['../classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847',1,'mlx::core::LogicalOr::vjp()'],['../classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4',1,'mlx::core::LogAddExp::vjp()'],['../classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0',1,'mlx::core::Matmul::vjp()'],['../classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3',1,'mlx::core::Maximum::vjp()'],['../classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204',1,'mlx::core::Minimum::vjp()'],['../classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1',1,'mlx::core::Multiply::vjp()'],['../classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a',1,'mlx::core::Negative::vjp()'],['../classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b',1,'mlx::core::NotEqual::vjp()'],['../classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038',1,'mlx::core::Pad::vjp()'],['../classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9',1,'mlx::core::Partition::vjp()'],['../classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082',1,'mlx::core::Power::vjp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26',1,'mlx::core::QuantizedMatmul::vjp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab',1,'mlx::core::GatherQMM::vjp()'],['../classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe',1,'mlx::core::Real::vjp()'],['../classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365',1,'mlx::core::Reshape::vjp()'],['../classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e',1,'mlx::core::Reduce::vjp()'],['../classmlx_1_1core_1_1_round.html#a
f8f085e08b7fa8840c52a20b12ca35ce',1,'mlx::core::Round::vjp()'],['../classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e',1,'mlx::core::Scan::vjp()'],['../classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a',1,'mlx::core::Scatter::vjp()'],['../classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf',1,'mlx::core::Sigmoid::vjp()'],['../classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce',1,'mlx::core::Sign::vjp()'],['../classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0',1,'mlx::core::Sin::vjp()'],['../classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0',1,'mlx::core::Sinh::vjp()'],['../classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f',1,'mlx::core::Slice::vjp()'],['../classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77',1,'mlx::core::SliceUpdate::vjp()'],['../classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b',1,'mlx::core::Softmax::vjp()'],['../classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358',1,'mlx::core::Sort::vjp()'],['../classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674',1,'mlx::core::Split::vjp()'],['../classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263',1,'mlx::core::Square::vjp()'],['../classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3',1,'mlx::core::Sqrt::vjp()'],['../classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b',1,'mlx::core::Subtract::vjp()'],['../classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7',1,'mlx::core::Tan::vjp()'],['../classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95',1,'mlx::core::Tanh::vjp()'],['../classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80',1,'mlx::core::Transpose::vjp()'],['../namespacemlx_1_1core.html#a1b33e2c2e3471420490cf0be2de6de18',1,'mlx::core::vjp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const 
std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotangents)'],['../namespacemlx_1_1core.html#a2065a11249c3f4356ffd69b7a8c487ff',1,'mlx::core::vjp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;cotangent)']]],
+  ['vlen_18',['VLEN',['../structpocketfft_1_1detail_1_1_v_l_e_n.html',1,'pocketfft::detail']]],
+  ['vmap_19',['vmap',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a',1,'mlx::core::distributed::AllReduce::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031',1,'mlx::core::distributed::AllGather::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93',1,'mlx::core::distributed::Send::vmap()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d',1,'mlx::core::fast::Custom::vmap()'],['../classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103',1,'mlx::core::Primitive::vmap()'],['../classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f',1,'mlx::core::Abs::vmap()'],['../classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646',1,'mlx::core::Add::vmap()'],['../classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81',1,'mlx::core::AddMM::vmap()'],['../classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83',1,'mlx::core::ArcCos::vmap()'],['../classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461',1,'mlx::core::ArcCosh::vmap()'],['../classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82',1,'mlx::core::ArcSin::vmap()'],['../classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d',1,'mlx::core::ArcSinh::vmap()'],['../classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556',1,'mlx::core::ArcTan::vmap()'],['../classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634',1,'mlx::core::ArcTan2::vmap()'],['../classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040',1,'mlx::core::ArcTanh::vmap()'],['../classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a',1,'mlx::core::ArgPartition::vmap()'],['../classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba',1,'mlx::core::ArgReduce::vmap()'],['../classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e
',1,'mlx::core::ArgSort::vmap()'],['../classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc',1,'mlx::core::AsType::vmap()'],['../classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965',1,'mlx::core::BitwiseBinary::vmap()'],['../classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f',1,'mlx::core::Broadcast::vmap()'],['../classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4',1,'mlx::core::Ceil::vmap()'],['../classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d',1,'mlx::core::Compiled::vmap()'],['../classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1',1,'mlx::core::Concatenate::vmap()'],['../classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60',1,'mlx::core::Conjugate::vmap()'],['../classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec',1,'mlx::core::Contiguous::vmap()'],['../classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61',1,'mlx::core::Copy::vmap()'],['../classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6',1,'mlx::core::Cos::vmap()'],['../classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406',1,'mlx::core::Cosh::vmap()'],['../classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b',1,'mlx::core::CustomTransforms::vmap()'],['../classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242',1,'mlx::core::Divide::vmap()'],['../classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942',1,'mlx::core::DivMod::vmap()'],['../classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f',1,'mlx::core::Select::vmap()'],['../classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d',1,'mlx::core::Remainder::vmap()'],['../classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca',1,'mlx::core::Equal::vmap()'],['../classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa',1,'mlx::core::Erf::vmap()'],['../classmlx_1_1core_1_1_erf_inv.html#ad5
d7634e8568af8cc4a54a558a48d0e9',1,'mlx::core::ErfInv::vmap()'],['../classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37',1,'mlx::core::Exp::vmap()'],['../classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296',1,'mlx::core::Expm1::vmap()'],['../classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1',1,'mlx::core::FFT::vmap()'],['../classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10',1,'mlx::core::Floor::vmap()'],['../classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95',1,'mlx::core::Full::vmap()'],['../classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275',1,'mlx::core::Gather::vmap()'],['../classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0',1,'mlx::core::Greater::vmap()'],['../classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d',1,'mlx::core::GreaterEqual::vmap()'],['../classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c',1,'mlx::core::Hadamard::vmap()'],['../classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3',1,'mlx::core::Imag::vmap()'],['../classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e',1,'mlx::core::Less::vmap()'],['../classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480',1,'mlx::core::LessEqual::vmap()'],['../classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49',1,'mlx::core::Log::vmap()'],['../classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71',1,'mlx::core::Log1p::vmap()'],['../classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d',1,'mlx::core::LogicalNot::vmap()'],['../classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5',1,'mlx::core::LogicalAnd::vmap()'],['../classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3',1,'mlx::core::LogicalOr::vmap()'],['../classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78',1,'mlx::core::LogAddExp::vmap()'],['../classmlx_1_1core_1_1_matmul.html#a3
a1c6e70bac300240760fe41a58340c2',1,'mlx::core::Matmul::vmap()'],['../classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3',1,'mlx::core::Maximum::vmap()'],['../classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980',1,'mlx::core::Minimum::vmap()'],['../classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf',1,'mlx::core::Multiply::vmap()'],['../classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0',1,'mlx::core::Negative::vmap()'],['../classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5',1,'mlx::core::NotEqual::vmap()'],['../classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2',1,'mlx::core::NumberOfElements::vmap()'],['../classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf',1,'mlx::core::Pad::vmap()'],['../classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c',1,'mlx::core::Partition::vmap()'],['../classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f',1,'mlx::core::Power::vmap()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763',1,'mlx::core::QuantizedMatmul::vmap()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f',1,'mlx::core::GatherQMM::vmap()'],['../classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415',1,'mlx::core::RandomBits::vmap()'],['../classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6',1,'mlx::core::Real::vmap()'],['../classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d',1,'mlx::core::Reshape::vmap()'],['../classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38',1,'mlx::core::Reduce::vmap()'],['../classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd',1,'mlx::core::Round::vmap()'],['../classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804',1,'mlx::core::Scan::vmap()'],['../classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322',1,'mlx::core::Scatter::vmap(
)'],['../classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85',1,'mlx::core::Sigmoid::vmap()'],['../classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295',1,'mlx::core::Sign::vmap()'],['../classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba',1,'mlx::core::Sin::vmap()'],['../classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788',1,'mlx::core::Sinh::vmap()'],['../classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2',1,'mlx::core::Slice::vmap()'],['../classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3',1,'mlx::core::SliceUpdate::vmap()'],['../classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19',1,'mlx::core::Softmax::vmap()'],['../classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c',1,'mlx::core::Sort::vmap()'],['../classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6',1,'mlx::core::Split::vmap()'],['../classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5',1,'mlx::core::Square::vmap()'],['../classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e',1,'mlx::core::Sqrt::vmap()'],['../classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0',1,'mlx::core::StopGradient::vmap()'],['../classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098',1,'mlx::core::Subtract::vmap()'],['../classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7',1,'mlx::core::Tan::vmap()'],['../classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f',1,'mlx::core::Tanh::vmap()'],['../classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926',1,'mlx::core::Uniform::vmap()'],['../classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121',1,'mlx::core::View::vmap()'],['../classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe',1,'mlx::core::Transpose::vmap()'],['../classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8',1,'mlx::core::SVD::vmap()'],['../classmlx_1_1core_1_
1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2',1,'mlx::core::Inverse::vmap()'],['../classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5',1,'mlx::core::Cholesky::vmap()'],['../classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f',1,'mlx::core::Eigh::vmap()'],['../namespacemlx_1_1core.html#ac3caec2fa65375ed4c3bf1206177b84c',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;)&gt; &amp;fun, int in_axis=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a8481a3bb4c12c2b7dc6ba576c2be3d0d',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;, const array &amp;)&gt; &amp;fun, int in_axis_a=0, int in_axis_b=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a95a7757e8d18fced38acfc6a3e8d686a',1,'mlx::core::vmap(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;in_axes={}, const std::vector&lt; int &gt; &amp;out_axes={})']]],
+  ['vmap_5freplace_20',['vmap_replace',['../namespacemlx_1_1core_1_1detail.html#a31a5582530faea230eb8acafc0f7e154',1,'mlx::core::detail']]],
+  ['vmap_5ftrace_21',['vmap_trace',['../namespacemlx_1_1core_1_1detail.html#a5ba794afe1a557e0505887cfb481c515',1,'mlx::core::detail']]],
+  ['void_5ft_22',['void_t',['../namespacemetal.html#a192322c772aa8b168d59edc55fb806f1',1,'metal']]],
+  ['vtype_23',['VTYPE',['../structpocketfft_1_1detail_1_1_v_t_y_p_e.html',1,'pocketfft::detail']]],
+  ['vtype_5ft_24',['vtype_t',['../namespacepocketfft_1_1detail.html#a3edfb93aeed2f8258183d463ea291d62',1,'pocketfft::detail']]]
 ];
diff --git a/docs/build/html/search/all_17.js b/docs/build/html/search/all_17.js
index 40e854311..7b4dff754 100644
--- a/docs/build/html/search/all_17.js
+++ b/docs/build/html/search/all_17.js
@@ -1,19 +1,20 @@
 var searchData=
 [
   ['wait_0',['wait',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#af503189cc9247047fbdfc3ebf1daacc1',1,'pocketfft::detail::threading::latch::wait()'],['../classmlx_1_1core_1_1array.html#a648592006f1c92287734ba2428eaa45e',1,'mlx::core::array::wait()'],['../classmlx_1_1core_1_1_event.html#a634afd918e6ed847f354531ba9f48252',1,'mlx::core::Event::wait()']]],
-  ['wait_5ffor_5fone_1',['wait_for_one',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a01c574bb388f10d67aaaaa541894d807',1,'mlx::core::scheduler::Scheduler::wait_for_one()'],['../namespacemlx_1_1core_1_1scheduler.html#a8cc4d5fd1f5ce722b377ead1863a2291',1,'mlx::core::scheduler::wait_for_one()']]],
-  ['weight_5fbase_2',['weight_base',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html#a1d88677c4617f4bdae157e40a64a407b',1,'mlx::steel::Conv2DGeneralBaseInfo']]],
-  ['weight_5fh_3',['weight_h',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3be4815d4090cb27ebe2f9bad1a39e95',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::weight_h'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a366c3cee4ed1165545287c8d5ce49445',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::weight_h'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a397412909eb955babc935a35d97c3fd4',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::weight_h'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a5997fd8ef249e4cd3df7dad7b251d8d5',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::weight_h']]],
-  ['weight_5fhw_4',['weight_hw',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae39d43f741c9c87cce9c6d3144dc8b94',1,'mlx::steel::Conv2DWeightBlockLoader::weight_hw'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7dd320bc5b0a9a2e425d6b292ddac037',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::weight_hw'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a5752e0309a4dc873cb31ce724c11ada6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::weight_hw']]],
-  ['weight_5fsize_5',['weight_size',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html#aff119a4325b97fdbd745d8fcaed9f041',1,'mlx::steel::Conv2DGeneralBaseInfo']]],
-  ['weight_5fw_6',['weight_w',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#add1186c7accb62bfa8a4a7e87fc4cc84',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::weight_w'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a4744bd79fb05e81eaa53d2eabe017446',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::weight_w'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a0261d0349a0a95ca1a02a959b73e9352',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::weight_w'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6efa6268a37f18f4d225674bf1780cf6',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::weight_w']]],
-  ['where_7',['where',['../group__ops.html#ga8a2056f8c9bb30914c40bcf509386491',1,'mlx::core']]],
-  ['write_8',['write',['../struct_read_writer.html#ac2ea71e41740ddc863890e3e8e6f09d0',1,'ReadWriter::write()'],['../classmlx_1_1core_1_1io_1_1_writer.html#ad9515b7f007338674de1e124cf77e125',1,'mlx::core::io::Writer::write()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#abca32838c9886f734d93430c34c07d7f',1,'mlx::core::io::FileWriter::write()'],['../struct_read_writer.html#a7a3d1396b0f83aa7506207bd6e7336bf',1,'ReadWriter::write() const'],['../struct_read_writer.html#ae1f0d3555b74998cc2d2288bce72a1f4',1,'ReadWriter::write() const']]],
-  ['write_5fpadded_9',['write_padded',['../struct_read_writer.html#a95367307acace2aa88226cf8956d2d88',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#abaf2a6ad4c88bd9f65fe1db1f73a8d87',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#a420453a56e77d6b3891ed4b5f178af9c',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const']]],
-  ['write_5fsafe_10',['write_safe',['../scan_8h.html#ae86aef08e5ebc8790031eb51eefa754c',1,'scan.h']]],
-  ['write_5fstrided_11',['write_strided',['../struct_read_writer.html#a77a4d7eac217305e22a3c25b3756ef67',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a12e7f43cd9de2d9990054184c0a32839',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a959ccaa08f2999c50cea063b01e492e4',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a5592b24dad5ad030a1e4769b0a278f35',1,'ReadWriter::write_strided(int stride, int overall_n)']]],
-  ['write_5funsafe_12',['write_unsafe',['../scan_8h.html#a8010e7bdf7a72cbd35ce7cd7ecb08e32',1,'scan.h']]],
-  ['writer_13',['Writer',['../classmlx_1_1core_1_1io_1_1_writer.html',1,'mlx::core::io']]],
-  ['ws_14',['wS',['../struct_m_l_x_conv_params.html#aba2074189644b1b59567d018409277a9',1,'MLXConvParams']]],
-  ['wt_5fstrides_15',['wt_strides',['../struct_m_l_x_conv_params.html#a887fee0da1494d038526fb0f59faff45',1,'MLXConvParams']]]
+  ['wait_5ffor_5ffence_1',['wait_for_fence',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088',1,'mlx::core::metal::CommandEncoder']]],
+  ['wait_5ffor_5fone_2',['wait_for_one',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a01c574bb388f10d67aaaaa541894d807',1,'mlx::core::scheduler::Scheduler::wait_for_one()'],['../namespacemlx_1_1core_1_1scheduler.html#a8cc4d5fd1f5ce722b377ead1863a2291',1,'mlx::core::scheduler::wait_for_one()']]],
+  ['weight_5fbase_3',['weight_base',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html#a1d88677c4617f4bdae157e40a64a407b',1,'mlx::steel::Conv2DGeneralBaseInfo']]],
+  ['weight_5fh_4',['weight_h',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3be4815d4090cb27ebe2f9bad1a39e95',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::weight_h'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a366c3cee4ed1165545287c8d5ce49445',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::weight_h'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a397412909eb955babc935a35d97c3fd4',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::weight_h'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a5997fd8ef249e4cd3df7dad7b251d8d5',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::weight_h']]],
+  ['weight_5fhw_5',['weight_hw',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae39d43f741c9c87cce9c6d3144dc8b94',1,'mlx::steel::Conv2DWeightBlockLoader::weight_hw'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7dd320bc5b0a9a2e425d6b292ddac037',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::weight_hw'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a5752e0309a4dc873cb31ce724c11ada6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::weight_hw']]],
+  ['weight_5fsize_6',['weight_size',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html#aff119a4325b97fdbd745d8fcaed9f041',1,'mlx::steel::Conv2DGeneralBaseInfo']]],
+  ['weight_5fw_7',['weight_w',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#add1186c7accb62bfa8a4a7e87fc4cc84',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::weight_w'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a4744bd79fb05e81eaa53d2eabe017446',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::weight_w'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a0261d0349a0a95ca1a02a959b73e9352',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::weight_w'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6efa6268a37f18f4d225674bf1780cf6',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::weight_w']]],
+  ['where_8',['where',['../group__ops.html#ga8a2056f8c9bb30914c40bcf509386491',1,'mlx::core']]],
+  ['write_9',['write',['../struct_read_writer.html#ac2ea71e41740ddc863890e3e8e6f09d0',1,'ReadWriter::write()'],['../classmlx_1_1core_1_1io_1_1_writer.html#ad9515b7f007338674de1e124cf77e125',1,'mlx::core::io::Writer::write()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#abca32838c9886f734d93430c34c07d7f',1,'mlx::core::io::FileWriter::write()'],['../struct_read_writer.html#a7a3d1396b0f83aa7506207bd6e7336bf',1,'ReadWriter::write() const'],['../struct_read_writer.html#ae1f0d3555b74998cc2d2288bce72a1f4',1,'ReadWriter::write() const']]],
+  ['write_5fpadded_10',['write_padded',['../struct_read_writer.html#a95367307acace2aa88226cf8956d2d88',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#abaf2a6ad4c88bd9f65fe1db1f73a8d87',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#a420453a56e77d6b3891ed4b5f178af9c',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const']]],
+  ['write_5fsafe_11',['write_safe',['../scan_8h.html#ae86aef08e5ebc8790031eb51eefa754c',1,'scan.h']]],
+  ['write_5fstrided_12',['write_strided',['../struct_read_writer.html#a77a4d7eac217305e22a3c25b3756ef67',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a12e7f43cd9de2d9990054184c0a32839',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a959ccaa08f2999c50cea063b01e492e4',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a5592b24dad5ad030a1e4769b0a278f35',1,'ReadWriter::write_strided(int stride, int overall_n)']]],
+  ['write_5funsafe_13',['write_unsafe',['../scan_8h.html#a8010e7bdf7a72cbd35ce7cd7ecb08e32',1,'scan.h']]],
+  ['writer_14',['Writer',['../classmlx_1_1core_1_1io_1_1_writer.html',1,'mlx::core::io']]],
+  ['ws_15',['wS',['../struct_m_l_x_conv_params.html#aba2074189644b1b59567d018409277a9',1,'MLXConvParams']]],
+  ['wt_5fstrides_16',['wt_strides',['../struct_m_l_x_conv_params.html#a887fee0da1494d038526fb0f59faff45',1,'MLXConvParams']]]
 ];
diff --git a/docs/build/html/search/all_2.js b/docs/build/html/search/all_2.js
index 29d404044..01d6f2885 100644
--- a/docs/build/html/search/all_2.js
+++ b/docs/build/html/search/all_2.js
@@ -1,110 +1,110 @@
 var searchData=
 [
-  ['b_0',['b',['../unionbool4__or__uint.html#a47d77eac47598fe420f8f04a615f76ca',1,'bool4_or_uint::b'],['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715a92eb5ffee6ae2fec3ad71c777531578f',1,'mlx::core::Dtype::b']]],
-  ['b_5fstr_5fk_1',['B_str_k',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211',1,'mlx::steel::BlockMMA']]],
-  ['b_5fstr_5fn_2',['B_str_n',['../structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17',1,'mlx::steel::BlockMMA']]],
-  ['backward_3',['BACKWARD',['../namespacepocketfft_1_1detail.html#a9d1eaa7469c018c39e745733eab9a9c3',1,'pocketfft::detail']]],
-  ['base_4',['Base',['../classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421',1,'mlx::core::Log']]],
-  ['base_5fwh_5',['base_wh',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aca37adba6f148579eb1cd0a7800a5cfe',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_wh'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6c46564bf1a96a02791dd432cc9c883e',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_wh']]],
-  ['base_5fww_6',['base_ww',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32d020c6715d06f7de360877fcb7b6e4',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_ww'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a230f0e581f9b8227b9ee68760b3b1503',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_ww']]],
-  ['basemmafrag_7',['BaseMMAFrag',['../structmlx_1_1steel_1_1_base_m_m_a_frag.html',1,'mlx::steel']]],
-  ['basemmafrag_3c_20t_2c_208_2c_208_20_3e_8',['BaseMMAFrag&lt; T, 8, 8 &gt;',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html',1,'mlx::steel']]],
-  ['batch_5fndim_9',['batch_ndim',['../struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3',1,'MLXFastAttentionParams::batch_ndim'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f',1,'mlx::steel::GEMMParams::batch_ndim']]],
-  ['batch_5fsize_10',['batch_size',['../struct_read_writer.html#a689f4890c1d2ce33fc6da7550beec735',1,'ReadWriter']]],
-  ['batch_5fstride_5fa_11',['batch_stride_a',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a76f55783a8e2ee203cf8507eee4b000c',1,'mlx::steel::GEMMParams']]],
-  ['batch_5fstride_5fb_12',['batch_stride_b',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a99b959b12d12da657648fa24d43e49e8',1,'mlx::steel::GEMMParams']]],
-  ['batch_5fstride_5fc_13',['batch_stride_c',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a77c065db228e9654a0a75a6ffe47c15a',1,'mlx::steel::GEMMAddMMParams']]],
-  ['batch_5fstride_5fd_14',['batch_stride_d',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad98006dd509a455864e6aa7c52743a41',1,'mlx::steel::GEMMParams']]],
-  ['batch_5fstride_5fk_15',['batch_stride_k',['../struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b',1,'MLXFastAttentionParams']]],
-  ['batch_5fstride_5fo_16',['batch_stride_o',['../struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7',1,'MLXFastAttentionParams']]],
-  ['batch_5fstride_5fq_17',['batch_stride_q',['../struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1',1,'MLXFastAttentionParams']]],
-  ['batch_5fstride_5fv_18',['batch_stride_v',['../struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21',1,'MLXFastAttentionParams']]],
-  ['bcols_19',['BCOLS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a29fbeeacdf5b6feeb74815ced255fa5a',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac3b40db720055350bba59d614ea1dd79',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a86519729ef0561686bb86e474c95b93d',1,'mlx::steel::Conv2DWeightBlockLoader::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9229d22e0a02d96825eb5a57c8cb95ac',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b6cf53a10514310d01f4d6459053a57',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3d6272d000f8ea79d9b3b5228bdca20f',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a1843921cd67926002bb0dcccf3048eb6',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BCOLS']]],
-  ['bcols_5fpacked_20',['BCOLS_PACKED',['../struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb',1,'QuantizedBlockLoader']]],
-  ['begin_21',['begin',['../classmlx_1_1core_1_1array.html#a76b258b169d7d73419ebbf85340fb914',1,'mlx::core::array']]],
-  ['bernoulli_22',['bernoulli',['../namespacemlx_1_1core_1_1random.html#acb3f278fea2c4f06dea947d3bac2e9b7',1,'mlx::core::random::bernoulli(const array &amp;p, const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#aaa49f6c2af5496822fa09435e54275cb',1,'mlx::core::random::bernoulli(const array &amp;p, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#aa9e360f9cb7bd23221352ed9e31d83c2',1,'mlx::core::random::bernoulli(T p, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a37fcba120a1d246176db5256d3201cd4',1,'mlx::core::random::bernoulli(T p, const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#ad7eb4467e2f9d5f74a5607b29a935b6e',1,'mlx::core::random::bernoulli(const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
-  ['beta_23',['beta',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#ac0ce4d8a6014f8adb29fd0a0bb23139f',1,'mlx::steel::GEMMAddMMParams::beta'],['../structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6',1,'mlx::steel::TransformAxpby::beta']]],
-  ['bf16_2eh_24',['bf16.h',['../backend_2metal_2kernels_2bf16_8h.html',1,'(Global Namespace)'],['../types_2bf16_8h.html',1,'(Global Namespace)']]],
-  ['bf16_5fmath_2eh_25',['bf16_math.h',['../bf16__math_8h.html',1,'']]],
-  ['bfloat16_26',['bfloat16',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa444fe01f3a7a54d1809aef0912846a47',1,'mlx::core::Dtype::bfloat16'],['../namespacemlx_1_1core.html#a514cf8b4e6f0a6af3a867e752f4338f7',1,'mlx::core::bfloat16']]],
-  ['bfloat16_5ft_27',['bfloat16_t',['../backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82',1,'bf16.h']]],
-  ['bfloat16_5fto_5fuint16_28',['bfloat16_to_uint16',['../bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7',1,'bf16_math.h']]],
-  ['bfloat_5fbinop_29',['bfloat_binop',['../backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h'],['../types_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h']]],
-  ['bfloat_5fbinop_5fbase_30',['bfloat_binop_base',['../backend_2metal_2kernels_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h'],['../types_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h']]],
-  ['bfloat_5fbinop_5fhelper_31',['bfloat_binop_helper',['../backend_2metal_2kernels_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h'],['../types_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h']]],
-  ['bfloat_5fbitop_32',['bfloat_bitop',['../types_2bf16_8h.html#aac9ba86d4bf05bcda1936494f9b9b4d3',1,'bf16.h']]],
-  ['bfloat_5fbits_5fto_5ffloat_33',['bfloat_bits_to_float',['../backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1',1,'bf16.h']]],
-  ['bfloat_5fcompop_34',['bfloat_compop',['../backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h'],['../types_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h']]],
-  ['bfloat_5finplace_5fbitop_35',['bfloat_inplace_bitop',['../types_2bf16_8h.html#af13b46bc58e6e6f675ae47aabec37711',1,'bf16.h']]],
-  ['bfloat_5finplace_5fop_36',['bfloat_inplace_op',['../backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c',1,'bfloat_inplace_op:&#160;bf16.h'],['../types_2bf16_8h.html#aee905053f51f76e0c1af94199714d514',1,'bfloat_inplace_op:&#160;bf16.h']]],
-  ['bfloat_5finplace_5fop_5faddr_5fspace_5fhelper_37',['bfloat_inplace_op_addr_space_helper',['../backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1457da931c28fa4e2500daa4e6441e8b',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h']]],
-  ['bfloat_5finplace_5fop_5fhelper_38',['bfloat_inplace_op_helper',['../backend_2metal_2kernels_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d',1,'bfloat_inplace_op_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afe5988aa8147be2bafda6a5b7792fe15',1,'bfloat_inplace_op_helper:&#160;bf16.h']]],
-  ['bi_39',['bi',['../struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906',1,'QuantizedBlockLoader::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026',1,'mlx::steel::Conv2DWeightBlockLoader::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bi'],['../structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af',1,'mlx::steel::BlockLoader::bi']]],
-  ['biases_40',['biases',['../struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd',1,'QuantizedBlockLoader']]],
-  ['binary_41',['binary',['../namespacemlx_1_1core_1_1metal.html#a269d591ec02e2f7c0f7a718fbfa37f73',1,'mlx::core::metal']]],
-  ['binary_2eh_42',['binary.h',['../common_2binary_8h.html',1,'(Global Namespace)'],['../metal_2binary_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2binary_8h.html',1,'(Global Namespace)']]],
-  ['binary_5fg_43',['binary_g',['../metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b',1,'binary_g(device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8',1,'binary_g(device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fg_5fnd1_44',['binary_g_nd1',['../metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, device U *d, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary_two.h']]],
-  ['binary_5fg_5fnd2_45',['binary_g_nd2',['../metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fg_5fnd3_46',['binary_g_nd3',['../metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fop_5fgpu_47',['binary_op_gpu',['../namespacemlx_1_1core.html#ad884f4a36308b5b4f8a5d990d2e086df',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a094876ea5a2a2445ab64efc8222da202',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
-  ['binary_5fop_5fgpu_5finplace_48',['binary_op_gpu_inplace',['../namespacemlx_1_1core.html#a8616c0b7b0fc118a75400bc86404c367',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a7e6af6624e322e7ad60a3873a66e18a3',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
-  ['binary_5fops_49',['binary_ops',['../namespacemlx_1_1core_1_1metal.html#a8db7f9cc781d4bfb08423a401665f322',1,'mlx::core::metal']]],
-  ['binary_5fops_2eh_50',['binary_ops.h',['../binary__ops_8h.html',1,'']]],
-  ['binary_5fss_51',['binary_ss',['../metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5',1,'binary_ss(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#af8a791ac7ca88d32cd8f4e9ac0f9ab4f',1,'binary_ss(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fsv_52',['binary_sv',['../metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141',1,'binary_sv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c',1,'binary_sv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fsv2_53',['binary_sv2',['../metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589',1,'binary_sv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891',1,'binary_sv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['binary_5ftwo_54',['binary_two',['../namespacemlx_1_1core_1_1metal.html#aed047eec38b030ec5f29b9da54abf8cb',1,'mlx::core::metal']]],
-  ['binary_5ftwo_2eh_55',['binary_two.h',['../common_2binary__two_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2binary__two_8h.html',1,'(Global Namespace)']]],
-  ['binary_5fvs_56',['binary_vs',['../metal_2kernels_2binary_8h.html#a649851d133358dd5832a73b1061b3313',1,'binary_vs(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12dbda74fa460812177ccb9aeee6e1ca',1,'binary_vs(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fvs2_57',['binary_vs2',['../metal_2kernels_2binary_8h.html#a48bd82eb10f9c623ce7d28daec4fa512',1,'binary_vs2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a273d2f31691f2c64623c2a97eab344be',1,'binary_vs2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fvv_58',['binary_vv',['../metal_2kernels_2binary_8h.html#add6a9aeee3cb0ba909574f27fa9ecd5b',1,'binary_vv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab4324f594c007a6895540b77ad5d89d9',1,'binary_vv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fvv2_59',['binary_vv2',['../metal_2kernels_2binary_8h.html#a19dbbf8fea68b64bdd25dc8d36865171',1,'binary_vv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12e80730e43dfaa4c79ce8d5f99edc50',1,'binary_vv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['bits_60',['bits',['../namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, int width, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a548ffed4ba3107b89885ff850ffce5f4',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
-  ['bits_5f_61',['bits_',['../struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8',1,'_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#aca48963f820065c3d8ecab24265ab3fc',1,'mlx::core::_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a5203fe52424fd32bce6eb7917dd9288b',1,'mlx::core::_MLX_Float16::bits_']]],
-  ['bits_5fto_5fbfloat_62',['bits_to_bfloat',['../struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca',1,'_MLX_BFloat16']]],
-  ['bits_5fto_5fbfloat_5fstruct_63',['bits_to_bfloat_struct',['../struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html',1,'_MLX_BFloat16']]],
-  ['bitwise_5fand_64',['bitwise_and',['../group__ops.html#ga752fd2707dabb05d0308ba3d55346ada',1,'mlx::core']]],
-  ['bitwise_5for_65',['bitwise_or',['../group__ops.html#ga8af4f22c08c11c4ffab7e3d45e0f3cd6',1,'mlx::core']]],
-  ['bitwise_5fxor_66',['bitwise_xor',['../group__ops.html#ga3188638fba3a60e264baf69956a1e08b',1,'mlx::core']]],
-  ['bitwiseand_67',['BitwiseAnd',['../struct_bitwise_and.html',1,'BitwiseAnd'],['../structmlx_1_1core_1_1detail_1_1_bitwise_and.html',1,'mlx::core::detail::BitwiseAnd']]],
-  ['bitwisebinary_68',['BitwiseBinary',['../classmlx_1_1core_1_1_bitwise_binary.html',1,'mlx::core::BitwiseBinary'],['../classmlx_1_1core_1_1_bitwise_binary.html#a0d8b3a94951621ffcdebc6fda748a172',1,'mlx::core::BitwiseBinary::BitwiseBinary()']]],
-  ['bitwiseor_69',['BitwiseOr',['../struct_bitwise_or.html',1,'BitwiseOr'],['../structmlx_1_1core_1_1detail_1_1_bitwise_or.html',1,'mlx::core::detail::BitwiseOr']]],
-  ['bitwisexor_70',['BitwiseXor',['../struct_bitwise_xor.html',1,'BitwiseXor'],['../structmlx_1_1core_1_1detail_1_1_bitwise_xor.html',1,'mlx::core::detail::BitwiseXor']]],
-  ['bj_71',['bj',['../struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00',1,'QuantizedBlockLoader::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37',1,'mlx::steel::Conv2DWeightBlockLoader::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bj'],['../structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4',1,'mlx::steel::BlockLoader::bj']]],
-  ['block_5fmasked_5fgemm_72',['block_masked_gemm',['../steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device out_mask_t *out_mask, const device op_mask_t *lhs_mask, const device op_mask_t *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h'],['../steel__gemm__masked_8h.html#a477932e2ae9d49366f7ede6db63f9cac',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device bool *out_mask, const device bool *lhs_mask, const device bool *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h']]],
-  ['block_5fmasked_5fmm_73',['block_masked_mm',['../group__ops.html#ga6b76c8ea46b19e6866af155fa5910be6',1,'mlx::core']]],
-  ['block_5fmerge_5fsort_5ft_74',['block_merge_sort_t',['../struct_kernel_merge_sort.html#adae7850e057fc30d5328c7b3dcc998fa',1,'KernelMergeSort::block_merge_sort_t'],['../struct_kernel_multi_block_merge_sort.html#af27e9af4b58640c0aa620bc4efc68dff',1,'KernelMultiBlockMergeSort::block_merge_sort_t']]],
-  ['block_5fsort_75',['block_sort',['../struct_kernel_merge_sort.html#a56b644ec66f7fb5c01b280f124304be9',1,'KernelMergeSort::block_sort()'],['../struct_kernel_multi_block_merge_sort.html#a322ed2eac315a561e0fd90af2fd577eb',1,'KernelMultiBlockMergeSort::block_sort()'],['../sort_8h.html#a93f14092416169c4449141043ac45ffd',1,'block_sort(const device T *inp, device U *out, const constant int &amp;size_sorted_axis, const constant int &amp;in_stride_sorted_axis, const constant int &amp;out_stride_sorted_axis, const constant int &amp;in_stride_segment_axis, const constant int &amp;out_stride_segment_axis, uint3 tid, uint3 lid):&#160;sort.h']]],
-  ['block_5fsort_5fnc_76',['block_sort_nc',['../sort_8h.html#a4ee3de195a6f9c33aa91ac52461808ad',1,'sort.h']]],
-  ['blockloader_77',['BlockLoader',['../structmlx_1_1steel_1_1_block_loader.html',1,'mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;'],['../structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335',1,'mlx::steel::BlockLoader::BlockLoader()']]],
-  ['blockm_78',['blockM',['../struct_g_e_m_v_kernel.html#a7281520100658811076400060663903c',1,'GEMVKernel::blockM'],['../struct_g_e_m_v_t_kernel.html#a2ae8ce535d59cccf453381b4485a77f0',1,'GEMVTKernel::blockM']]],
-  ['blockmaskedmm_79',['BlockMaskedMM',['../classmlx_1_1core_1_1_block_masked_m_m.html',1,'mlx::core::BlockMaskedMM'],['../classmlx_1_1core_1_1_block_masked_m_m.html#ad26509deb5306d0c5eb72477e9a57477',1,'mlx::core::BlockMaskedMM::BlockMaskedMM()']]],
-  ['blockmergesort_80',['BlockMergeSort',['../struct_block_merge_sort.html',1,'']]],
-  ['blockmma_81',['BlockMMA',['../structmlx_1_1steel_1_1_block_m_m_a.html',1,'mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;'],['../structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8',1,'mlx::steel::BlockMMA::BlockMMA()']]],
-  ['blockn_82',['blockN',['../struct_g_e_m_v_kernel.html#a2fef17f9c9aa0bdf530ad3554fb0988b',1,'GEMVKernel::blockN'],['../struct_g_e_m_v_t_kernel.html#a60be87666006ba0bf88bc8e6902da42a',1,'GEMVTKernel::blockN']]],
-  ['blockswizzle_83',['BlockSwizzle',['../structmlx_1_1steel_1_1_block_swizzle.html',1,'mlx::steel']]],
-  ['bluestein_5ffft_84',['bluestein_fft',['../backend_2metal_2kernels_2fft_8h.html#a0abc609e9756475800e996775a96a87e',1,'fft.h']]],
-  ['bool4_5for_5fuint_85',['bool4_or_uint',['../unionbool4__or__uint.html',1,'']]],
-  ['bool_5f_86',['bool_',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa467afb5838aa377d55cce81f84c5512b',1,'mlx::core::Dtype::bool_'],['../namespacemlx_1_1core.html#a113d2bac7e4aa6a4cb4a5c3242527b82',1,'mlx::core::bool_']]],
-  ['bool_5fconstant_87',['bool_constant',['../namespacemlx_1_1steel.html#adbb34bcf0d2dca6b9fb803d591d00da9',1,'mlx::steel']]],
-  ['broadcast_88',['Broadcast',['../classmlx_1_1core_1_1_broadcast.html',1,'mlx::core::Broadcast'],['../classmlx_1_1core_1_1_broadcast.html#accbab8433c93e281608a268d11afaefb',1,'mlx::core::Broadcast::Broadcast()']]],
-  ['broadcast_5farrays_89',['broadcast_arrays',['../group__ops.html#gab783890428b596f715dc7dd2057eae99',1,'mlx::core']]],
-  ['broadcast_5fshapes_90',['broadcast_shapes',['../namespacemlx_1_1core.html#a075e07def338cd9d815182d0e6a656c0',1,'mlx::core']]],
-  ['broadcast_5fto_91',['broadcast_to',['../group__ops.html#gad256e86cc1a6e6b3832e392baa90318d',1,'mlx::core']]],
-  ['brows_92',['BROWS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ac070c6bd5be85b1ae805e18890db4fd4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a10591ea957605a9c662f93d59ff3410d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae9b86b05b23153ea1abaeead456c491c',1,'mlx::steel::Conv2DWeightBlockLoader::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a343984fb74ec579a4404278dbbc7e7b5',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acc8140aae84694f62e6324dbb6a614a4',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aba1e1c8012e4e50f0e9bcfb9486c1781',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a015a0c56de74a0c4d51953a7e94fbba8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BROWS']]],
-  ['bs_5foffset_93',['Bs_offset',['../structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca',1,'mlx::steel::BlockMMA']]],
-  ['bs_5fqmm_5fn_94',['bs_qmm_n',['../quantized_8h.html#a1a66b061c46383952a0f067c3848971f',1,'quantized.h']]],
-  ['bs_5fqmm_5ft_95',['bs_qmm_t',['../quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84',1,'quantized.h']]],
-  ['bs_5fqmv_96',['bs_qmv',['../quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed',1,'quantized.h']]],
-  ['bs_5fqmv_5ffast_97',['bs_qmv_fast',['../quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7',1,'quantized.h']]],
-  ['bs_5fqvm_98',['bs_qvm',['../quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494',1,'quantized.h']]],
-  ['btile_99',['Btile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26',1,'mlx::steel::BlockMMA']]],
-  ['buf_100',['buf',['../struct_read_writer.html#a23bac3c96dd0265ddbee1f256be45ff5',1,'ReadWriter::buf'],['../backend_2metal_2allocator_8h.html#a15aa5cc1baf29be08d55cca88509e697',1,'buf:&#160;allocator.h']]],
-  ['buffer_101',['Buffer',['../classmlx_1_1core_1_1allocator_1_1_buffer.html',1,'mlx::core::allocator::Buffer'],['../classmlx_1_1core_1_1allocator_1_1_buffer.html#ac4fc2cc6aa1368cfb74aff329d9a1300',1,'mlx::core::allocator::Buffer::Buffer()']]],
-  ['buffer_102',['buffer',['../structmlx_1_1core_1_1array_1_1_data.html#a9a51e2d12ba505027cc0fca86bdd39ad',1,'mlx::core::array::Data::buffer'],['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb',1,'mlx::core::metal::DeviceStream::buffer'],['../classmlx_1_1core_1_1array.html#ab3daf04c27c4593d9d73c397b8484a08',1,'mlx::core::array::buffer()'],['../classmlx_1_1core_1_1array.html#a634466ce661485394f2fdc3bd6796bcd',1,'mlx::core::array::buffer() const']]],
-  ['buffer_5fops_103',['buffer_ops',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782',1,'mlx::core::metal::DeviceStream']]],
-  ['buffer_5fsize_104',['buffer_size',['../classmlx_1_1core_1_1array.html#a914577c63755b2e862d2da68bbf8e3dd',1,'mlx::core::array']]],
-  ['buffers_105',['buffers',['../struct_indices.html#ad705070a740579c07d109ae4f3d86e76',1,'Indices']]],
-  ['build_5flib_5fname_106',['build_lib_name',['../namespacemlx_1_1core.html#a3ef23f334cb9f68a2c50524bc67c913b',1,'mlx::core']]]
+  ['b_0',['B',['../structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f',1,'mlx::steel::AttnParams']]],
+  ['b_1',['b',['../unionbool4__or__uint.html#a47d77eac47598fe420f8f04a615f76ca',1,'bool4_or_uint::b'],['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715a92eb5ffee6ae2fec3ad71c777531578f',1,'mlx::core::Dtype::b']]],
+  ['b_5fstr_5fk_2',['B_str_k',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211',1,'mlx::steel::BlockMMA']]],
+  ['b_5fstr_5fn_3',['B_str_n',['../structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17',1,'mlx::steel::BlockMMA']]],
+  ['backward_4',['BACKWARD',['../namespacepocketfft_1_1detail.html#a9d1eaa7469c018c39e745733eab9a9c3',1,'pocketfft::detail']]],
+  ['base_5',['Base',['../classmlx_1_1core_1_1_log.html#a044a23e8b1422984628e1cd5ab506421',1,'mlx::core::Log']]],
+  ['base_5fwh_6',['base_wh',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aca37adba6f148579eb1cd0a7800a5cfe',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_wh'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6c46564bf1a96a02791dd432cc9c883e',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_wh']]],
+  ['base_5fww_7',['base_ww',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32d020c6715d06f7de360877fcb7b6e4',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_ww'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a230f0e581f9b8227b9ee68760b3b1503',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_ww']]],
+  ['basemmafrag_8',['BaseMMAFrag',['../structmlx_1_1steel_1_1_base_m_m_a_frag.html',1,'mlx::steel']]],
+  ['basemmafrag_3c_20t_2c_208_2c_208_20_3e_9',['BaseMMAFrag&lt; T, 8, 8 &gt;',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html',1,'mlx::steel']]],
+  ['batch_5fndim_10',['batch_ndim',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f',1,'mlx::steel::GEMMParams']]],
+  ['batch_5fsize_11',['batch_size',['../struct_read_writer.html#a689f4890c1d2ce33fc6da7550beec735',1,'ReadWriter']]],
+  ['batch_5fstride_5fa_12',['batch_stride_a',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a76f55783a8e2ee203cf8507eee4b000c',1,'mlx::steel::GEMMParams']]],
+  ['batch_5fstride_5fb_13',['batch_stride_b',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a99b959b12d12da657648fa24d43e49e8',1,'mlx::steel::GEMMParams']]],
+  ['batch_5fstride_5fc_14',['batch_stride_c',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a77c065db228e9654a0a75a6ffe47c15a',1,'mlx::steel::GEMMAddMMParams']]],
+  ['batch_5fstride_5fd_15',['batch_stride_d',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad98006dd509a455864e6aa7c52743a41',1,'mlx::steel::GEMMParams']]],
+  ['bcols_16',['BCOLS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a29fbeeacdf5b6feeb74815ced255fa5a',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac3b40db720055350bba59d614ea1dd79',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a86519729ef0561686bb86e474c95b93d',1,'mlx::steel::Conv2DWeightBlockLoader::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9229d22e0a02d96825eb5a57c8cb95ac',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b6cf53a10514310d01f4d6459053a57',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3d6272d000f8ea79d9b3b5228bdca20f',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a1843921cd67926002bb0dcccf3048eb6',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BCOLS']]],
+  ['bcols_5fpacked_17',['BCOLS_PACKED',['../struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb',1,'QuantizedBlockLoader']]],
+  ['begin_18',['begin',['../classmlx_1_1core_1_1array.html#a76b258b169d7d73419ebbf85340fb914',1,'mlx::core::array']]],
+  ['bernoulli_19',['bernoulli',['../namespacemlx_1_1core_1_1random.html#acb3f278fea2c4f06dea947d3bac2e9b7',1,'mlx::core::random::bernoulli(const array &amp;p, const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#aaa49f6c2af5496822fa09435e54275cb',1,'mlx::core::random::bernoulli(const array &amp;p, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#aa9e360f9cb7bd23221352ed9e31d83c2',1,'mlx::core::random::bernoulli(T p, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a37fcba120a1d246176db5256d3201cd4',1,'mlx::core::random::bernoulli(T p, const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#ad7eb4467e2f9d5f74a5607b29a935b6e',1,'mlx::core::random::bernoulli(const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
+  ['beta_20',['beta',['../structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6',1,'mlx::steel::TransformAxpby::beta'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#ac0ce4d8a6014f8adb29fd0a0bb23139f',1,'mlx::steel::GEMMAddMMParams::beta']]],
+  ['bf16_2eh_21',['bf16.h',['../backend_2metal_2kernels_2jit_2bf16_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html',1,'(Global Namespace)'],['../types_2bf16_8h.html',1,'(Global Namespace)']]],
+  ['bf16_5fmath_2eh_22',['bf16_math.h',['../bf16__math_8h.html',1,'']]],
+  ['bfloat16_23',['bfloat16',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa444fe01f3a7a54d1809aef0912846a47',1,'mlx::core::Dtype::bfloat16'],['../namespacemlx_1_1core.html#a514cf8b4e6f0a6af3a867e752f4338f7',1,'mlx::core::bfloat16']]],
+  ['bfloat16_5ft_24',['bfloat16_t',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82',1,'bfloat16_t:&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a58e15a77da988b9104fee00cdf8b280e',1,'bfloat16_t:&#160;bf16.h']]],
+  ['bfloat16_5fto_5fuint16_25',['bfloat16_to_uint16',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088',1,'bfloat16_to_uint16(const bfloat16_t x):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088',1,'bfloat16_to_uint16(const bfloat16_t x):&#160;bf16.h']]],
+  ['bfloat_5fbinop_26',['bfloat_binop',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h'],['../types_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h']]],
+  ['bfloat_5fbinop_5fbase_27',['bfloat_binop_base',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h'],['../types_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h']]],
+  ['bfloat_5fbinop_5fhelper_28',['bfloat_binop_helper',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h'],['../types_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h']]],
+  ['bfloat_5fbitop_29',['bfloat_bitop',['../types_2bf16_8h.html#aac9ba86d4bf05bcda1936494f9b9b4d3',1,'bf16.h']]],
+  ['bfloat_5fbits_5fto_5ffloat_30',['bfloat_bits_to_float',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1',1,'bf16.h']]],
+  ['bfloat_5fcompop_31',['bfloat_compop',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h'],['../types_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h']]],
+  ['bfloat_5finplace_5fbitop_32',['bfloat_inplace_bitop',['../types_2bf16_8h.html#af13b46bc58e6e6f675ae47aabec37711',1,'bf16.h']]],
+  ['bfloat_5finplace_5fop_33',['bfloat_inplace_op',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c',1,'bfloat_inplace_op:&#160;bf16.h'],['../types_2bf16_8h.html#aee905053f51f76e0c1af94199714d514',1,'bfloat_inplace_op:&#160;bf16.h']]],
+  ['bfloat_5finplace_5fop_5faddr_5fspace_5fhelper_34',['bfloat_inplace_op_addr_space_helper',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1457da931c28fa4e2500daa4e6441e8b',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h']]],
+  ['bfloat_5finplace_5fop_5fhelper_35',['bfloat_inplace_op_helper',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d',1,'bfloat_inplace_op_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afe5988aa8147be2bafda6a5b7792fe15',1,'bfloat_inplace_op_helper:&#160;bf16.h']]],
+  ['bfs_5fmax_5fwidth_36',['bfs_max_width',['../namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2',1,'mlx::core::env']]],
+  ['bi_37',['bi',['../struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906',1,'QuantizedBlockLoader::bi'],['../structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af',1,'mlx::steel::BlockLoader::bi'],['../structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35',1,'mlx::steel::BlockLoaderT::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026',1,'mlx::steel::Conv2DWeightBlockLoader::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bi']]],
+  ['biases_38',['biases',['../struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd',1,'QuantizedBlockLoader']]],
+  ['binary_39',['binary',['../namespacemlx_1_1core_1_1metal.html#a269d591ec02e2f7c0f7a718fbfa37f73',1,'mlx::core::metal']]],
+  ['binary_2eh_40',['binary.h',['../common_2binary_8h.html',1,'(Global Namespace)'],['../metal_2binary_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2binary_8h.html',1,'(Global Namespace)']]],
+  ['binary_5fg_41',['binary_g',['../metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3',1,'binary_g(device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd',1,'binary_g(device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fg_5fnd1_42',['binary_g_nd1',['../metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, device U *d, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary_two.h']]],
+  ['binary_5fg_5fnd2_43',['binary_g_nd2',['../metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fg_5fnd3_44',['binary_g_nd3',['../metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fop_5fgpu_45',['binary_op_gpu',['../namespacemlx_1_1core.html#ad884f4a36308b5b4f8a5d990d2e086df',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a094876ea5a2a2445ab64efc8222da202',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
+  ['binary_5fop_5fgpu_5finplace_46',['binary_op_gpu_inplace',['../namespacemlx_1_1core.html#a8616c0b7b0fc118a75400bc86404c367',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a7e6af6624e322e7ad60a3873a66e18a3',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
+  ['binary_5fops_47',['binary_ops',['../namespacemlx_1_1core_1_1metal.html#a8db7f9cc781d4bfb08423a401665f322',1,'mlx::core::metal']]],
+  ['binary_5fops_2eh_48',['binary_ops.h',['../binary__ops_8h.html',1,'']]],
+  ['binary_5fss_49',['binary_ss',['../metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5',1,'binary_ss(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#af8a791ac7ca88d32cd8f4e9ac0f9ab4f',1,'binary_ss(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fsv_50',['binary_sv',['../metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141',1,'binary_sv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c',1,'binary_sv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fsv2_51',['binary_sv2',['../metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589',1,'binary_sv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891',1,'binary_sv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['binary_5ftwo_52',['binary_two',['../namespacemlx_1_1core_1_1metal.html#aed047eec38b030ec5f29b9da54abf8cb',1,'mlx::core::metal']]],
+  ['binary_5ftwo_2eh_53',['binary_two.h',['../common_2binary__two_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2binary__two_8h.html',1,'(Global Namespace)']]],
+  ['binary_5fvs_54',['binary_vs',['../metal_2kernels_2binary_8h.html#a649851d133358dd5832a73b1061b3313',1,'binary_vs(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12dbda74fa460812177ccb9aeee6e1ca',1,'binary_vs(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fvs2_55',['binary_vs2',['../metal_2kernels_2binary_8h.html#a48bd82eb10f9c623ce7d28daec4fa512',1,'binary_vs2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a273d2f31691f2c64623c2a97eab344be',1,'binary_vs2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fvv_56',['binary_vv',['../metal_2kernels_2binary_8h.html#add6a9aeee3cb0ba909574f27fa9ecd5b',1,'binary_vv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab4324f594c007a6895540b77ad5d89d9',1,'binary_vv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fvv2_57',['binary_vv2',['../metal_2kernels_2binary_8h.html#a19dbbf8fea68b64bdd25dc8d36865171',1,'binary_vv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12e80730e43dfaa4c79ce8d5f99edc50',1,'binary_vv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['bits_58',['bits',['../namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, int width, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a548ffed4ba3107b89885ff850ffce5f4',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
+  ['bits_5f_59',['bits_',['../struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8',1,'_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#aca48963f820065c3d8ecab24265ab3fc',1,'mlx::core::_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a5203fe52424fd32bce6eb7917dd9288b',1,'mlx::core::_MLX_Float16::bits_']]],
+  ['bits_5fto_5fbfloat_60',['bits_to_bfloat',['../struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca',1,'_MLX_BFloat16']]],
+  ['bits_5fto_5fbfloat_5fstruct_61',['bits_to_bfloat_struct',['../struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html',1,'_MLX_BFloat16']]],
+  ['bitwise_5fand_62',['bitwise_and',['../group__ops.html#ga752fd2707dabb05d0308ba3d55346ada',1,'mlx::core']]],
+  ['bitwise_5for_63',['bitwise_or',['../group__ops.html#ga8af4f22c08c11c4ffab7e3d45e0f3cd6',1,'mlx::core']]],
+  ['bitwise_5fxor_64',['bitwise_xor',['../group__ops.html#ga3188638fba3a60e264baf69956a1e08b',1,'mlx::core']]],
+  ['bitwiseand_65',['BitwiseAnd',['../struct_bitwise_and.html',1,'BitwiseAnd'],['../structmlx_1_1core_1_1detail_1_1_bitwise_and.html',1,'mlx::core::detail::BitwiseAnd']]],
+  ['bitwisebinary_66',['BitwiseBinary',['../classmlx_1_1core_1_1_bitwise_binary.html',1,'mlx::core::BitwiseBinary'],['../classmlx_1_1core_1_1_bitwise_binary.html#a0d8b3a94951621ffcdebc6fda748a172',1,'mlx::core::BitwiseBinary::BitwiseBinary()']]],
+  ['bitwiseor_67',['BitwiseOr',['../struct_bitwise_or.html',1,'BitwiseOr'],['../structmlx_1_1core_1_1detail_1_1_bitwise_or.html',1,'mlx::core::detail::BitwiseOr']]],
+  ['bitwisexor_68',['BitwiseXor',['../struct_bitwise_xor.html',1,'BitwiseXor'],['../structmlx_1_1core_1_1detail_1_1_bitwise_xor.html',1,'mlx::core::detail::BitwiseXor']]],
+  ['bj_69',['bj',['../struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00',1,'QuantizedBlockLoader::bj'],['../structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4',1,'mlx::steel::BlockLoader::bj'],['../structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957',1,'mlx::steel::BlockLoaderT::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37',1,'mlx::steel::Conv2DWeightBlockLoader::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bj']]],
+  ['block_5fmasked_5fgemm_70',['block_masked_gemm',['../steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device out_mask_t *out_mask, const device op_mask_t *lhs_mask, const device op_mask_t *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h'],['../steel__gemm__masked_8h.html#a477932e2ae9d49366f7ede6db63f9cac',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device bool *out_mask, const device bool *lhs_mask, const device bool *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h']]],
+  ['block_5fmasked_5fmm_71',['block_masked_mm',['../group__ops.html#ga6b76c8ea46b19e6866af155fa5910be6',1,'mlx::core']]],
+  ['block_5fmerge_5fsort_5ft_72',['block_merge_sort_t',['../struct_kernel_merge_sort.html#adae7850e057fc30d5328c7b3dcc998fa',1,'KernelMergeSort::block_merge_sort_t'],['../struct_kernel_multi_block_merge_sort.html#af27e9af4b58640c0aa620bc4efc68dff',1,'KernelMultiBlockMergeSort::block_merge_sort_t']]],
+  ['block_5fsort_73',['block_sort',['../struct_kernel_merge_sort.html#a56b644ec66f7fb5c01b280f124304be9',1,'KernelMergeSort::block_sort()'],['../struct_kernel_multi_block_merge_sort.html#a322ed2eac315a561e0fd90af2fd577eb',1,'KernelMultiBlockMergeSort::block_sort()'],['../sort_8h.html#a93f14092416169c4449141043ac45ffd',1,'block_sort(const device T *inp, device U *out, const constant int &amp;size_sorted_axis, const constant int &amp;in_stride_sorted_axis, const constant int &amp;out_stride_sorted_axis, const constant int &amp;in_stride_segment_axis, const constant int &amp;out_stride_segment_axis, uint3 tid, uint3 lid):&#160;sort.h']]],
+  ['block_5fsort_5fnc_74',['block_sort_nc',['../sort_8h.html#a4ee3de195a6f9c33aa91ac52461808ad',1,'sort.h']]],
+  ['blockloader_75',['BlockLoader',['../structmlx_1_1steel_1_1_block_loader.html',1,'mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;'],['../structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335',1,'mlx::steel::BlockLoader::BlockLoader(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)'],['../structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335',1,'mlx::steel::BlockLoader::BlockLoader(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)']]],
+  ['blockloadert_76',['BlockLoaderT',['../structmlx_1_1steel_1_1_block_loader_t.html',1,'mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;'],['../structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2',1,'mlx::steel::BlockLoaderT::BlockLoaderT()']]],
+  ['blockm_77',['blockM',['../struct_g_e_m_v_kernel.html#a7281520100658811076400060663903c',1,'GEMVKernel::blockM'],['../struct_g_e_m_v_t_kernel.html#a2ae8ce535d59cccf453381b4485a77f0',1,'GEMVTKernel::blockM']]],
+  ['blockmaskedmm_78',['BlockMaskedMM',['../classmlx_1_1core_1_1_block_masked_m_m.html',1,'mlx::core::BlockMaskedMM'],['../classmlx_1_1core_1_1_block_masked_m_m.html#ad26509deb5306d0c5eb72477e9a57477',1,'mlx::core::BlockMaskedMM::BlockMaskedMM()']]],
+  ['blockmergesort_79',['BlockMergeSort',['../struct_block_merge_sort.html',1,'']]],
+  ['blockmma_80',['BlockMMA',['../structmlx_1_1steel_1_1_block_m_m_a.html',1,'mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;'],['../structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8',1,'mlx::steel::BlockMMA::BlockMMA(ushort simd_group_id, ushort simd_lane_id)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8',1,'mlx::steel::BlockMMA::BlockMMA(ushort simd_group_id, ushort simd_lane_id)']]],
+  ['blockn_81',['blockN',['../struct_g_e_m_v_kernel.html#a2fef17f9c9aa0bdf530ad3554fb0988b',1,'GEMVKernel::blockN'],['../struct_g_e_m_v_t_kernel.html#a60be87666006ba0bf88bc8e6902da42a',1,'GEMVTKernel::blockN']]],
+  ['blockswizzle_82',['BlockSwizzle',['../structmlx_1_1steel_1_1_block_swizzle.html',1,'mlx::steel']]],
+  ['bluestein_5ffft_83',['bluestein_fft',['../backend_2metal_2kernels_2fft_8h.html#a0abc609e9756475800e996775a96a87e',1,'fft.h']]],
+  ['bool4_5for_5fuint_84',['bool4_or_uint',['../unionbool4__or__uint.html',1,'']]],
+  ['bool_5f_85',['bool_',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa467afb5838aa377d55cce81f84c5512b',1,'mlx::core::Dtype::bool_'],['../namespacemlx_1_1core.html#a113d2bac7e4aa6a4cb4a5c3242527b82',1,'mlx::core::bool_']]],
+  ['bool_5fconstant_86',['bool_constant',['../namespacemlx_1_1steel.html#adbb34bcf0d2dca6b9fb803d591d00da9',1,'mlx::steel']]],
+  ['broadcast_87',['Broadcast',['../classmlx_1_1core_1_1_broadcast.html',1,'mlx::core::Broadcast'],['../classmlx_1_1core_1_1_broadcast.html#accbab8433c93e281608a268d11afaefb',1,'mlx::core::Broadcast::Broadcast()']]],
+  ['broadcast_5farrays_88',['broadcast_arrays',['../group__ops.html#gab783890428b596f715dc7dd2057eae99',1,'mlx::core']]],
+  ['broadcast_5fshapes_89',['broadcast_shapes',['../namespacemlx_1_1core.html#a075e07def338cd9d815182d0e6a656c0',1,'mlx::core']]],
+  ['broadcast_5fto_90',['broadcast_to',['../group__ops.html#gad256e86cc1a6e6b3832e392baa90318d',1,'mlx::core']]],
+  ['brows_91',['BROWS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ac070c6bd5be85b1ae805e18890db4fd4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a10591ea957605a9c662f93d59ff3410d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae9b86b05b23153ea1abaeead456c491c',1,'mlx::steel::Conv2DWeightBlockLoader::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a343984fb74ec579a4404278dbbc7e7b5',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acc8140aae84694f62e6324dbb6a614a4',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aba1e1c8012e4e50f0e9bcfb9486c1781',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a015a0c56de74a0c4d51953a7e94fbba8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BROWS']]],
+  ['bs_5foffset_92',['Bs_offset',['../structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca',1,'mlx::steel::BlockMMA']]],
+  ['bs_5fqmm_5fn_93',['bs_qmm_n',['../quantized_8h.html#a1a66b061c46383952a0f067c3848971f',1,'quantized.h']]],
+  ['bs_5fqmm_5ft_94',['bs_qmm_t',['../quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84',1,'quantized.h']]],
+  ['bs_5fqmv_95',['bs_qmv',['../quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed',1,'quantized.h']]],
+  ['bs_5fqmv_5ffast_96',['bs_qmv_fast',['../quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7',1,'quantized.h']]],
+  ['bs_5fqvm_97',['bs_qvm',['../quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494',1,'quantized.h']]],
+  ['btile_98',['Btile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0',1,'mlx::steel::BlockMMA']]],
+  ['buf_99',['buf',['../struct_read_writer.html#a23bac3c96dd0265ddbee1f256be45ff5',1,'ReadWriter::buf'],['../backend_2metal_2allocator_8h.html#a15aa5cc1baf29be08d55cca88509e697',1,'buf:&#160;allocator.h']]],
+  ['buffer_100',['Buffer',['../classmlx_1_1core_1_1allocator_1_1_buffer.html',1,'mlx::core::allocator::Buffer'],['../classmlx_1_1core_1_1allocator_1_1_buffer.html#ac4fc2cc6aa1368cfb74aff329d9a1300',1,'mlx::core::allocator::Buffer::Buffer()']]],
+  ['buffer_101',['buffer',['../structmlx_1_1core_1_1array_1_1_data.html#a9a51e2d12ba505027cc0fca86bdd39ad',1,'mlx::core::array::Data::buffer'],['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb',1,'mlx::core::metal::DeviceStream::buffer'],['../classmlx_1_1core_1_1array.html#ab3daf04c27c4593d9d73c397b8484a08',1,'mlx::core::array::buffer()'],['../classmlx_1_1core_1_1array.html#a634466ce661485394f2fdc3bd6796bcd',1,'mlx::core::array::buffer() const']]],
+  ['buffer_5fops_102',['buffer_ops',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782',1,'mlx::core::metal::DeviceStream']]],
+  ['buffer_5fsize_103',['buffer_size',['../classmlx_1_1core_1_1array.html#a914577c63755b2e862d2da68bbf8e3dd',1,'mlx::core::array']]],
+  ['buffers_104',['buffers',['../struct_indices.html#ad705070a740579c07d109ae4f3d86e76',1,'Indices']]],
+  ['build_5flib_5fname_105',['build_lib_name',['../namespacemlx_1_1core.html#a3ef23f334cb9f68a2c50524bc67c913b',1,'mlx::core']]],
+  ['bytes_5fper_5fpack_106',['bytes_per_pack',['../struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db',1,'QuantizedBlockLoader']]]
 ];
diff --git a/docs/build/html/search/all_3.js b/docs/build/html/search/all_3.js
index adac75852..726ce7d55 100644
--- a/docs/build/html/search/all_3.js
+++ b/docs/build/html/search/all_3.js
@@ -1,12 +1,12 @@
 var searchData=
 [
   ['c_0',['C',['../struct_m_l_x_conv_params.html#a0953063962ac3b5a027243289e72fbb2',1,'MLXConvParams']]],
-  ['c_1',['c',['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715a4a8a08f09d37b73795649038408b5f33',1,'mlx::core::Dtype']]],
+  ['c_1',['c',['../structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e',1,'mlx::steel::Shape2D::c'],['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715a4a8a08f09d37b73795649038408b5f33',1,'mlx::core::Dtype::c']]],
   ['c2c_2',['c2c',['../namespacepocketfft_1_1detail.html#ab585ac594ae1253d4659e7b9e1623c8a',1,'pocketfft::detail']]],
   ['c2r_3',['c2r',['../namespacepocketfft_1_1detail.html#ab26cbfed16f487b987f50bf63bfc1ab9',1,'pocketfft::detail::c2r(const shape_t &amp;shape_out, const stride_t &amp;stride_in, const stride_t &amp;stride_out, size_t axis, bool forward, const std::complex&lt; T &gt; *data_in, T *data_out, T fct, size_t nthreads=1)'],['../namespacepocketfft_1_1detail.html#a788506fff59f8e13056247076cac51c1',1,'pocketfft::detail::c2r(const shape_t &amp;shape_out, const stride_t &amp;stride_in, const stride_t &amp;stride_out, const shape_t &amp;axes, bool forward, const std::complex&lt; T &gt; *data_in, T *data_out, T fct, size_t nthreads=1)']]],
-  ['can_5fconvert_5ffrom_5fbfloat_4',['can_convert_from_bfloat',['../backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a',1,'bf16.h']]],
+  ['can_5fconvert_5ffrom_5fbfloat_4',['can_convert_from_bfloat',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a',1,'bf16.h']]],
   ['can_5fconvert_5ffrom_5fcomplex64_5',['can_convert_from_complex64',['../backend_2metal_2kernels_2complex_8h.html#ab149db78f6f19b8da6297dac4c36d893',1,'complex.h']]],
-  ['can_5fconvert_5fto_5fbfloat_6',['can_convert_to_bfloat',['../backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e',1,'bf16.h']]],
+  ['can_5fconvert_5fto_5fbfloat_6',['can_convert_to_bfloat',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e',1,'bf16.h']]],
   ['can_5fconvert_5fto_5fcomplex128_7',['can_convert_to_complex128',['../namespacemlx_1_1core.html#a2822d2a4d346c826d3cfebbcf89c3057',1,'mlx::core']]],
   ['can_5fconvert_5fto_5fcomplex64_8',['can_convert_to_complex64',['../backend_2metal_2kernels_2complex_8h.html#a4f90ad54f4fae363e8d3cc41d539557b',1,'can_convert_to_complex64:&#160;complex.h'],['../namespacemlx_1_1core.html#a0b3c76fd03f4df39ec8f9aefdced0861',1,'mlx::core::can_convert_to_complex64']]],
   ['capitalize_5fbool_9',['capitalize_bool',['../structmlx_1_1core_1_1_print_formatter.html#adf49a949db36f0ba076842a6d675d79a',1,'mlx::core::PrintFormatter']]],
@@ -27,7 +27,7 @@ var searchData=
   ['cholesky_24',['Cholesky',['../classmlx_1_1core_1_1_cholesky.html',1,'mlx::core::Cholesky'],['../classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab',1,'mlx::core::Cholesky::Cholesky()']]],
   ['cholesky_25',['cholesky',['../namespacemlx_1_1core_1_1linalg.html#a46c8a4f806f0a97a4323e91189aa512b',1,'mlx::core::linalg']]],
   ['cholesky_5finv_26',['cholesky_inv',['../namespacemlx_1_1core_1_1linalg.html#aef0fe4894c5cf98792d59859c6d20511',1,'mlx::core::linalg']]],
-  ['clear_27',['clear',['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7',1,'mlx::steel::MMATile']]],
+  ['clear_27',['clear',['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7',1,'mlx::steel::MMATile::clear()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7',1,'mlx::steel::MMATile::clear()']]],
   ['clear_5fcache_28',['clear_cache',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a447c1eb38c00d2e8e521675297f4a9b1',1,'mlx::core::metal::MetalAllocator::clear_cache()'],['../namespacemlx_1_1core_1_1metal.html#a22b3384ebd17f2fca198f81b9f1b6dc3',1,'mlx::core::metal::clear_cache()']]],
   ['clip_29',['clip',['../group__ops.html#ga157cd7c23f9b306fee2e1eb2b9bf1dd8',1,'mlx::core']]],
   ['cmplx_30',['cmplx',['../structpocketfft_1_1detail_1_1cmplx.html',1,'pocketfft::detail::cmplx&lt; T &gt;'],['../structpocketfft_1_1detail_1_1cmplx.html#a5b1ce506f1023f5254025ac81b831a2c',1,'pocketfft::detail::cmplx::cmplx()'],['../structpocketfft_1_1detail_1_1cmplx.html#a05491b4f1f22ca0bc49012f6a1c1710a',1,'pocketfft::detail::cmplx::cmplx(T r_, T i_)']]],
@@ -35,120 +35,123 @@ var searchData=
   ['cmplx_3c_20thigh_20_3e_32',['cmplx&lt; Thigh &gt;',['../structpocketfft_1_1detail_1_1cmplx.html',1,'pocketfft::detail']]],
   ['cndarr_33',['cndarr',['../classpocketfft_1_1detail_1_1cndarr.html',1,'pocketfft::detail::cndarr&lt; T &gt;'],['../classpocketfft_1_1detail_1_1cndarr.html#abf73f1b4ddcfb27d7f85cfa441607129',1,'pocketfft::detail::cndarr::cndarr()']]],
   ['col_5fcontiguous_34',['col_contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#ae24709026598d635e6b5c24a15f8a802',1,'mlx::core::array::Flags']]],
-  ['col_5freduce_5f2pass_35',['col_reduce_2pass',['../reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d',1,'reduce_col.h']]],
-  ['col_5freduce_5flongcolumn_36',['col_reduce_longcolumn',['../reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb',1,'reduce_col.h']]],
-  ['col_5freduce_5flooped_37',['col_reduce_looped',['../reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385',1,'reduce_col.h']]],
-  ['col_5freduce_5fsmall_38',['col_reduce_small',['../reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5',1,'reduce_col.h']]],
-  ['collapse_5fcontiguous_5fdims_39',['collapse_contiguous_dims',['../namespacemlx_1_1core.html#a38fe6ec5220d13d96c7dad7556d2b613',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; std::vector&lt; int64_t &gt; &gt; &amp;strides, int64_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#af2895f9b0083efd8221275eb8cadccbe',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; std::vector&lt; size_t &gt; &gt; &amp;strides, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a90e2b6edc0fe82230cb93f5ea39febb4',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; array &gt; &amp;xs, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#ac813412cce77fc1340dcfefc6e099276',1,'mlx::core::collapse_contiguous_dims(Arrays &amp;&amp;... xs)'],['../namespacemlx_1_1core.html#aab3cc7f3808934ae0727b920eba231bd',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; int64_t &gt; &amp;strides, int64_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a1e0cbcf109d32794ffc8efc7302ba9b0',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a4ee50bfb240512d0c0ce151dfe2c74ef',1,'mlx::core::collapse_contiguous_dims(const array &amp;a, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())']]],
-  ['commandencoder_40',['CommandEncoder',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html',1,'mlx::core::metal::CommandEncoder'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a2334774486f447213ee997e55c2e52a3',1,'mlx::core::metal::CommandEncoder::CommandEncoder(MTL::CommandBuffer *cbuf)'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ac68ca977b5bde5434284ce7979647f14',1,'mlx::core::metal::CommandEncoder::CommandEncoder(const CommandEncoder &amp;)=delete']]],
-  ['commit_5fcommand_5fbuffer_41',['commit_command_buffer',['../classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c',1,'mlx::core::metal::Device']]],
-  ['commonallocator_42',['CommonAllocator',['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html',1,'mlx::core::allocator']]],
-  ['communication_5fstream_43',['communication_stream',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#ac3612edf0e0e18c1e4ba0ce7c6e35cd6',1,'mlx::core::distributed::detail']]],
-  ['compile_44',['compile',['../namespacemlx_1_1core.html#a3ac798e65e59fe10b7fb5c522efce782',1,'mlx::core::compile()'],['../namespacemlx_1_1core_1_1detail.html#ac3b7b09892ff7290d5f3ef26cb444329',1,'mlx::core::detail::compile()']]],
-  ['compile_2eh_45',['compile.h',['../compile_8h.html',1,'']]],
-  ['compile_5favailable_5ffor_5fdevice_46',['compile_available_for_device',['../namespacemlx_1_1core_1_1detail.html#aeeff2ba6ec3d9d4ed090de6d2681dbc2',1,'mlx::core::detail']]],
-  ['compile_5fclear_5fcache_47',['compile_clear_cache',['../namespacemlx_1_1core_1_1detail.html#a3fb927c209b946aefebb195993fbe4cf',1,'mlx::core::detail']]],
-  ['compile_5ferase_48',['compile_erase',['../namespacemlx_1_1core_1_1detail.html#a69eb76a14f845ca000f1ccb2edda0175',1,'mlx::core::detail']]],
-  ['compile_5fimpl_2eh_49',['compile_impl.h',['../compile__impl_8h.html',1,'']]],
-  ['compiled_50',['Compiled',['../classmlx_1_1core_1_1_compiled.html',1,'mlx::core::Compiled'],['../classmlx_1_1core_1_1_compiled.html#a2d8cefff835c419a48a077d306b8e051',1,'mlx::core::Compiled::Compiled()']]],
-  ['compiled_2eh_51',['compiled.h',['../compiled_8h.html',1,'']]],
-  ['compiled_5fallocate_5foutputs_52',['compiled_allocate_outputs',['../namespacemlx_1_1core.html#ab8c3c4fc05745f586de922c8266f4fce',1,'mlx::core']]],
-  ['compiled_5fcheck_5fcontiguity_53',['compiled_check_contiguity',['../namespacemlx_1_1core.html#a3b900ab319948c5a01a3ecd30a709027',1,'mlx::core']]],
-  ['compiled_5fpreamble_2eh_54',['compiled_preamble.h',['../compiled__preamble_8h.html',1,'']]],
-  ['compilemode_55',['CompileMode',['../namespacemlx_1_1core.html#adb15ff2b1ca5207fd4f6e631e2c3bcb4',1,'mlx::core']]],
-  ['complex_2eh_56',['complex.h',['../backend_2metal_2kernels_2complex_8h.html',1,'(Global Namespace)'],['../types_2complex_8h.html',1,'(Global Namespace)']]],
-  ['complex128_5ft_57',['complex128_t',['../structmlx_1_1core_1_1complex128__t.html',1,'mlx::core::complex128_t'],['../structmlx_1_1core_1_1complex128__t.html#aa15d0b805f8790f7c7b76fc7b9d677e0',1,'mlx::core::complex128_t::complex128_t(double v, double u)'],['../structmlx_1_1core_1_1complex128__t.html#abf2842253b874f9f13f39ea68a89e5b6',1,'mlx::core::complex128_t::complex128_t(std::complex&lt; double &gt; v)'],['../structmlx_1_1core_1_1complex128__t.html#a526fba96d7e815360cb4226af085a1bf',1,'mlx::core::complex128_t::complex128_t(T x)']]],
-  ['complex64_58',['complex64',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa8c022579455bcd2c681f007e84f4e2cf',1,'mlx::core::Dtype::complex64'],['../namespacemlx_1_1core.html#af99db87e0078bfcdb383f5689bc874d4',1,'mlx::core::complex64']]],
-  ['complex64_5ft_59',['complex64_t',['../structcomplex64__t.html',1,'complex64_t'],['../structmlx_1_1core_1_1complex64__t.html',1,'mlx::core::complex64_t'],['../structcomplex64__t.html#adbd392a5e92d31997380ad0a38be4be8',1,'complex64_t::complex64_t(float real, float imag)'],['../structcomplex64__t.html#a29782289bb90d6294099667b86509cd3',1,'complex64_t::complex64_t()'],['../structcomplex64__t.html#a905b048d70eb8d748a62454268242291',1,'complex64_t::complex64_t() threadgroup'],['../structcomplex64__t.html#a33a2452eb33b5ed53655773539c357a5',1,'complex64_t::complex64_t(T x) thread'],['../structcomplex64__t.html#a89b65ace8588b7bf215355f705eb23d9',1,'complex64_t::complex64_t(T x) threadgroup'],['../structcomplex64__t.html#ac81b486f642fb3b26c5d659917bdbcd0',1,'complex64_t::complex64_t(T x) device'],['../structcomplex64__t.html#a0a27a41206400f1e62b60ceb56960c93',1,'complex64_t::complex64_t(T x) const ant'],['../structmlx_1_1core_1_1complex64__t.html#a697cc973ae27d63c8e00d830e780bd8c',1,'mlx::core::complex64_t::complex64_t(float v, float u)'],['../structmlx_1_1core_1_1complex64__t.html#ae065e39938f9c4374b4116f4c67d4d09',1,'mlx::core::complex64_t::complex64_t(std::complex&lt; float &gt; v)'],['../structmlx_1_1core_1_1complex64__t.html#a2232cbbe591a9d2bc228cb23fac38b50',1,'mlx::core::complex64_t::complex64_t(T x)']]],
-  ['complex_5fbinop_60',['complex_binop',['../types_2complex_8h.html#a9c7995d495359894e1b30c0f1678d6bd',1,'complex.h']]],
-  ['complex_5fbinop_5fhelper_61',['complex_binop_helper',['../types_2complex_8h.html#ac6890f9852de12339b09b65757ebc8c4',1,'complex.h']]],
-  ['complex_5fmul_62',['complex_mul',['../radix_8h.html#a5bfc53b531214c9ce277bebc18aa67d6',1,'radix.h']]],
-  ['complex_5fmul_5fconj_63',['complex_mul_conj',['../radix_8h.html#a0e2dfd3d1dda09f47ccc64eec35629f3',1,'radix.h']]],
-  ['complexfloating_64',['complexfloating',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2dafb203630099d501ff7c255a574bc4812',1,'mlx::core::Dtype::complexfloating'],['../namespacemlx_1_1core.html#a70b8e88c9df750af984757105af33423',1,'mlx::core::complexfloating']]],
-  ['compute_5fstrided_5findices_65',['compute_strided_indices',['../struct_read_writer.html#a7c903fbb8b85a856ba5564d7df537cdf',1,'ReadWriter']]],
-  ['concatenate_66',['Concatenate',['../classmlx_1_1core_1_1_concatenate.html',1,'mlx::core::Concatenate'],['../classmlx_1_1core_1_1_concatenate.html#acff07853de2d31faeec7c4ca40ce0888',1,'mlx::core::Concatenate::Concatenate()']]],
-  ['concatenate_67',['concatenate',['../group__ops.html#gabdc36fa65697d0361c8d67495de77129',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#gaa95c34ca3a8877f2c50cb60e7fa312b8',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
-  ['concatenate_5fgpu_68',['concatenate_gpu',['../namespacemlx_1_1core.html#a050299d0d366ca5c9d09d1004dcc3e7d',1,'mlx::core']]],
-  ['concurrent_5fqueue_69',['concurrent_queue',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html',1,'pocketfft::detail::threading']]],
-  ['concurrent_5fqueue_3c_20std_3a_3afunction_3c_20void_28_29_3e_20_3e_70',['concurrent_queue&lt; std::function&lt; void()&gt; &gt;',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html',1,'pocketfft::detail::threading']]],
-  ['concurrentcontext_71',['ConcurrentContext',['../structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html',1,'mlx::core::metal::CommandEncoder::ConcurrentContext'],['../structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#aee044d7729739c96e845823f9ecc5174',1,'mlx::core::metal::CommandEncoder::ConcurrentContext::ConcurrentContext()']]],
-  ['cond_72',['cond',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a4ffd524d6a5bedd1a303b63bdde6701c',1,'mlx::core::scheduler::StreamThread']]],
-  ['conj_73',['conj',['../namespacepocketfft_1_1detail.html#a66d79051d502046a9b9f103e744dbad3',1,'pocketfft::detail']]],
-  ['conjugate_74',['Conjugate',['../struct_conjugate.html',1,'Conjugate'],['../classmlx_1_1core_1_1_conjugate.html',1,'mlx::core::Conjugate'],['../structmlx_1_1core_1_1detail_1_1_conjugate.html',1,'mlx::core::detail::Conjugate'],['../classmlx_1_1core_1_1_conjugate.html#a627f9e6a8729fb3ffb3ca3228d007c87',1,'mlx::core::Conjugate::Conjugate()']]],
-  ['conjugate_75',['conjugate',['../group__ops.html#ga5b596906bf8cdc8d97ed6ddc9aeb4c23',1,'mlx::core']]],
-  ['contiguous_76',['contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#afd0ab11e7a486a2a8e50ee84b971ac8a',1,'mlx::core::array::Flags']]],
-  ['contiguous_5fscan_77',['contiguous_scan',['../scan_8h.html#a60d279b9add7d56639bb209408f09d79',1,'scan.h']]],
-  ['contiguousallreduce_78',['ContiguousAllReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ae4e34c7154eb8dc47aa8503209730424',1,'mlx::core']]],
-  ['contiguousiterator_79',['ContiguousIterator',['../structmlx_1_1core_1_1_contiguous_iterator.html',1,'mlx::core::ContiguousIterator&lt; StrideT &gt;'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a68794af4a442d3d8ac4647817af8e1f6',1,'mlx::core::ContiguousIterator::ContiguousIterator()'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a6cb378408b6f546eeb6ade1a4faafe3c',1,'mlx::core::ContiguousIterator::ContiguousIterator(const array &amp;a)'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a16bdacb53f65b7284068cd49d4cba292',1,'mlx::core::ContiguousIterator::ContiguousIterator(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides, int dims)']]],
-  ['contiguousreduce_80',['ContiguousReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ad2547f25dffe8d8936dbec25601cfc84',1,'mlx::core']]],
-  ['contiguousstridedreduce_81',['ContiguousStridedReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ab48dac7508a2c790de1bdc33f29177ed',1,'mlx::core']]],
-  ['conv_82',['conv',['../namespacemlx_1_1core_1_1metal.html#ab1704e853394c725668c06752ebb5c24',1,'mlx::core::metal']]],
-  ['conv_2eh_83',['conv.h',['../conv_8h.html',1,'']]],
-  ['conv1d_84',['conv1d',['../group__ops.html#ga30d47e08093c03a3676f235f9f559411',1,'mlx::core']]],
-  ['conv2d_85',['conv2d',['../group__ops.html#ga73b02833229678786e7f302d458d5a83',1,'mlx::core']]],
-  ['conv2dgeneralbaseinfo_86',['Conv2DGeneralBaseInfo',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html',1,'mlx::steel']]],
-  ['conv2dgeneraljumpparams_87',['Conv2DGeneralJumpParams',['../structmlx_1_1steel_1_1_conv2_d_general_jump_params.html',1,'mlx::steel']]],
-  ['conv2dinputblockloadergeneral_88',['Conv2DInputBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html',1,'mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1d83af561a483432bf8dcb42e734b23b',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::Conv2DInputBlockLoaderGeneral()']]],
-  ['conv2dinputblockloaderlargefilter_89',['Conv2DInputBlockLoaderLargeFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8755116a535539744e4947bc69f9c50f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::Conv2DInputBlockLoaderLargeFilter()']]],
-  ['conv2dinputblockloadersmallchannels_90',['Conv2DInputBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ab9fd3fdeab94470dde3326f1dd5c455a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::Conv2DInputBlockLoaderSmallChannels()']]],
-  ['conv2dinputblockloadersmallfilter_91',['Conv2DInputBlockLoaderSmallFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0a2cbf57c51cd928722e3f06aafcf933',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::Conv2DInputBlockLoaderSmallFilter()']]],
-  ['conv2dweightblockloader_92',['Conv2DWeightBlockLoader',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html',1,'mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a9a7dca3512b64cffb6eac305d795831c',1,'mlx::steel::Conv2DWeightBlockLoader::Conv2DWeightBlockLoader()']]],
-  ['conv2dweightblockloadergeneral_93',['Conv2DWeightBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#ad0550fabbdc9297559381a5b488e9af1',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::Conv2DWeightBlockLoaderGeneral()']]],
-  ['conv2dweightblockloadersmallchannels_94',['Conv2DWeightBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae1806ea1c19713819dee83a38ab35fa6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::Conv2DWeightBlockLoaderSmallChannels()']]],
-  ['conv3d_95',['conv3d',['../group__ops.html#ga6e9907d2f14dc4803e4306b3dbc4b3ca',1,'mlx::core']]],
-  ['conv_5fgeneral_96',['conv_general',['../group__ops.html#ga2236e5dfc7e52e28abf6c21675d0a51e',1,'mlx::core::conv_general(array input, array weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding_lo={}, std::vector&lt; int &gt; padding_hi={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})'],['../group__ops.html#gab59f89942cd1efaadffe9e8762e3c99d',1,'mlx::core::conv_general(const array &amp;input, const array &amp;weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})']]],
-  ['conv_5ftranspose1d_97',['conv_transpose1d',['../group__ops.html#gaa30bf1adcd78d1c2595d07b215731714',1,'mlx::core']]],
-  ['conv_5ftranspose2d_98',['conv_transpose2d',['../group__ops.html#gaebb59971cb9bc45005dc1d398e4f0a3d',1,'mlx::core']]],
-  ['conv_5ftranspose3d_99',['conv_transpose3d',['../group__ops.html#ga8db814da631d9cd32a8d6563bf4ac530',1,'mlx::core']]],
-  ['convolution_100',['Convolution',['../classmlx_1_1core_1_1_convolution.html',1,'mlx::core::Convolution'],['../classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef',1,'mlx::core::Convolution::Convolution()']]],
-  ['copy_101',['Copy',['../classmlx_1_1core_1_1_copy.html',1,'mlx::core::Copy'],['../classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584',1,'mlx::core::Copy::Copy()']]],
-  ['copy_102',['copy',['../namespacemlx_1_1core.html#a479648542a2bea151b947b18f0e79dd2',1,'mlx::core::copy()'],['../namespacemlx_1_1core_1_1metal.html#aa215e631e2680f04a591b88d91571719',1,'mlx::core::metal::copy()'],['../group__ops.html#gae306e93af12f774bd80bad6c231b09d6',1,'mlx::core::copy()']]],
-  ['copy_2eh_103',['copy.h',['../common_2copy_8h.html',1,'(Global Namespace)'],['../metal_2copy_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2copy_8h.html',1,'(Global Namespace)']]],
-  ['copy_5fg_104',['copy_g',['../metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36',1,'copy.h']]],
-  ['copy_5fg_5fnd1_105',['copy_g_nd1',['../metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77',1,'copy.h']]],
-  ['copy_5fg_5fnd2_106',['copy_g_nd2',['../metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c',1,'copy.h']]],
-  ['copy_5fg_5fnd3_107',['copy_g_nd3',['../metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff',1,'copy.h']]],
-  ['copy_5fgg_108',['copy_gg',['../metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5',1,'copy.h']]],
-  ['copy_5fgg_5fnd1_109',['copy_gg_nd1',['../metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1',1,'copy.h']]],
-  ['copy_5fgg_5fnd2_110',['copy_gg_nd2',['../metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950',1,'copy.h']]],
-  ['copy_5fgg_5fnd3_111',['copy_gg_nd3',['../metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd',1,'copy.h']]],
-  ['copy_5fgpu_112',['copy_gpu',['../namespacemlx_1_1core.html#addaa46a13ac2deb1d9ce621338320e0e',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a6a6f4e46c8fc44fdc74c50ace02bcf38',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype)']]],
-  ['copy_5fgpu_5finplace_113',['copy_gpu_inplace',['../namespacemlx_1_1core.html#a69e30f5d30a6d72ac0ffe4886f24b7ba',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a8e1ccb0ed9387b0a789311d9f8964803',1,'mlx::core::copy_gpu_inplace(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#ae55b801b09ccf55cba96278163a9b1ef',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int64_t &gt; &amp;istride, int64_t ioffset, CopyType ctype, const Stream &amp;s)']]],
-  ['copy_5fhartley_114',['copy_hartley',['../namespacepocketfft_1_1detail.html#abac3fcc8ce83800d228774f64c28d4c3',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#ae7b44d2773d9d06a9787aff01d66b3ed',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
-  ['copy_5finplace_115',['copy_inplace',['../namespacemlx_1_1core.html#a98495894a796b2cc6d022e7a03432c64',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, CopyType ctype)'],['../namespacemlx_1_1core.html#aad636e2d0b2f882cadd1b438f4daa9ed',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype)']]],
-  ['copy_5finput_116',['copy_input',['../namespacepocketfft_1_1detail.html#aff05be3064743c1143b19318ab12ad4a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; cmplx&lt; T &gt; &gt; &amp;src, cmplx&lt; vtype_t&lt; T &gt; &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a30fc708f9d8f9cfa74194925c7863c0a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, vtype_t&lt; T &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a3387bd35f237870e42b8461769e6aec4',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, T *dst)']]],
-  ['copy_5foutput_117',['copy_output',['../namespacepocketfft_1_1detail.html#a1523a037300a8da05db210b802d9cb0e',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const cmplx&lt; vtype_t&lt; T &gt; &gt; *src, ndarr&lt; cmplx&lt; T &gt; &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a21980853aca4d92ed06e3dcffe7ef660',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a310481c334e46674710ba794ad7403c0',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
-  ['copy_5fs_118',['copy_s',['../metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea',1,'copy.h']]],
-  ['copy_5fs2_119',['copy_s2',['../metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3',1,'copy.h']]],
-  ['copy_5fshared_5fbuffer_120',['copy_shared_buffer',['../classmlx_1_1core_1_1array.html#a28df7a333d90a311c49bc4bce7a1ad6d',1,'mlx::core::array::copy_shared_buffer(const array &amp;other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a92974c656c35a972ad241f80584bbd29',1,'mlx::core::array::copy_shared_buffer(const array &amp;other)']]],
-  ['copy_5fv_121',['copy_v',['../metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659',1,'copy.h']]],
-  ['copy_5fv2_122',['copy_v2',['../metal_2kernels_2copy_8h.html#aee14a5326f53d9b30b0b38e27d180ef3',1,'copy.h']]],
-  ['copytype_123',['CopyType',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337',1,'mlx::core']]],
-  ['core_20array_20operations_124',['Core array operations',['../group__ops.html',1,'']]],
-  ['cos_125',['Cos',['../struct_cos.html',1,'Cos'],['../classmlx_1_1core_1_1_cos.html',1,'mlx::core::Cos'],['../structmlx_1_1core_1_1detail_1_1_cos.html',1,'mlx::core::detail::Cos'],['../classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995',1,'mlx::core::Cos::Cos()']]],
-  ['cos_126',['cos',['../namespacepocketfft_1_1detail.html#a499c1e8b7d79a5272af024f46c63ff9d',1,'pocketfft::detail::cos()'],['../namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3',1,'metal::cos()'],['../namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88',1,'metal::fast::cos()'],['../namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220',1,'metal::precise::cos()'],['../group__ops.html#ga39dfdf72b556012aa35ff27a94116e74',1,'mlx::core::cos()']]],
-  ['cosh_127',['Cosh',['../struct_cosh.html',1,'Cosh'],['../classmlx_1_1core_1_1_cosh.html',1,'mlx::core::Cosh'],['../structmlx_1_1core_1_1detail_1_1_cosh.html',1,'mlx::core::detail::Cosh'],['../classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1',1,'mlx::core::Cosh::Cosh()']]],
-  ['cosh_128',['cosh',['../namespacemetal.html#a8a68a88cc110830d057dbd71431b93c0',1,'metal::cosh()'],['../namespacemetal_1_1fast.html#a31544ad9de28012a4ddda86e3966a77e',1,'metal::fast::cosh()'],['../namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc',1,'metal::precise::cosh()'],['../group__ops.html#ga2181b71cda88007a3092be4795ff0715',1,'mlx::core::cosh()']]],
-  ['cosine_129',['cosine',['../structpocketfft_1_1detail_1_1_exec_dcst.html#a185023fc1e386cc8f233b79c49c1fd8a',1,'pocketfft::detail::ExecDcst']]],
-  ['cospi_130',['cospi',['../namespacemetal.html#a5c2f37939ad705ddea4409d3bedb8ce1',1,'metal::cospi()'],['../namespacemetal_1_1fast.html#a9906b41f75319b384ffb570cc94d67ce',1,'metal::fast::cospi()'],['../namespacemetal_1_1precise.html#a2392b78bd196efdbbac65901c4ab20e7',1,'metal::precise::cospi()']]],
-  ['cost_5fguess_131',['cost_guess',['../structpocketfft_1_1detail_1_1util.html#ad3d874bc3fb0048df2270779a15d4bd0',1,'pocketfft::detail::util']]],
-  ['count_5fdown_132',['count_down',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#a81d6597189b40410e35f3cd653fd1342',1,'pocketfft::detail::threading::latch']]],
-  ['cpu_133',['cpu',['../structmlx_1_1core_1_1_device.html#a69ee81924251dec96f1945c9d91506fd',1,'mlx::core::Device::cpu'],['../structmlx_1_1core_1_1_device.html#ac45b3de9b3458d8f31005136cde20fdbad9747e2da342bdb995f6389533ad1a3d',1,'mlx::core::Device::cpu']]],
-  ['cross_134',['cross',['../namespacemlx_1_1core_1_1linalg.html#abcda3fbda45183c21e7f27aa0dde64e6',1,'mlx::core::linalg']]],
-  ['ctile_135',['Ctile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88',1,'mlx::steel::BlockMMA']]],
-  ['cummax_136',['CumMax',['../struct_cum_max.html',1,'']]],
-  ['cummax_137',['cummax',['../group__ops.html#gaee37cac8476e8f8d666bcded5bc59143',1,'mlx::core']]],
-  ['cummin_138',['CumMin',['../struct_cum_min.html',1,'']]],
-  ['cummin_139',['cummin',['../group__ops.html#ga19c1bf6929fe8d66b9cd408946aea6a8',1,'mlx::core']]],
-  ['cumprod_140',['CumProd',['../struct_cum_prod.html',1,'']]],
-  ['cumprod_141',['cumprod',['../group__ops.html#ga0d71dfbc14ef3ed564b0c5ee26af680f',1,'mlx::core']]],
-  ['cumprod_3c_20bool_20_3e_142',['CumProd&lt; bool &gt;',['../struct_cum_prod_3_01bool_01_4.html',1,'']]],
-  ['cumsum_143',['CumSum',['../struct_cum_sum.html',1,'']]],
-  ['cumsum_144',['cumsum',['../group__ops.html#gaddc825a5c173e195ab0fda83ad630420',1,'mlx::core']]],
-  ['custom_145',['Custom',['../classmlx_1_1core_1_1fast_1_1_custom.html',1,'mlx::core::fast::Custom'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a4186fea23f7156c38960426821fca313',1,'mlx::core::fast::Custom::Custom()']]],
-  ['custom_5ffunction_146',['custom_function',['../namespacemlx_1_1core.html#a8d3ca5fbaecdb995660c24cde5aeebaf',1,'mlx::core']]],
-  ['custom_5fvjp_147',['custom_vjp',['../namespacemlx_1_1core.html#a9290596250fa308df4c69b44483bb8aa',1,'mlx::core']]],
-  ['customkernel_148',['CustomKernel',['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html',1,'mlx::core::fast::CustomKernel'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153',1,'mlx::core::fast::CustomKernel::CustomKernel()']]],
-  ['customkernelshapeinfo_149',['CustomKernelShapeInfo',['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html',1,'mlx::core::fast']]],
-  ['customtransforms_150',['CustomTransforms',['../classmlx_1_1core_1_1_custom_transforms.html',1,'mlx::core::CustomTransforms'],['../classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488',1,'mlx::core::CustomTransforms::CustomTransforms()']]]
+  ['col_5ffrag_5ftype_35',['col_frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
+  ['col_5freduce_5f2pass_36',['col_reduce_2pass',['../reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29',1,'reduce_col.h']]],
+  ['col_5freduce_5flongcolumn_37',['col_reduce_longcolumn',['../reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2',1,'reduce_col.h']]],
+  ['col_5freduce_5flooped_38',['col_reduce_looped',['../reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02',1,'reduce_col.h']]],
+  ['col_5freduce_5fsmall_39',['col_reduce_small',['../reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec',1,'reduce_col.h']]],
+  ['collapse_5fcontiguous_5fdims_40',['collapse_contiguous_dims',['../namespacemlx_1_1core.html#a38fe6ec5220d13d96c7dad7556d2b613',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; std::vector&lt; int64_t &gt; &gt; &amp;strides, int64_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#af2895f9b0083efd8221275eb8cadccbe',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; std::vector&lt; size_t &gt; &gt; &amp;strides, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a90e2b6edc0fe82230cb93f5ea39febb4',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; array &gt; &amp;xs, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#ac813412cce77fc1340dcfefc6e099276',1,'mlx::core::collapse_contiguous_dims(Arrays &amp;&amp;... xs)'],['../namespacemlx_1_1core.html#aab3cc7f3808934ae0727b920eba231bd',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; int64_t &gt; &amp;strides, int64_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a1e0cbcf109d32794ffc8efc7302ba9b0',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a4ee50bfb240512d0c0ce151dfe2c74ef',1,'mlx::core::collapse_contiguous_dims(const array &amp;a, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())']]],
+  ['commandencoder_41',['CommandEncoder',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html',1,'mlx::core::metal::CommandEncoder'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a2334774486f447213ee997e55c2e52a3',1,'mlx::core::metal::CommandEncoder::CommandEncoder(MTL::CommandBuffer *cbuf)'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ac68ca977b5bde5434284ce7979647f14',1,'mlx::core::metal::CommandEncoder::CommandEncoder(const CommandEncoder &amp;)=delete']]],
+  ['commit_5fcommand_5fbuffer_42',['commit_command_buffer',['../classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c',1,'mlx::core::metal::Device']]],
+  ['commonallocator_43',['CommonAllocator',['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html',1,'mlx::core::allocator']]],
+  ['communication_5fstream_44',['communication_stream',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#ac3612edf0e0e18c1e4ba0ce7c6e35cd6',1,'mlx::core::distributed::detail']]],
+  ['compile_45',['compile',['../namespacemlx_1_1core.html#a3ac798e65e59fe10b7fb5c522efce782',1,'mlx::core::compile()'],['../namespacemlx_1_1core_1_1detail.html#ac3b7b09892ff7290d5f3ef26cb444329',1,'mlx::core::detail::compile()']]],
+  ['compile_2eh_46',['compile.h',['../compile_8h.html',1,'']]],
+  ['compile_5favailable_5ffor_5fdevice_47',['compile_available_for_device',['../namespacemlx_1_1core_1_1detail.html#aeeff2ba6ec3d9d4ed090de6d2681dbc2',1,'mlx::core::detail']]],
+  ['compile_5fclear_5fcache_48',['compile_clear_cache',['../namespacemlx_1_1core_1_1detail.html#a3fb927c209b946aefebb195993fbe4cf',1,'mlx::core::detail']]],
+  ['compile_5ferase_49',['compile_erase',['../namespacemlx_1_1core_1_1detail.html#a69eb76a14f845ca000f1ccb2edda0175',1,'mlx::core::detail']]],
+  ['compile_5fimpl_2eh_50',['compile_impl.h',['../compile__impl_8h.html',1,'']]],
+  ['compiled_51',['Compiled',['../classmlx_1_1core_1_1_compiled.html',1,'mlx::core::Compiled'],['../classmlx_1_1core_1_1_compiled.html#a2d8cefff835c419a48a077d306b8e051',1,'mlx::core::Compiled::Compiled()']]],
+  ['compiled_2eh_52',['compiled.h',['../compiled_8h.html',1,'']]],
+  ['compiled_5fallocate_5foutputs_53',['compiled_allocate_outputs',['../namespacemlx_1_1core.html#ab8c3c4fc05745f586de922c8266f4fce',1,'mlx::core']]],
+  ['compiled_5fcheck_5fcontiguity_54',['compiled_check_contiguity',['../namespacemlx_1_1core.html#a3b900ab319948c5a01a3ecd30a709027',1,'mlx::core']]],
+  ['compiled_5fpreamble_2eh_55',['compiled_preamble.h',['../compiled__preamble_8h.html',1,'']]],
+  ['compilemode_56',['CompileMode',['../namespacemlx_1_1core.html#adb15ff2b1ca5207fd4f6e631e2c3bcb4',1,'mlx::core']]],
+  ['complex_2eh_57',['complex.h',['../backend_2metal_2kernels_2complex_8h.html',1,'(Global Namespace)'],['../types_2complex_8h.html',1,'(Global Namespace)']]],
+  ['complex128_5ft_58',['complex128_t',['../structmlx_1_1core_1_1complex128__t.html',1,'mlx::core::complex128_t'],['../structmlx_1_1core_1_1complex128__t.html#aa15d0b805f8790f7c7b76fc7b9d677e0',1,'mlx::core::complex128_t::complex128_t(double v, double u)'],['../structmlx_1_1core_1_1complex128__t.html#abf2842253b874f9f13f39ea68a89e5b6',1,'mlx::core::complex128_t::complex128_t(std::complex&lt; double &gt; v)'],['../structmlx_1_1core_1_1complex128__t.html#a526fba96d7e815360cb4226af085a1bf',1,'mlx::core::complex128_t::complex128_t(T x)']]],
+  ['complex64_59',['complex64',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa8c022579455bcd2c681f007e84f4e2cf',1,'mlx::core::Dtype::complex64'],['../namespacemlx_1_1core.html#af99db87e0078bfcdb383f5689bc874d4',1,'mlx::core::complex64']]],
+  ['complex64_5ft_60',['complex64_t',['../structcomplex64__t.html',1,'complex64_t'],['../structmlx_1_1core_1_1complex64__t.html',1,'mlx::core::complex64_t'],['../structcomplex64__t.html#adbd392a5e92d31997380ad0a38be4be8',1,'complex64_t::complex64_t(float real, float imag)'],['../structcomplex64__t.html#a29782289bb90d6294099667b86509cd3',1,'complex64_t::complex64_t()'],['../structcomplex64__t.html#a905b048d70eb8d748a62454268242291',1,'complex64_t::complex64_t() threadgroup'],['../structcomplex64__t.html#a33a2452eb33b5ed53655773539c357a5',1,'complex64_t::complex64_t(T x) thread'],['../structcomplex64__t.html#a89b65ace8588b7bf215355f705eb23d9',1,'complex64_t::complex64_t(T x) threadgroup'],['../structcomplex64__t.html#ac81b486f642fb3b26c5d659917bdbcd0',1,'complex64_t::complex64_t(T x) device'],['../structcomplex64__t.html#a0a27a41206400f1e62b60ceb56960c93',1,'complex64_t::complex64_t(T x) const ant'],['../structmlx_1_1core_1_1complex64__t.html#a697cc973ae27d63c8e00d830e780bd8c',1,'mlx::core::complex64_t::complex64_t(float v, float u)'],['../structmlx_1_1core_1_1complex64__t.html#ae065e39938f9c4374b4116f4c67d4d09',1,'mlx::core::complex64_t::complex64_t(std::complex&lt; float &gt; v)'],['../structmlx_1_1core_1_1complex64__t.html#a2232cbbe591a9d2bc228cb23fac38b50',1,'mlx::core::complex64_t::complex64_t(T x)']]],
+  ['complex_5fbinop_61',['complex_binop',['../types_2complex_8h.html#a9c7995d495359894e1b30c0f1678d6bd',1,'complex.h']]],
+  ['complex_5fbinop_5fhelper_62',['complex_binop_helper',['../types_2complex_8h.html#ac6890f9852de12339b09b65757ebc8c4',1,'complex.h']]],
+  ['complex_5fmul_63',['complex_mul',['../radix_8h.html#a5bfc53b531214c9ce277bebc18aa67d6',1,'radix.h']]],
+  ['complex_5fmul_5fconj_64',['complex_mul_conj',['../radix_8h.html#a0e2dfd3d1dda09f47ccc64eec35629f3',1,'radix.h']]],
+  ['complexfloating_65',['complexfloating',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2dafb203630099d501ff7c255a574bc4812',1,'mlx::core::Dtype::complexfloating'],['../namespacemlx_1_1core.html#a70b8e88c9df750af984757105af33423',1,'mlx::core::complexfloating']]],
+  ['compute_5fstrided_5findices_66',['compute_strided_indices',['../struct_read_writer.html#a7c903fbb8b85a856ba5564d7df537cdf',1,'ReadWriter']]],
+  ['concatenate_67',['Concatenate',['../classmlx_1_1core_1_1_concatenate.html',1,'mlx::core::Concatenate'],['../classmlx_1_1core_1_1_concatenate.html#acff07853de2d31faeec7c4ca40ce0888',1,'mlx::core::Concatenate::Concatenate()']]],
+  ['concatenate_68',['concatenate',['../namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d',1,'mlx::core::concatenate(std::string &amp;acc, T first)'],['../namespacemlx_1_1core.html#aaf51544472fa87fa974686eacdd2a4a6',1,'mlx::core::concatenate(std::string &amp;acc, T first, Args... args)'],['../group__ops.html#gabdc36fa65697d0361c8d67495de77129',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#gaa95c34ca3a8877f2c50cb60e7fa312b8',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
+  ['concatenate_5fgpu_69',['concatenate_gpu',['../namespacemlx_1_1core.html#a050299d0d366ca5c9d09d1004dcc3e7d',1,'mlx::core']]],
+  ['concurrent_5fqueue_70',['concurrent_queue',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html',1,'pocketfft::detail::threading']]],
+  ['concurrent_5fqueue_3c_20std_3a_3afunction_3c_20void_28_29_3e_20_3e_71',['concurrent_queue&lt; std::function&lt; void()&gt; &gt;',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html',1,'pocketfft::detail::threading']]],
+  ['concurrentcontext_72',['ConcurrentContext',['../structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html',1,'mlx::core::metal::CommandEncoder::ConcurrentContext'],['../structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#aee044d7729739c96e845823f9ecc5174',1,'mlx::core::metal::CommandEncoder::ConcurrentContext::ConcurrentContext()']]],
+  ['cond_73',['cond',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a4ffd524d6a5bedd1a303b63bdde6701c',1,'mlx::core::scheduler::StreamThread']]],
+  ['conj_74',['conj',['../namespacepocketfft_1_1detail.html#a66d79051d502046a9b9f103e744dbad3',1,'pocketfft::detail']]],
+  ['conjugate_75',['Conjugate',['../struct_conjugate.html',1,'Conjugate'],['../classmlx_1_1core_1_1_conjugate.html',1,'mlx::core::Conjugate'],['../structmlx_1_1core_1_1detail_1_1_conjugate.html',1,'mlx::core::detail::Conjugate'],['../classmlx_1_1core_1_1_conjugate.html#a627f9e6a8729fb3ffb3ca3228d007c87',1,'mlx::core::Conjugate::Conjugate()']]],
+  ['conjugate_76',['conjugate',['../group__ops.html#ga5b596906bf8cdc8d97ed6ddc9aeb4c23',1,'mlx::core']]],
+  ['contiguous_77',['Contiguous',['../classmlx_1_1core_1_1_contiguous.html',1,'mlx::core::Contiguous'],['../classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0',1,'mlx::core::Contiguous::Contiguous()']]],
+  ['contiguous_78',['contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#afd0ab11e7a486a2a8e50ee84b971ac8a',1,'mlx::core::array::Flags::contiguous'],['../group__ops.html#ga8ab10aa6c41416d739791164a52b25d5',1,'mlx::core::contiguous()']]],
+  ['contiguous_5fscan_79',['contiguous_scan',['../scan_8h.html#a60d279b9add7d56639bb209408f09d79',1,'scan.h']]],
+  ['contiguousallreduce_80',['ContiguousAllReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ae4e34c7154eb8dc47aa8503209730424',1,'mlx::core']]],
+  ['contiguousiterator_81',['ContiguousIterator',['../structmlx_1_1core_1_1_contiguous_iterator.html',1,'mlx::core::ContiguousIterator&lt; StrideT &gt;'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a68794af4a442d3d8ac4647817af8e1f6',1,'mlx::core::ContiguousIterator::ContiguousIterator()'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a6cb378408b6f546eeb6ade1a4faafe3c',1,'mlx::core::ContiguousIterator::ContiguousIterator(const array &amp;a)'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a16bdacb53f65b7284068cd49d4cba292',1,'mlx::core::ContiguousIterator::ContiguousIterator(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides, int dims)']]],
+  ['contiguousreduce_82',['ContiguousReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ad2547f25dffe8d8936dbec25601cfc84',1,'mlx::core']]],
+  ['contiguousstridedreduce_83',['ContiguousStridedReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ab48dac7508a2c790de1bdc33f29177ed',1,'mlx::core']]],
+  ['conv_84',['conv',['../namespacemlx_1_1core_1_1metal.html#ab1704e853394c725668c06752ebb5c24',1,'mlx::core::metal']]],
+  ['conv_2eh_85',['conv.h',['../conv_8h.html',1,'']]],
+  ['conv1d_86',['conv1d',['../group__ops.html#ga30d47e08093c03a3676f235f9f559411',1,'mlx::core']]],
+  ['conv2d_87',['conv2d',['../group__ops.html#ga73b02833229678786e7f302d458d5a83',1,'mlx::core']]],
+  ['conv2dgeneralbaseinfo_88',['Conv2DGeneralBaseInfo',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html',1,'mlx::steel']]],
+  ['conv2dgeneraljumpparams_89',['Conv2DGeneralJumpParams',['../structmlx_1_1steel_1_1_conv2_d_general_jump_params.html',1,'mlx::steel']]],
+  ['conv2dinputblockloadergeneral_90',['Conv2DInputBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html',1,'mlx::steel::Conv2DInputBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1d83af561a483432bf8dcb42e734b23b',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::Conv2DInputBlockLoaderGeneral()']]],
+  ['conv2dinputblockloaderlargefilter_91',['Conv2DInputBlockLoaderLargeFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8755116a535539744e4947bc69f9c50f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::Conv2DInputBlockLoaderLargeFilter()']]],
+  ['conv2dinputblockloadersmallchannels_92',['Conv2DInputBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ab9fd3fdeab94470dde3326f1dd5c455a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::Conv2DInputBlockLoaderSmallChannels()']]],
+  ['conv2dinputblockloadersmallfilter_93',['Conv2DInputBlockLoaderSmallFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0a2cbf57c51cd928722e3f06aafcf933',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::Conv2DInputBlockLoaderSmallFilter()']]],
+  ['conv2dweightblockloader_94',['Conv2DWeightBlockLoader',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html',1,'mlx::steel::Conv2DWeightBlockLoader&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a9a7dca3512b64cffb6eac305d795831c',1,'mlx::steel::Conv2DWeightBlockLoader::Conv2DWeightBlockLoader()']]],
+  ['conv2dweightblockloadergeneral_95',['Conv2DWeightBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral&lt; T, BM, BN, BK, tgp_size, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#ad0550fabbdc9297559381a5b488e9af1',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::Conv2DWeightBlockLoaderGeneral()']]],
+  ['conv2dweightblockloadersmallchannels_96',['Conv2DWeightBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels&lt; T, BM, BN, BK, tgp_size, n_channels, tgp_padding &gt;'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae1806ea1c19713819dee83a38ab35fa6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::Conv2DWeightBlockLoaderSmallChannels()']]],
+  ['conv3d_97',['conv3d',['../group__ops.html#ga6e9907d2f14dc4803e4306b3dbc4b3ca',1,'mlx::core']]],
+  ['conv_5fgeneral_98',['conv_general',['../group__ops.html#ga2236e5dfc7e52e28abf6c21675d0a51e',1,'mlx::core::conv_general(array input, array weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding_lo={}, std::vector&lt; int &gt; padding_hi={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})'],['../group__ops.html#gab59f89942cd1efaadffe9e8762e3c99d',1,'mlx::core::conv_general(const array &amp;input, const array &amp;weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})']]],
+  ['conv_5ftranspose1d_99',['conv_transpose1d',['../group__ops.html#gaa30bf1adcd78d1c2595d07b215731714',1,'mlx::core']]],
+  ['conv_5ftranspose2d_100',['conv_transpose2d',['../group__ops.html#gaebb59971cb9bc45005dc1d398e4f0a3d',1,'mlx::core']]],
+  ['conv_5ftranspose3d_101',['conv_transpose3d',['../group__ops.html#ga8db814da631d9cd32a8d6563bf4ac530',1,'mlx::core']]],
+  ['convolution_102',['Convolution',['../classmlx_1_1core_1_1_convolution.html',1,'mlx::core::Convolution'],['../classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef',1,'mlx::core::Convolution::Convolution()']]],
+  ['copy_103',['Copy',['../classmlx_1_1core_1_1_copy.html',1,'mlx::core::Copy'],['../classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584',1,'mlx::core::Copy::Copy()']]],
+  ['copy_104',['copy',['../namespacemlx_1_1core.html#a479648542a2bea151b947b18f0e79dd2',1,'mlx::core::copy()'],['../namespacemlx_1_1core_1_1metal.html#aa215e631e2680f04a591b88d91571719',1,'mlx::core::metal::copy()'],['../group__ops.html#gae306e93af12f774bd80bad6c231b09d6',1,'mlx::core::copy()']]],
+  ['copy_2eh_105',['copy.h',['../common_2copy_8h.html',1,'(Global Namespace)'],['../metal_2copy_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2copy_8h.html',1,'(Global Namespace)']]],
+  ['copy_5fg_106',['copy_g',['../metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf',1,'copy.h']]],
+  ['copy_5fg_5fnd1_107',['copy_g_nd1',['../metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77',1,'copy.h']]],
+  ['copy_5fg_5fnd2_108',['copy_g_nd2',['../metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260',1,'copy.h']]],
+  ['copy_5fg_5fnd3_109',['copy_g_nd3',['../metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc',1,'copy.h']]],
+  ['copy_5fgg_110',['copy_gg',['../metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa',1,'copy.h']]],
+  ['copy_5fgg_5fnd1_111',['copy_gg_nd1',['../metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1',1,'copy.h']]],
+  ['copy_5fgg_5fnd2_112',['copy_gg_nd2',['../metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301',1,'copy.h']]],
+  ['copy_5fgg_5fnd3_113',['copy_gg_nd3',['../metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13',1,'copy.h']]],
+  ['copy_5fgpu_114',['copy_gpu',['../namespacemlx_1_1core.html#addaa46a13ac2deb1d9ce621338320e0e',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a6a6f4e46c8fc44fdc74c50ace02bcf38',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype)']]],
+  ['copy_5fgpu_5finplace_115',['copy_gpu_inplace',['../namespacemlx_1_1core.html#a69e30f5d30a6d72ac0ffe4886f24b7ba',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a8e1ccb0ed9387b0a789311d9f8964803',1,'mlx::core::copy_gpu_inplace(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#ae55b801b09ccf55cba96278163a9b1ef',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int64_t &gt; &amp;istride, int64_t ioffset, CopyType ctype, const Stream &amp;s)']]],
+  ['copy_5fhartley_116',['copy_hartley',['../namespacepocketfft_1_1detail.html#abac3fcc8ce83800d228774f64c28d4c3',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#ae7b44d2773d9d06a9787aff01d66b3ed',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
+  ['copy_5finplace_117',['copy_inplace',['../namespacemlx_1_1core.html#a98495894a796b2cc6d022e7a03432c64',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, CopyType ctype)'],['../namespacemlx_1_1core.html#aad636e2d0b2f882cadd1b438f4daa9ed',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype)']]],
+  ['copy_5finput_118',['copy_input',['../namespacepocketfft_1_1detail.html#aff05be3064743c1143b19318ab12ad4a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; cmplx&lt; T &gt; &gt; &amp;src, cmplx&lt; vtype_t&lt; T &gt; &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a30fc708f9d8f9cfa74194925c7863c0a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, vtype_t&lt; T &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a3387bd35f237870e42b8461769e6aec4',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, T *dst)']]],
+  ['copy_5foutput_119',['copy_output',['../namespacepocketfft_1_1detail.html#a1523a037300a8da05db210b802d9cb0e',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const cmplx&lt; vtype_t&lt; T &gt; &gt; *src, ndarr&lt; cmplx&lt; T &gt; &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a21980853aca4d92ed06e3dcffe7ef660',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a310481c334e46674710ba794ad7403c0',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
+  ['copy_5fs_120',['copy_s',['../metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea',1,'copy.h']]],
+  ['copy_5fs2_121',['copy_s2',['../metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3',1,'copy.h']]],
+  ['copy_5fshared_5fbuffer_122',['copy_shared_buffer',['../classmlx_1_1core_1_1array.html#a28df7a333d90a311c49bc4bce7a1ad6d',1,'mlx::core::array::copy_shared_buffer(const array &amp;other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a92974c656c35a972ad241f80584bbd29',1,'mlx::core::array::copy_shared_buffer(const array &amp;other)']]],
+  ['copy_5fv_123',['copy_v',['../metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659',1,'copy.h']]],
+  ['copy_5fv2_124',['copy_v2',['../metal_2kernels_2copy_8h.html#aee14a5326f53d9b30b0b38e27d180ef3',1,'copy.h']]],
+  ['copytype_125',['CopyType',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337',1,'mlx::core']]],
+  ['core_20array_20operations_126',['Core array operations',['../group__ops.html',1,'']]],
+  ['cos_127',['Cos',['../struct_cos.html',1,'Cos'],['../classmlx_1_1core_1_1_cos.html',1,'mlx::core::Cos'],['../structmlx_1_1core_1_1detail_1_1_cos.html',1,'mlx::core::detail::Cos'],['../classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995',1,'mlx::core::Cos::Cos()']]],
+  ['cos_128',['cos',['../namespacepocketfft_1_1detail.html#a499c1e8b7d79a5272af024f46c63ff9d',1,'pocketfft::detail::cos()'],['../namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3',1,'metal::cos()'],['../namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88',1,'metal::fast::cos()'],['../namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220',1,'metal::precise::cos()'],['../group__ops.html#ga39dfdf72b556012aa35ff27a94116e74',1,'mlx::core::cos()']]],
+  ['cosh_129',['Cosh',['../struct_cosh.html',1,'Cosh'],['../classmlx_1_1core_1_1_cosh.html',1,'mlx::core::Cosh'],['../structmlx_1_1core_1_1detail_1_1_cosh.html',1,'mlx::core::detail::Cosh'],['../classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1',1,'mlx::core::Cosh::Cosh()']]],
+  ['cosh_130',['cosh',['../namespacemetal.html#a8a68a88cc110830d057dbd71431b93c0',1,'metal::cosh()'],['../namespacemetal_1_1fast.html#a31544ad9de28012a4ddda86e3966a77e',1,'metal::fast::cosh()'],['../namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc',1,'metal::precise::cosh()'],['../group__ops.html#ga2181b71cda88007a3092be4795ff0715',1,'mlx::core::cosh()']]],
+  ['cosine_131',['cosine',['../structpocketfft_1_1detail_1_1_exec_dcst.html#a185023fc1e386cc8f233b79c49c1fd8a',1,'pocketfft::detail::ExecDcst']]],
+  ['cospi_132',['cospi',['../namespacemetal.html#a5c2f37939ad705ddea4409d3bedb8ce1',1,'metal::cospi()'],['../namespacemetal_1_1fast.html#a9906b41f75319b384ffb570cc94d67ce',1,'metal::fast::cospi()'],['../namespacemetal_1_1precise.html#a2392b78bd196efdbbac65901c4ab20e7',1,'metal::precise::cospi()']]],
+  ['cost_5fguess_133',['cost_guess',['../structpocketfft_1_1detail_1_1util.html#ad3d874bc3fb0048df2270779a15d4bd0',1,'pocketfft::detail::util']]],
+  ['count_5fdown_134',['count_down',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#a81d6597189b40410e35f3cd653fd1342',1,'pocketfft::detail::threading::latch']]],
+  ['cpu_135',['cpu',['../structmlx_1_1core_1_1_device.html#a69ee81924251dec96f1945c9d91506fd',1,'mlx::core::Device::cpu'],['../structmlx_1_1core_1_1_device.html#ac45b3de9b3458d8f31005136cde20fdbad9747e2da342bdb995f6389533ad1a3d',1,'mlx::core::Device::cpu']]],
+  ['cross_136',['cross',['../namespacemlx_1_1core_1_1linalg.html#abcda3fbda45183c21e7f27aa0dde64e6',1,'mlx::core::linalg']]],
+  ['cshape_137',['CShape',['../structmlx_1_1steel_1_1_c_shape.html',1,'mlx::steel']]],
+  ['ctile_138',['Ctile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6',1,'mlx::steel::BlockMMA']]],
+  ['cummax_139',['CumMax',['../struct_cum_max.html',1,'']]],
+  ['cummax_140',['cummax',['../group__ops.html#gaee37cac8476e8f8d666bcded5bc59143',1,'mlx::core']]],
+  ['cummin_141',['CumMin',['../struct_cum_min.html',1,'']]],
+  ['cummin_142',['cummin',['../group__ops.html#ga19c1bf6929fe8d66b9cd408946aea6a8',1,'mlx::core']]],
+  ['cumprod_143',['CumProd',['../struct_cum_prod.html',1,'']]],
+  ['cumprod_144',['cumprod',['../group__ops.html#ga0d71dfbc14ef3ed564b0c5ee26af680f',1,'mlx::core']]],
+  ['cumprod_3c_20bool_20_3e_145',['CumProd&lt; bool &gt;',['../struct_cum_prod_3_01bool_01_4.html',1,'']]],
+  ['cumsum_146',['CumSum',['../struct_cum_sum.html',1,'']]],
+  ['cumsum_147',['cumsum',['../group__ops.html#gaddc825a5c173e195ab0fda83ad630420',1,'mlx::core']]],
+  ['custom_148',['Custom',['../classmlx_1_1core_1_1fast_1_1_custom.html',1,'mlx::core::fast::Custom'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a4186fea23f7156c38960426821fca313',1,'mlx::core::fast::Custom::Custom()']]],
+  ['custom_5ffunction_149',['custom_function',['../namespacemlx_1_1core.html#a8d3ca5fbaecdb995660c24cde5aeebaf',1,'mlx::core']]],
+  ['custom_5fvjp_150',['custom_vjp',['../namespacemlx_1_1core.html#a9290596250fa308df4c69b44483bb8aa',1,'mlx::core']]],
+  ['customkernel_151',['CustomKernel',['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html',1,'mlx::core::fast::CustomKernel'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153',1,'mlx::core::fast::CustomKernel::CustomKernel()']]],
+  ['customkernelshapeinfo_152',['CustomKernelShapeInfo',['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html',1,'mlx::core::fast']]],
+  ['customtransforms_153',['CustomTransforms',['../classmlx_1_1core_1_1_custom_transforms.html',1,'mlx::core::CustomTransforms'],['../classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488',1,'mlx::core::CustomTransforms::CustomTransforms()']]]
 ];
diff --git a/docs/build/html/search/all_4.js b/docs/build/html/search/all_4.js
index 17515ff9f..cfaf8fd68 100644
--- a/docs/build/html/search/all_4.js
+++ b/docs/build/html/search/all_4.js
@@ -1,64 +1,67 @@
 var searchData=
 [
-  ['d_0',['d',['../classpocketfft_1_1detail_1_1cndarr.html#ac29c769aebb03f81fbcf16ba6e766af2',1,'pocketfft::detail::cndarr::d'],['../structmlx_1_1core_1_1array_1_1_data.html#a25f52ac67912a49bb6e2b6715aa65311',1,'mlx::core::array::Data::d']]],
-  ['data_1',['Data',['../structmlx_1_1core_1_1array_1_1_data.html',1,'mlx::core::array::Data'],['../structmlx_1_1core_1_1array_1_1_data.html#a77e2ea35fac1d54e4062468a432e1482',1,'mlx::core::array::Data::Data(allocator::Buffer buffer, deleter_t d=allocator::free)'],['../structmlx_1_1core_1_1array_1_1_data.html#a50f242040b123052e48e18c244ff70fc',1,'mlx::core::array::Data::Data(const Data &amp;d)=delete']]],
-  ['data_2',['data',['../classpocketfft_1_1detail_1_1arr.html#aec0f2191b4663b4187aab92454c34de8',1,'pocketfft::detail::arr::data()'],['../classpocketfft_1_1detail_1_1arr.html#ac82daa17e9f991072b012343f9d7c182',1,'pocketfft::detail::arr::data() const'],['../classmlx_1_1core_1_1array.html#a72e3ce6c03fefe272cadf214bd127b95',1,'mlx::core::array::data()'],['../classmlx_1_1core_1_1array.html#a99fb28eeab39b9f429373f8bd7557676',1,'mlx::core::array::data() const']]],
-  ['data_5fshared_5fptr_3',['data_shared_ptr',['../classmlx_1_1core_1_1array.html#ab84c792117e29cdf90ef3433303f6141',1,'mlx::core::array']]],
-  ['data_5fsize_4',['data_size',['../classmlx_1_1core_1_1array.html#afaf2a370fa35d96af1b27a4b814e3bfd',1,'mlx::core::array']]],
-  ['dct_5',['dct',['../namespacepocketfft_1_1detail.html#a60615f5b685314c658346c309d5ef2ba',1,'pocketfft::detail']]],
-  ['deallocate_6',['deallocate',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a2a99b8e296d26b255e9937ba5f30e76f',1,'pocketfft::detail::threading::aligned_allocator']]],
-  ['debug_5fset_5fprimitive_5fbuffer_5flabel_7',['debug_set_primitive_buffer_label',['../namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230',1,'mlx::core']]],
-  ['debug_5fset_5fstream_5fqueue_5flabel_8',['debug_set_stream_queue_label',['../namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2',1,'mlx::core']]],
-  ['decompose_5fhadamard_9',['decompose_hadamard',['../namespacemlx_1_1core.html#a3a8fe7ba84714dbb5fdc81e93a07abc8',1,'mlx::core']]],
-  ['default_5f_10',['default_',['../classmlx_1_1core_1_1random_1_1_key_sequence.html#ab5993daeed822c6b970caddab7e3fd90',1,'mlx::core::random::KeySequence']]],
-  ['default_5fdevice_11',['default_device',['../namespacemlx_1_1core.html#a0196171cfe6ee2953113abce597dc815',1,'mlx::core']]],
-  ['default_5fstream_12',['default_stream',['../namespacemlx_1_1core.html#ac198b7e282957c724c84a435e8f1215e',1,'mlx::core']]],
-  ['defaultcontiguousreduce_13',['DefaultContiguousReduce',['../structmlx_1_1core_1_1_default_contiguous_reduce.html',1,'mlx::core::DefaultContiguousReduce&lt; T, U, Op &gt;'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#aeb4fb7fa1a4c8e7d1da1f450ce95c57f',1,'mlx::core::DefaultContiguousReduce::DefaultContiguousReduce()']]],
-  ['defaultstridedreduce_14',['DefaultStridedReduce',['../structmlx_1_1core_1_1_default_strided_reduce.html',1,'mlx::core::DefaultStridedReduce&lt; T, U, Op &gt;'],['../structmlx_1_1core_1_1_default_strided_reduce.html#a477e5dd0dd33071e48769d11d19a13fb',1,'mlx::core::DefaultStridedReduce::DefaultStridedReduce()']]],
-  ['define_5fdefault_5fis_5fequivalent_15',['DEFINE_DEFAULT_IS_EQUIVALENT',['../primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a',1,'primitives.h']]],
-  ['define_5fgrads_16',['DEFINE_GRADS',['../primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6',1,'primitives.h']]],
-  ['define_5finput_5foutput_5fshape_17',['DEFINE_INPUT_OUTPUT_SHAPE',['../primitives_8h.html#a649a06267b75e007224ea4ddefedb999',1,'primitives.h']]],
-  ['define_5fprint_18',['DEFINE_PRINT',['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a8af1e90d4aa56f31ec40ad152ebd2421',1,'mlx::core::distributed::AllGather::DEFINE_PRINT()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a31bf76e24cf3836cf1fd26da30712e31',1,'mlx::core::distributed::Send::DEFINE_PRINT()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a7a0cad13da7cf8e565934318a2bc34f1',1,'mlx::core::distributed::Recv::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae6eea81b5e3789c2f6f376cc07f0a47c',1,'mlx::core::fast::RMSNorm::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a9895733eab845e11484d86cf6ecedced',1,'mlx::core::fast::RMSNormVJP::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a467fcf02b3ddf1d8b6d476b244ae3568',1,'mlx::core::fast::LayerNorm::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a5ab3eb5402c7e8060916056eb2b7887f',1,'mlx::core::fast::LayerNormVJP::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a2b06fe64fa8feca65140632087065e16',1,'mlx::core::fast::RoPE::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a6cc2092fa5b8e7585921b8e0f3ec3db7',1,'mlx::core::fast::ScaledDotProductAttention::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587',1,'mlx::core::fast::AffineQuantize::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a116ecf31c8672c94e5ea06c1d43e9534',1,'mlx::core::fast::CustomKernel::DEFINE_PRINT()'],['../primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592',1,'DEFINE_PRINT:&#160;primitives.h']]],
-  ['define_5fsimd_5fexclusive_5fscan_19',['DEFINE_SIMD_EXCLUSIVE_SCAN',['../scan_8h.html#a185f66aac8c5317587e6abd43f3013fc',1,'scan.h']]],
-  ['define_5fsimd_5freduce_20',['DEFINE_SIMD_REDUCE',['../backend_2metal_2kernels_2reduction_2ops_8h.html#acacf99e0ba629ed062ccc3c2eba89b05',1,'ops.h']]],
-  ['define_5fsimd_5fscan_21',['DEFINE_SIMD_SCAN',['../scan_8h.html#a0d8d6a9b0f3a1263629380bda8eca7bc',1,'scan.h']]],
-  ['define_5fvmap_22',['DEFINE_VMAP',['../primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd',1,'primitives.h']]],
-  ['defines_2eh_23',['defines.h',['../defines_8h.html',1,'(Global Namespace)'],['../steel_2defines_8h.html',1,'(Global Namespace)']]],
-  ['degrees_24',['degrees',['../group__ops.html#ga3a70569b50e1083c5ded199d73fb960c',1,'mlx::core']]],
-  ['deleter_5ft_25',['deleter_t',['../namespacemlx_1_1core.html#a1e6cec03ebd80fd2d6b12b288367bfa8',1,'mlx::core']]],
-  ['denorm_5fmin_26',['denorm_min',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['depends_27',['Depends',['../classmlx_1_1core_1_1_depends.html',1,'mlx::core::Depends'],['../classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62',1,'mlx::core::Depends::Depends()']]],
-  ['depends_28',['depends',['../group__ops.html#gac4a51a68fbe1725436b026d2fbb95759',1,'mlx::core']]],
-  ['dequantize_29',['dequantize',['../quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2',1,'dequantize():&#160;quantized.h'],['../group__ops.html#gabff758a5c1ce32ad7e8b78aba0164077',1,'mlx::core::dequantize()']]],
-  ['detach_30',['detach',['../classmlx_1_1core_1_1array.html#a84948c29df8c957904919c8602692bd2',1,'mlx::core::array']]],
-  ['device_31',['Device',['../structmlx_1_1core_1_1_device.html',1,'mlx::core::Device'],['../classmlx_1_1core_1_1metal_1_1_device.html',1,'mlx::core::metal::Device'],['../classmlx_1_1core_1_1metal_1_1_device.html#ae0db74570eb4b19d8cf19774db91bfd6',1,'mlx::core::metal::Device::Device()'],['../classmlx_1_1core_1_1metal_1_1_device.html#abf59a4addb5473f9e814e3651ba85f06',1,'mlx::core::metal::Device::Device(const Device &amp;)=delete'],['../structmlx_1_1core_1_1_device.html#a481ccfb94d689994396bd353e966b489',1,'mlx::core::Device::Device()']]],
-  ['device_32',['device',['../structmlx_1_1core_1_1_stream.html#a406b1b0162287a4162fab1f70e2ff3bb',1,'mlx::core::Stream::device'],['../classmlx_1_1core_1_1_primitive.html#a8ae61e3289c4134232a69295268f8261',1,'mlx::core::Primitive::device()'],['../namespacemlx_1_1core_1_1metal.html#a910797b74824e6ee576fbb533dee8b57',1,'mlx::core::metal::device()']]],
-  ['device_2eh_33',['device.h',['../backend_2metal_2device_8h.html',1,'(Global Namespace)'],['../device_8h.html',1,'(Global Namespace)']]],
-  ['device_5finfo_34',['device_info',['../namespacemlx_1_1core_1_1metal.html#a6ad19c44efabb7423f973407926ead61',1,'mlx::core::metal']]],
-  ['devicestream_35',['DeviceStream',['../structmlx_1_1core_1_1metal_1_1_device_stream.html',1,'mlx::core::metal::DeviceStream'],['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7',1,'mlx::core::metal::DeviceStream::DeviceStream()']]],
-  ['devicetype_36',['DeviceType',['../structmlx_1_1core_1_1_device.html#ac45b3de9b3458d8f31005136cde20fdb',1,'mlx::core::Device']]],
-  ['diag_37',['diag',['../group__ops.html#ga11af511875640e1fa88e0ca87e199344',1,'mlx::core']]],
-  ['diagonal_38',['diagonal',['../group__ops.html#ga9236b085a88ead3128ed8079d009cac6',1,'mlx::core']]],
-  ['difference_5ftype_39',['difference_type',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#adcee44c77980fc2370a2c31e203aead5',1,'mlx::core::array::ArrayIterator']]],
-  ['digits_40',['digits',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['digits10_41',['digits10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['disable_5fcompile_42',['disable_compile',['../namespacemlx_1_1core.html#a5f5fea955057bb3842b271b037909e66',1,'mlx::core']]],
-  ['disabled_43',['disabled',['../namespacemlx_1_1core.html#adb15ff2b1ca5207fd4f6e631e2c3bcb4a075ae3d2fc31640504f814f60e5ef713',1,'mlx::core']]],
-  ['dispatchthreadgroups_44',['dispatchThreadgroups',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e',1,'mlx::core::metal::CommandEncoder']]],
-  ['dispatchthreads_45',['dispatchThreads',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810',1,'mlx::core::metal::CommandEncoder']]],
-  ['distprimitive_46',['DistPrimitive',['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html',1,'mlx::core::distributed::DistPrimitive'],['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html#a8c54166951522c2a52ef39fce8c87f8f',1,'mlx::core::distributed::DistPrimitive::DistPrimitive()']]],
-  ['distributed_2eh_47',['distributed.h',['../distributed_8h.html',1,'']]],
-  ['distributed_5fimpl_2eh_48',['distributed_impl.h',['../distributed__impl_8h.html',1,'']]],
-  ['divide_49',['Divide',['../struct_divide.html',1,'Divide'],['../structmlx_1_1core_1_1detail_1_1_divide.html',1,'mlx::core::detail::Divide'],['../classmlx_1_1core_1_1_divide.html',1,'mlx::core::Divide'],['../classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb',1,'mlx::core::Divide::Divide()']]],
-  ['divide_50',['divide',['../namespacemetal.html#a2aea493fc1a874970b77ed0031e965df',1,'metal::divide()'],['../namespacemetal_1_1fast.html#ae70bc2185e4649369cf7b15f5e1d48be',1,'metal::fast::divide()'],['../namespacemetal_1_1precise.html#aec0982cdb96a08b61f51129150d82e9d',1,'metal::precise::divide()'],['../group__ops.html#ga77472dd06cfa7a30a42e4fd927bd859f',1,'mlx::core::divide()']]],
-  ['divmod_51',['DivMod',['../struct_div_mod.html',1,'DivMod'],['../classmlx_1_1core_1_1_div_mod.html',1,'mlx::core::DivMod'],['../classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826',1,'mlx::core::DivMod::DivMod()']]],
-  ['divmod_52',['divmod',['../group__ops.html#gaa30ebc0a8376dbc3f7e46a47052b5894',1,'mlx::core']]],
-  ['do_5faxpby_53',['do_axpby',['../steel__gemm__fused_8h.html#a703f06c849c89c37af7b1d27b0804a29',1,'steel_gemm_fused.h']]],
-  ['do_5fgather_54',['do_gather',['../steel__gemm__fused_8h.html#a60efac3ac3b7cd64d096bbae38a3ac69',1,'steel_gemm_fused.h']]],
-  ['do_5fread_55',['do_read',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a13eb86acf6abe288c19645935a47d2ad',1,'mlx::steel::Conv2DWeightBlockLoader::do_read'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a640155880483e1042ec5f647b9adaac6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::do_read']]],
-  ['dst_56',['dst',['../struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83',1,'QuantizedBlockLoader::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7',1,'mlx::steel::Conv2DWeightBlockLoader::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst'],['../structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec',1,'mlx::steel::BlockLoader::dst'],['../namespacepocketfft_1_1detail.html#add0f231fc8a1ce01b90a90faeebcb4eb',1,'pocketfft::detail::dst()']]],
-  ['dst_5fld_57',['dst_ld',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a91192d512e7a18c2d16a139065000959',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a9e59da7e4436e61b2d3c3f982355910b',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a0ff5a6d503e0bbac4634030a75ab818d',1,'mlx::steel::Conv2DWeightBlockLoader::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ae71570942c7b0ad8e67c62662b336c4a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ac18eeebea26cc6da434ead6eb4397350',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a07c85eab8cbf7b02c60df29cf32031ef',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aae121ca6016fc6c7255027b3641f3a09',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst_ld']]],
-  ['dtype_58',['Dtype',['../structmlx_1_1core_1_1_dtype.html',1,'mlx::core::Dtype'],['../structmlx_1_1core_1_1_dtype.html#aec17f0a4a51729e5ac40b62f0aa765d1',1,'mlx::core::Dtype::Dtype()']]],
-  ['dtype_59',['dtype',['../classmlx_1_1core_1_1array.html#ae29e7d6fbfbea1e5e321a8d1ea3cfacd',1,'mlx::core::array']]],
-  ['dtype_2eh_60',['dtype.h',['../dtype_8h.html',1,'']]]
+  ['d_0',['D',['../structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3',1,'mlx::steel::AttnParams']]],
+  ['d_1',['d',['../classpocketfft_1_1detail_1_1cndarr.html#ac29c769aebb03f81fbcf16ba6e766af2',1,'pocketfft::detail::cndarr::d'],['../structmlx_1_1core_1_1array_1_1_data.html#a25f52ac67912a49bb6e2b6715aa65311',1,'mlx::core::array::Data::d']]],
+  ['data_2',['Data',['../structmlx_1_1core_1_1array_1_1_data.html',1,'mlx::core::array::Data'],['../structmlx_1_1core_1_1array_1_1_data.html#a77e2ea35fac1d54e4062468a432e1482',1,'mlx::core::array::Data::Data(allocator::Buffer buffer, deleter_t d=allocator::free)'],['../structmlx_1_1core_1_1array_1_1_data.html#a50f242040b123052e48e18c244ff70fc',1,'mlx::core::array::Data::Data(const Data &amp;d)=delete']]],
+  ['data_3',['data',['../classpocketfft_1_1detail_1_1arr.html#aec0f2191b4663b4187aab92454c34de8',1,'pocketfft::detail::arr::data()'],['../classpocketfft_1_1detail_1_1arr.html#ac82daa17e9f991072b012343f9d7c182',1,'pocketfft::detail::arr::data() const'],['../classmlx_1_1core_1_1array.html#a72e3ce6c03fefe272cadf214bd127b95',1,'mlx::core::array::data()'],['../classmlx_1_1core_1_1array.html#a99fb28eeab39b9f429373f8bd7557676',1,'mlx::core::array::data() const']]],
+  ['data_5fshared_5fptr_4',['data_shared_ptr',['../classmlx_1_1core_1_1array.html#ab84c792117e29cdf90ef3433303f6141',1,'mlx::core::array']]],
+  ['data_5fsize_5',['data_size',['../classmlx_1_1core_1_1array.html#afaf2a370fa35d96af1b27a4b814e3bfd',1,'mlx::core::array']]],
+  ['dct_6',['dct',['../namespacepocketfft_1_1detail.html#a60615f5b685314c658346c309d5ef2ba',1,'pocketfft::detail']]],
+  ['deallocate_7',['deallocate',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a2a99b8e296d26b255e9937ba5f30e76f',1,'pocketfft::detail::threading::aligned_allocator']]],
+  ['debug_5fset_5fprimitive_5fbuffer_5flabel_8',['debug_set_primitive_buffer_label',['../namespacemlx_1_1core.html#a489e45b3a5cd8b46e8ea56b9132eb230',1,'mlx::core']]],
+  ['debug_5fset_5fstream_5fqueue_5flabel_9',['debug_set_stream_queue_label',['../namespacemlx_1_1core.html#a79817d2432e782e596c9c49a08b93be2',1,'mlx::core']]],
+  ['decompose_5fhadamard_10',['decompose_hadamard',['../namespacemlx_1_1core.html#a3a8fe7ba84714dbb5fdc81e93a07abc8',1,'mlx::core']]],
+  ['default_5f_11',['default_',['../classmlx_1_1core_1_1random_1_1_key_sequence.html#ab5993daeed822c6b970caddab7e3fd90',1,'mlx::core::random::KeySequence']]],
+  ['default_5fdevice_12',['default_device',['../namespacemlx_1_1core.html#a0196171cfe6ee2953113abce597dc815',1,'mlx::core']]],
+  ['default_5fstream_13',['default_stream',['../namespacemlx_1_1core.html#ac198b7e282957c724c84a435e8f1215e',1,'mlx::core']]],
+  ['defaultcontiguousreduce_14',['DefaultContiguousReduce',['../structmlx_1_1core_1_1_default_contiguous_reduce.html',1,'mlx::core::DefaultContiguousReduce&lt; T, U, Op &gt;'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#aeb4fb7fa1a4c8e7d1da1f450ce95c57f',1,'mlx::core::DefaultContiguousReduce::DefaultContiguousReduce()']]],
+  ['defaultstridedreduce_15',['DefaultStridedReduce',['../structmlx_1_1core_1_1_default_strided_reduce.html',1,'mlx::core::DefaultStridedReduce&lt; T, U, Op &gt;'],['../structmlx_1_1core_1_1_default_strided_reduce.html#a477e5dd0dd33071e48769d11d19a13fb',1,'mlx::core::DefaultStridedReduce::DefaultStridedReduce()']]],
+  ['define_5fdefault_5fis_5fequivalent_16',['DEFINE_DEFAULT_IS_EQUIVALENT',['../primitives_8h.html#a0fb9d19207dc4869aca35abfbdf4d70a',1,'primitives.h']]],
+  ['define_5fgrads_17',['DEFINE_GRADS',['../primitives_8h.html#a77abdcb55bc2eb0f9a45edc5ee639bf6',1,'primitives.h']]],
+  ['define_5finput_5foutput_5fshape_18',['DEFINE_INPUT_OUTPUT_SHAPE',['../primitives_8h.html#a649a06267b75e007224ea4ddefedb999',1,'primitives.h']]],
+  ['define_5fprint_19',['DEFINE_PRINT',['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a8af1e90d4aa56f31ec40ad152ebd2421',1,'mlx::core::distributed::AllGather::DEFINE_PRINT()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a31bf76e24cf3836cf1fd26da30712e31',1,'mlx::core::distributed::Send::DEFINE_PRINT()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a7a0cad13da7cf8e565934318a2bc34f1',1,'mlx::core::distributed::Recv::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae6eea81b5e3789c2f6f376cc07f0a47c',1,'mlx::core::fast::RMSNorm::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a9895733eab845e11484d86cf6ecedced',1,'mlx::core::fast::RMSNormVJP::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a467fcf02b3ddf1d8b6d476b244ae3568',1,'mlx::core::fast::LayerNorm::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a5ab3eb5402c7e8060916056eb2b7887f',1,'mlx::core::fast::LayerNormVJP::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a2b06fe64fa8feca65140632087065e16',1,'mlx::core::fast::RoPE::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a6cc2092fa5b8e7585921b8e0f3ec3db7',1,'mlx::core::fast::ScaledDotProductAttention::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a4b8f1b1f633002c8ca6fa8f0ef4dd587',1,'mlx::core::fast::AffineQuantize::DEFINE_PRINT()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a116ecf31c8672c94e5ea06c1d43e9534',1,'mlx::core::fast::CustomKernel::DEFINE_PRINT()'],['../primitives_8h.html#a1d3a37af519e16f6a703b1e9ebd0f592',1,'DEFINE_PRINT:&#160;primitives.h']]],
+  ['define_5fsimd_5fexclusive_5fscan_20',['DEFINE_SIMD_EXCLUSIVE_SCAN',['../scan_8h.html#a185f66aac8c5317587e6abd43f3013fc',1,'scan.h']]],
+  ['define_5fsimd_5freduce_21',['DEFINE_SIMD_REDUCE',['../backend_2metal_2kernels_2reduction_2ops_8h.html#acacf99e0ba629ed062ccc3c2eba89b05',1,'ops.h']]],
+  ['define_5fsimd_5fscan_22',['DEFINE_SIMD_SCAN',['../scan_8h.html#a0d8d6a9b0f3a1263629380bda8eca7bc',1,'scan.h']]],
+  ['define_5fvmap_23',['DEFINE_VMAP',['../primitives_8h.html#adc0fbd79fe0d1114dc85da4ed99798bd',1,'primitives.h']]],
+  ['defines_2eh_24',['defines.h',['../defines_8h.html',1,'(Global Namespace)'],['../steel_2defines_8h.html',1,'(Global Namespace)']]],
+  ['degrees_25',['degrees',['../group__ops.html#ga3a70569b50e1083c5ded199d73fb960c',1,'mlx::core']]],
+  ['deleter_5ft_26',['deleter_t',['../namespacemlx_1_1core.html#a1e6cec03ebd80fd2d6b12b288367bfa8',1,'mlx::core']]],
+  ['denorm_5fmin_27',['denorm_min',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a6a9dbcba4dd79cad50876dda506b9eed',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['depends_28',['Depends',['../classmlx_1_1core_1_1_depends.html',1,'mlx::core::Depends'],['../classmlx_1_1core_1_1_depends.html#a4ccb792c99f5d8d133d3fac29f7d3f62',1,'mlx::core::Depends::Depends()']]],
+  ['depends_29',['depends',['../group__ops.html#gac4a51a68fbe1725436b026d2fbb95759',1,'mlx::core']]],
+  ['dequantize_30',['dequantize',['../quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2',1,'dequantize():&#160;quantized.h'],['../group__ops.html#gabff758a5c1ce32ad7e8b78aba0164077',1,'mlx::core::dequantize()']]],
+  ['detach_31',['detach',['../classmlx_1_1core_1_1array.html#a84948c29df8c957904919c8602692bd2',1,'mlx::core::array']]],
+  ['device_32',['Device',['../structmlx_1_1core_1_1_device.html',1,'mlx::core::Device'],['../classmlx_1_1core_1_1metal_1_1_device.html',1,'mlx::core::metal::Device'],['../classmlx_1_1core_1_1metal_1_1_device.html#ae0db74570eb4b19d8cf19774db91bfd6',1,'mlx::core::metal::Device::Device()'],['../classmlx_1_1core_1_1metal_1_1_device.html#abf59a4addb5473f9e814e3651ba85f06',1,'mlx::core::metal::Device::Device(const Device &amp;)=delete'],['../structmlx_1_1core_1_1_device.html#a481ccfb94d689994396bd353e966b489',1,'mlx::core::Device::Device()']]],
+  ['device_33',['device',['../structmlx_1_1core_1_1_stream.html#a406b1b0162287a4162fab1f70e2ff3bb',1,'mlx::core::Stream::device'],['../classmlx_1_1core_1_1_primitive.html#a8ae61e3289c4134232a69295268f8261',1,'mlx::core::Primitive::device()'],['../namespacemlx_1_1core_1_1metal.html#a910797b74824e6ee576fbb533dee8b57',1,'mlx::core::metal::device()']]],
+  ['device_2eh_34',['device.h',['../backend_2metal_2device_8h.html',1,'(Global Namespace)'],['../device_8h.html',1,'(Global Namespace)']]],
+  ['device_5finfo_35',['device_info',['../namespacemlx_1_1core_1_1metal.html#a6ad19c44efabb7423f973407926ead61',1,'mlx::core::metal']]],
+  ['devicestream_36',['DeviceStream',['../structmlx_1_1core_1_1metal_1_1_device_stream.html',1,'mlx::core::metal::DeviceStream'],['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a573326bc8b48e39076850c7bf52ad0d7',1,'mlx::core::metal::DeviceStream::DeviceStream()']]],
+  ['devicetype_37',['DeviceType',['../structmlx_1_1core_1_1_device.html#ac45b3de9b3458d8f31005136cde20fdb',1,'mlx::core::Device']]],
+  ['diag_38',['diag',['../group__ops.html#ga11af511875640e1fa88e0ca87e199344',1,'mlx::core']]],
+  ['diagonal_39',['diagonal',['../group__ops.html#ga9236b085a88ead3128ed8079d009cac6',1,'mlx::core']]],
+  ['difference_5ftype_40',['difference_type',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#adcee44c77980fc2370a2c31e203aead5',1,'mlx::core::array::ArrayIterator']]],
+  ['digits_41',['digits',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['digits10_42',['digits10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['dim_43',['dim',['../struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364',1,'LoopedElemToLoc::dim'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::dim']]],
+  ['disable_5fcompile_44',['disable_compile',['../namespacemlx_1_1core.html#a5f5fea955057bb3842b271b037909e66',1,'mlx::core']]],
+  ['disabled_45',['disabled',['../namespacemlx_1_1core.html#adb15ff2b1ca5207fd4f6e631e2c3bcb4a075ae3d2fc31640504f814f60e5ef713',1,'mlx::core']]],
+  ['dispatch_5fthreadgroups_46',['dispatch_threadgroups',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d',1,'mlx::core::metal::CommandEncoder']]],
+  ['dispatch_5fthreads_47',['dispatch_threads',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05',1,'mlx::core::metal::CommandEncoder']]],
+  ['distprimitive_48',['DistPrimitive',['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html',1,'mlx::core::distributed::DistPrimitive'],['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html#a8c54166951522c2a52ef39fce8c87f8f',1,'mlx::core::distributed::DistPrimitive::DistPrimitive()']]],
+  ['distributed_2eh_49',['distributed.h',['../distributed_8h.html',1,'']]],
+  ['distributed_5fimpl_2eh_50',['distributed_impl.h',['../distributed__impl_8h.html',1,'']]],
+  ['divide_51',['Divide',['../struct_divide.html',1,'Divide'],['../structmlx_1_1core_1_1detail_1_1_divide.html',1,'mlx::core::detail::Divide'],['../classmlx_1_1core_1_1_divide.html',1,'mlx::core::Divide'],['../classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb',1,'mlx::core::Divide::Divide()']]],
+  ['divide_52',['divide',['../namespacemetal.html#a2aea493fc1a874970b77ed0031e965df',1,'metal::divide()'],['../namespacemetal_1_1fast.html#ae70bc2185e4649369cf7b15f5e1d48be',1,'metal::fast::divide()'],['../namespacemetal_1_1precise.html#aec0982cdb96a08b61f51129150d82e9d',1,'metal::precise::divide()'],['../group__ops.html#ga77472dd06cfa7a30a42e4fd927bd859f',1,'mlx::core::divide()']]],
+  ['divmod_53',['DivMod',['../struct_div_mod.html',1,'DivMod'],['../classmlx_1_1core_1_1_div_mod.html',1,'mlx::core::DivMod'],['../classmlx_1_1core_1_1_div_mod.html#a859e3b6149cdceab1c7ccfd2246fb826',1,'mlx::core::DivMod::DivMod()']]],
+  ['divmod_54',['divmod',['../group__ops.html#gaa30ebc0a8376dbc3f7e46a47052b5894',1,'mlx::core']]],
+  ['divop_55',['DivOp',['../struct_div_op.html',1,'']]],
+  ['do_5faxpby_56',['do_axpby',['../steel__gemm__fused_8h.html#a703f06c849c89c37af7b1d27b0804a29',1,'steel_gemm_fused.h']]],
+  ['do_5fgather_57',['do_gather',['../steel__gemm__fused_8h.html#a60efac3ac3b7cd64d096bbae38a3ac69',1,'steel_gemm_fused.h']]],
+  ['do_5fread_58',['do_read',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a13eb86acf6abe288c19645935a47d2ad',1,'mlx::steel::Conv2DWeightBlockLoader::do_read'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a640155880483e1042ec5f647b9adaac6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::do_read']]],
+  ['dst_59',['dst',['../struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83',1,'QuantizedBlockLoader::dst'],['../structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2',1,'mlx::steel::BlockLoader::dst'],['../structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db',1,'mlx::steel::BlockLoaderT::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7',1,'mlx::steel::Conv2DWeightBlockLoader::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst'],['../namespacepocketfft_1_1detail.html#add0f231fc8a1ce01b90a90faeebcb4eb',1,'pocketfft::detail::dst()']]],
+  ['dst_5fld_60',['dst_ld',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a91192d512e7a18c2d16a139065000959',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a9e59da7e4436e61b2d3c3f982355910b',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a0ff5a6d503e0bbac4634030a75ab818d',1,'mlx::steel::Conv2DWeightBlockLoader::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ae71570942c7b0ad8e67c62662b336c4a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ac18eeebea26cc6da434ead6eb4397350',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a07c85eab8cbf7b02c60df29cf32031ef',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aae121ca6016fc6c7255027b3641f3a09',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst_ld']]],
+  ['dtype_61',['Dtype',['../structmlx_1_1core_1_1_dtype.html',1,'mlx::core::Dtype'],['../structmlx_1_1core_1_1_dtype.html#aec17f0a4a51729e5ac40b62f0aa765d1',1,'mlx::core::Dtype::Dtype()']]],
+  ['dtype_62',['dtype',['../classmlx_1_1core_1_1array.html#ae29e7d6fbfbea1e5e321a8d1ea3cfacd',1,'mlx::core::array']]],
+  ['dtype_2eh_63',['dtype.h',['../dtype_8h.html',1,'']]]
 ];
diff --git a/docs/build/html/search/all_5.js b/docs/build/html/search/all_5.js
index 1fb21449f..d3f39741c 100644
--- a/docs/build/html/search/all_5.js
+++ b/docs/build/html/search/all_5.js
@@ -8,15 +8,15 @@ var searchData=
   ['einsum_2eh_5',['einsum.h',['../einsum_8h.html',1,'']]],
   ['einsum_5fpath_6',['einsum_path',['../namespacemlx_1_1core.html#ab14ec41f17675691c1fdebb8990b6695',1,'mlx::core']]],
   ['elem_7',['elem',['../struct_read_writer.html#a32b82adcd6ed324ce235e9f5ad780ede',1,'ReadWriter']]],
-  ['elem_5fto_5floc_8',['elem_to_loc',['../namespacemlx_1_1core.html#a77657cb50fd9392f7f4c64e43843c2b3',1,'mlx::core::elem_to_loc(int elem, const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides)'],['../namespacemlx_1_1core.html#ad7e4f40eb351b554bbfabb6d7d600d06',1,'mlx::core::elem_to_loc(int elem, const array &amp;a)'],['../backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1',1,'elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#aa6b041005351293e68e19b5abf1286cd',1,'elem_to_loc(stride_t elem, constant const int *shape, constant const stride_t *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a37e00d94751710e81c9632bca2f91e51',1,'elem_to_loc(uint3 elem, constant const int *shape, constant const stride_t *strides, int ndim):&#160;utils.h']]],
-  ['elem_5fto_5floc_5f1_9',['elem_to_loc_1',['../backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d',1,'utils.h']]],
-  ['elem_5fto_5floc_5f2_10',['elem_to_loc_2',['../backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79',1,'utils.h']]],
-  ['elem_5fto_5floc_5f2_5fnd_11',['elem_to_loc_2_nd',['../backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953',1,'utils.h']]],
-  ['elem_5fto_5floc_5f3_12',['elem_to_loc_3',['../backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330',1,'utils.h']]],
-  ['elem_5fto_5floc_5f3_5fnd_13',['elem_to_loc_3_nd',['../backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b',1,'utils.h']]],
+  ['elem_5fto_5floc_8',['elem_to_loc',['../namespacemlx_1_1core.html#a77657cb50fd9392f7f4c64e43843c2b3',1,'mlx::core::elem_to_loc(int elem, const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides)'],['../namespacemlx_1_1core.html#ad7e4f40eb351b554bbfabb6d7d600d06',1,'mlx::core::elem_to_loc(int elem, const array &amp;a)'],['../backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5',1,'elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a4b53fb0679f67f9063deba94753d4185',1,'elem_to_loc(StrideT elem, constant const int *shape, constant const StrideT *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#aec82f4bf0e22b8d1b89ad654ad8d8753',1,'elem_to_loc(uint3 elem, constant const int *shape, constant const StrideT *strides, int ndim):&#160;utils.h']]],
+  ['elem_5fto_5floc_5f1_9',['elem_to_loc_1',['../backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3',1,'utils.h']]],
+  ['elem_5fto_5floc_5f2_10',['elem_to_loc_2',['../backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de',1,'utils.h']]],
+  ['elem_5fto_5floc_5f2_5fnd_11',['elem_to_loc_2_nd',['../backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a',1,'utils.h']]],
+  ['elem_5fto_5floc_5f3_12',['elem_to_loc_3',['../backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc',1,'utils.h']]],
+  ['elem_5fto_5floc_5f3_5fnd_13',['elem_to_loc_3_nd',['../backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733',1,'utils.h']]],
   ['elem_5fto_5floc_5fbroadcast_14',['elem_to_loc_broadcast',['../backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f',1,'elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2steel_2utils_8h.html#a42bd57d203a40d3d7d429f2333590a3c',1,'elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim):&#160;utils.h']]],
-  ['elem_5ftype_15',['elem_type',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c',1,'mlx::steel::MMATile']]],
-  ['elems_16',['elems',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc',1,'mlx::steel::MMATile::elems()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1',1,'mlx::steel::MMATile::elems() const']]],
+  ['elem_5ftype_15',['elem_type',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628',1,'mlx::steel::MMATile']]],
+  ['elems_16',['elems',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc',1,'mlx::steel::MMATile::elems()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1',1,'mlx::steel::MMATile::elems() const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc',1,'mlx::steel::MMATile::elems()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1',1,'mlx::steel::MMATile::elems() const']]],
   ['elems_5fper_5fthread_17',['elems_per_thread',['../struct_read_writer.html#a444230a0182ce6ba1898c04ce6e669a7',1,'ReadWriter']]],
   ['elems_5fper_5fthread_5f_18',['elems_per_thread_',['../backend_2metal_2kernels_2fft_8h.html#ad395c11e6f2aee72cd1928fba93a35a3',1,'fft.h']]],
   ['empty_19',['empty',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#a1269e5da40c3f5145c895cee3641879a',1,'pocketfft::detail::threading::concurrent_queue']]],
@@ -37,8 +37,8 @@ var searchData=
   ['erfinv_34',['ErfInv',['../struct_erf_inv.html',1,'ErfInv'],['../structmlx_1_1core_1_1detail_1_1_erf_inv.html',1,'mlx::core::detail::ErfInv'],['../classmlx_1_1core_1_1_erf_inv.html',1,'mlx::core::ErfInv'],['../classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478',1,'mlx::core::ErfInv::ErfInv()']]],
   ['erfinv_35',['erfinv',['../erf_8h.html#a1846e0d683c7aff826bb32addcc3b885',1,'erfinv():&#160;erf.h'],['../group__ops.html#ga76fb9062c64264e34d2e07013390557c',1,'mlx::core::erfinv()']]],
   ['eval_36',['eval',['../classmlx_1_1core_1_1array.html#a2820c45188071a22175e9fa42e10a49a',1,'mlx::core::array::eval()'],['../namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299',1,'mlx::core::eval(std::vector&lt; array &gt; outputs)'],['../namespacemlx_1_1core.html#adb14f689c9f75f7901edb196c2bfb971',1,'mlx::core::eval(Arrays &amp;&amp;... outputs)']]],
-  ['eval_5fcpu_37',['eval_cpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e',1,'mlx::core::distributed::AllReduce::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5',1,'mlx::core::distributed::AllGather::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051',1,'mlx::core::distributed::Send::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a',1,'mlx::core::distributed::Recv::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f',1,'mlx::core::fast::RMSNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439',1,'mlx::core::fast::RMSNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05',1,'mlx::core::fast::LayerNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b',1,'mlx::core::fast::LayerNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e',1,'mlx::core::fast::RoPE::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328',1,'mlx::core::fast::ScaledDotProductAttention::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd',1,'mlx::core::fast::AffineQuantize::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad',1,'mlx::core::fast::CustomKernel::eval_cpu()'],['../classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575',1,'mlx::core::Primitive::eval_cpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#aa0ed6e32c36200a3ff9bc592c9b300db',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60',1,'mlx::core::Abs::eval_cpu()'],['../classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f',1,'mlx::core::Add::eval_cpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c',1,'mlx::core::AddMM::eval_cpu()'],['../classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1',1,'mlx::core::Arange::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006',1,'mlx::core::ArcCos::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9',1,'mlx::core::ArcCosh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4',1,'mlx::core::ArcSin::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066',1,'mlx::core::ArcSinh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3',1,'mlx::core::ArcTan::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c',1,'mlx::core::ArcTan2::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd',1,'mlx::core::ArcTanh::eval_cpu()'],['../classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828',1,'mlx::core::ArgPartition::eval_cpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287',1,'mlx::core::ArgReduce::eval_cpu()'],['../classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa',1,'mlx::core::ArgSort::eval_cpu()'],['../classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d',1,'mlx::core::AsType::eval_cpu()'],['../classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193',1,'mlx::core::AsStrided::eval_cpu()'],['
../classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283',1,'mlx::core::BitwiseBinary::eval_cpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2',1,'mlx::core::BlockMaskedMM::eval_cpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730',1,'mlx::core::GatherMM::eval_cpu()'],['../classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780',1,'mlx::core::Broadcast::eval_cpu()'],['../classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035',1,'mlx::core::Ceil::eval_cpu()'],['../classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151',1,'mlx::core::Compiled::eval_cpu()'],['../classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258',1,'mlx::core::Concatenate::eval_cpu()'],['../classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61',1,'mlx::core::Conjugate::eval_cpu()'],['../classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b',1,'mlx::core::Convolution::eval_cpu()'],['../classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c',1,'mlx::core::Copy::eval_cpu()'],['../classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152',1,'mlx::core::Cos::eval_cpu()'],['../classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d',1,'mlx::core::Cosh::eval_cpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184',1,'mlx::core::CustomTransforms::eval_cpu()'],['../classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e',1,'mlx::core::Depends::eval_cpu()'],['../classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49',1,'mlx::core::Divide::eval_cpu()'],['../classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3',1,'mlx::core::DivMod::eval_cpu()'],['../classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2',1,'mlx::core::Select::eval_cpu()'],['../classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc',1,'mlx::co
re::Remainder::eval_cpu()'],['../classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454',1,'mlx::core::Equal::eval_cpu()'],['../classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6',1,'mlx::core::Erf::eval_cpu()'],['../classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e',1,'mlx::core::ErfInv::eval_cpu()'],['../classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c',1,'mlx::core::Exp::eval_cpu()'],['../classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a',1,'mlx::core::Expm1::eval_cpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635',1,'mlx::core::FFT::eval_cpu()'],['../classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7',1,'mlx::core::Floor::eval_cpu()'],['../classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c',1,'mlx::core::Full::eval_cpu()'],['../classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290',1,'mlx::core::Gather::eval_cpu()'],['../classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae',1,'mlx::core::Greater::eval_cpu()'],['../classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075',1,'mlx::core::GreaterEqual::eval_cpu()'],['../classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d',1,'mlx::core::Hadamard::eval_cpu()'],['../classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829',1,'mlx::core::Imag::eval_cpu()'],['../classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef',1,'mlx::core::Less::eval_cpu()'],['../classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16',1,'mlx::core::LessEqual::eval_cpu()'],['../classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a',1,'mlx::core::Load::eval_cpu()'],['../classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f',1,'mlx::core::Log::eval_cpu()'],['../classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23',1,'mlx::core::Log1p::eval_cpu()'],['../classmlx_1_1core_1_1_logical_not.html#acf3
f7b3b20ca69533536e0e0a05725b3',1,'mlx::core::LogicalNot::eval_cpu()'],['../classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3',1,'mlx::core::LogicalAnd::eval_cpu()'],['../classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62',1,'mlx::core::LogicalOr::eval_cpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0',1,'mlx::core::LogAddExp::eval_cpu()'],['../classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc',1,'mlx::core::Matmul::eval_cpu()'],['../classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf',1,'mlx::core::Maximum::eval_cpu()'],['../classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e',1,'mlx::core::Minimum::eval_cpu()'],['../classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34',1,'mlx::core::Multiply::eval_cpu()'],['../classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b',1,'mlx::core::Negative::eval_cpu()'],['../classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047',1,'mlx::core::NotEqual::eval_cpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f',1,'mlx::core::NumberOfElements::eval_cpu()'],['../classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb',1,'mlx::core::Pad::eval_cpu()'],['../classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8',1,'mlx::core::Partition::eval_cpu()'],['../classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206',1,'mlx::core::Power::eval_cpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3',1,'mlx::core::QuantizedMatmul::eval_cpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c',1,'mlx::core::GatherQMM::eval_cpu()'],['../classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2',1,'mlx::core::RandomBits::eval_cpu()'],['../classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934',1,'mlx::core::Real::eval_cpu()'],
['../classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f',1,'mlx::core::Reshape::eval_cpu()'],['../classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa',1,'mlx::core::Reduce::eval_cpu()'],['../classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007',1,'mlx::core::Round::eval_cpu()'],['../classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b',1,'mlx::core::Scan::eval_cpu()'],['../classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97',1,'mlx::core::Scatter::eval_cpu()'],['../classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255',1,'mlx::core::Sigmoid::eval_cpu()'],['../classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97',1,'mlx::core::Sign::eval_cpu()'],['../classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5',1,'mlx::core::Sin::eval_cpu()'],['../classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd',1,'mlx::core::Sinh::eval_cpu()'],['../classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2',1,'mlx::core::Slice::eval_cpu()'],['../classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b',1,'mlx::core::SliceUpdate::eval_cpu()'],['../classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79',1,'mlx::core::Softmax::eval_cpu()'],['../classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd',1,'mlx::core::Sort::eval_cpu()'],['../classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4',1,'mlx::core::Split::eval_cpu()'],['../classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59',1,'mlx::core::Square::eval_cpu()'],['../classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5',1,'mlx::core::Sqrt::eval_cpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2',1,'mlx::core::StopGradient::eval_cpu()'],['../classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12',1,'mlx::core::Subtract::eval_cpu()'],['../classmlx_1_1core_1_1_tan.html#a9c9a731158fa60ee
f30067fe0da9f3e9',1,'mlx::core::Tan::eval_cpu()'],['../classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5',1,'mlx::core::Tanh::eval_cpu()'],['../classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f',1,'mlx::core::Uniform::eval_cpu()'],['../classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497',1,'mlx::core::View::eval_cpu()'],['../classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8',1,'mlx::core::Transpose::eval_cpu()'],['../classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2',1,'mlx::core::QRF::eval_cpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6',1,'mlx::core::SVD::eval_cpu()'],['../classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81',1,'mlx::core::Inverse::eval_cpu()'],['../classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5',1,'mlx::core::Cholesky::eval_cpu()'],['../classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be',1,'mlx::core::Eigh::eval_cpu()']]],
-  ['eval_5fgpu_38',['eval_gpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20',1,'mlx::core::distributed::AllReduce::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a',1,'mlx::core::distributed::AllGather::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d',1,'mlx::core::distributed::Send::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e',1,'mlx::core::distributed::Recv::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca',1,'mlx::core::fast::RMSNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560',1,'mlx::core::fast::RMSNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528',1,'mlx::core::fast::LayerNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3',1,'mlx::core::fast::LayerNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2',1,'mlx::core::fast::RoPE::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ad51666e69f670e286293aff96eb435a9',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;out)'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628',1,'mlx::core::fast::AffineQuantize::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db',1,'mlx::core::fast::CustomKernel::eval_gpu()'],['../classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2',1,'mlx::core::Primitive::eval_gpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#a971fe9ad47f6569118879ce1d0f41447',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514',1,'mlx::core::Abs::eval_gpu()'],['../classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d',1,'mlx::core::Add::eval_gpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9',1,'mlx::core::AddMM::eval_gpu()'],['../classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031',1,'mlx::core::Arange::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c',1,'mlx::core::ArcCos::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc',1,'mlx::core::ArcCosh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3',1,'mlx::core::ArcSin::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac',1,'mlx::core::ArcSinh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254',1,'mlx::core::ArcTan::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50',1,'mlx::core::ArcTan2::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d',1,'mlx::core::ArcTanh::eval_gpu()'],
['../classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc',1,'mlx::core::ArgPartition::eval_gpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29',1,'mlx::core::ArgReduce::eval_gpu()'],['../classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709',1,'mlx::core::ArgSort::eval_gpu()'],['../classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b',1,'mlx::core::AsType::eval_gpu()'],['../classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed',1,'mlx::core::AsStrided::eval_gpu()'],['../classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd',1,'mlx::core::BitwiseBinary::eval_gpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9',1,'mlx::core::BlockMaskedMM::eval_gpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1',1,'mlx::core::GatherMM::eval_gpu()'],['../classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe',1,'mlx::core::Broadcast::eval_gpu()'],['../classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887',1,'mlx::core::Ceil::eval_gpu()'],['../classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5',1,'mlx::core::Compiled::eval_gpu()'],['../classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474',1,'mlx::core::Concatenate::eval_gpu()'],['../classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de',1,'mlx::core::Conjugate::eval_gpu()'],['../classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2',1,'mlx::core::Convolution::eval_gpu()'],['../classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1',1,'mlx::core::Copy::eval_gpu()'],['../classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060',1,'mlx::core::Cos::eval_gpu()'],['../classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559',1,'mlx::core::Cosh::eval_gpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877
f6667',1,'mlx::core::CustomTransforms::eval_gpu()'],['../classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28',1,'mlx::core::Depends::eval_gpu()'],['../classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7',1,'mlx::core::Divide::eval_gpu()'],['../classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc',1,'mlx::core::DivMod::eval_gpu()'],['../classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b',1,'mlx::core::Select::eval_gpu()'],['../classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161',1,'mlx::core::Remainder::eval_gpu()'],['../classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c',1,'mlx::core::Equal::eval_gpu()'],['../classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008',1,'mlx::core::Erf::eval_gpu()'],['../classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db',1,'mlx::core::ErfInv::eval_gpu()'],['../classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822',1,'mlx::core::Exp::eval_gpu()'],['../classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f',1,'mlx::core::Expm1::eval_gpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd',1,'mlx::core::FFT::eval_gpu()'],['../classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65',1,'mlx::core::Floor::eval_gpu()'],['../classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872',1,'mlx::core::Full::eval_gpu()'],['../classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8',1,'mlx::core::Gather::eval_gpu()'],['../classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878',1,'mlx::core::Greater::eval_gpu()'],['../classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24',1,'mlx::core::GreaterEqual::eval_gpu()'],['../classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733',1,'mlx::core::Hadamard::eval_gpu()'],['../classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6',1,'mlx::core::Imag::eval_gpu()'],['../
classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917',1,'mlx::core::Less::eval_gpu()'],['../classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac',1,'mlx::core::LessEqual::eval_gpu()'],['../classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d',1,'mlx::core::Load::eval_gpu()'],['../classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390',1,'mlx::core::Log::eval_gpu()'],['../classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431',1,'mlx::core::Log1p::eval_gpu()'],['../classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a',1,'mlx::core::LogicalNot::eval_gpu()'],['../classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f',1,'mlx::core::LogicalAnd::eval_gpu()'],['../classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a',1,'mlx::core::LogicalOr::eval_gpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a',1,'mlx::core::LogAddExp::eval_gpu()'],['../classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7',1,'mlx::core::Matmul::eval_gpu()'],['../classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7',1,'mlx::core::Maximum::eval_gpu()'],['../classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba',1,'mlx::core::Minimum::eval_gpu()'],['../classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0',1,'mlx::core::Multiply::eval_gpu()'],['../classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b',1,'mlx::core::Negative::eval_gpu()'],['../classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2',1,'mlx::core::NotEqual::eval_gpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5',1,'mlx::core::NumberOfElements::eval_gpu()'],['../classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153',1,'mlx::core::Pad::eval_gpu()'],['../classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef',1,'mlx::core::Partition::eval_gpu()'
],['../classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11',1,'mlx::core::Power::eval_gpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3',1,'mlx::core::QuantizedMatmul::eval_gpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887',1,'mlx::core::GatherQMM::eval_gpu()'],['../classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a',1,'mlx::core::RandomBits::eval_gpu()'],['../classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2',1,'mlx::core::Real::eval_gpu()'],['../classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059',1,'mlx::core::Reshape::eval_gpu()'],['../classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f',1,'mlx::core::Reduce::eval_gpu()'],['../classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec',1,'mlx::core::Round::eval_gpu()'],['../classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde',1,'mlx::core::Scan::eval_gpu()'],['../classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678',1,'mlx::core::Scatter::eval_gpu()'],['../classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca',1,'mlx::core::Sigmoid::eval_gpu()'],['../classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b',1,'mlx::core::Sign::eval_gpu()'],['../classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e',1,'mlx::core::Sin::eval_gpu()'],['../classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75',1,'mlx::core::Sinh::eval_gpu()'],['../classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a',1,'mlx::core::Slice::eval_gpu()'],['../classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b',1,'mlx::core::SliceUpdate::eval_gpu()'],['../classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af',1,'mlx::core::Softmax::eval_gpu()'],['../classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382',1,'mlx::core::Sort::eval_gpu()'],['../classmlx_1_1core_1_1_spli
t.html#a78ddda89c4daee73c74cfbc1e44656df',1,'mlx::core::Split::eval_gpu()'],['../classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045',1,'mlx::core::Square::eval_gpu()'],['../classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501',1,'mlx::core::Sqrt::eval_gpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89',1,'mlx::core::StopGradient::eval_gpu()'],['../classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c',1,'mlx::core::Subtract::eval_gpu()'],['../classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f',1,'mlx::core::Tan::eval_gpu()'],['../classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761',1,'mlx::core::Tanh::eval_gpu()'],['../classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0',1,'mlx::core::Uniform::eval_gpu()'],['../classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075',1,'mlx::core::View::eval_gpu()'],['../classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e',1,'mlx::core::Transpose::eval_gpu()'],['../classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9',1,'mlx::core::QRF::eval_gpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83',1,'mlx::core::SVD::eval_gpu()'],['../classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2',1,'mlx::core::Inverse::eval_gpu()'],['../classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795',1,'mlx::core::Cholesky::eval_gpu()'],['../classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2',1,'mlx::core::Eigh::eval_gpu()']]],
+  ['eval_5fcpu_37',['eval_cpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e',1,'mlx::core::distributed::AllReduce::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5',1,'mlx::core::distributed::AllGather::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051',1,'mlx::core::distributed::Send::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a',1,'mlx::core::distributed::Recv::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f',1,'mlx::core::fast::RMSNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439',1,'mlx::core::fast::RMSNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05',1,'mlx::core::fast::LayerNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b',1,'mlx::core::fast::LayerNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e',1,'mlx::core::fast::RoPE::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328',1,'mlx::core::fast::ScaledDotProductAttention::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd',1,'mlx::core::fast::AffineQuantize::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad',1,'mlx::core::fast::CustomKernel::eval_cpu()'],['../classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575',1,'mlx::core::Primitive::eval_cpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#aa0ed6e32c36200a3ff9bc592c9b300db',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60',1,'mlx::core::Abs::eval_cpu()'],['../classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f',1,'mlx::core::Add::eval_cpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c',1,'mlx::core::AddMM::eval_cpu()'],['../classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1',1,'mlx::core::Arange::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006',1,'mlx::core::ArcCos::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9',1,'mlx::core::ArcCosh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4',1,'mlx::core::ArcSin::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066',1,'mlx::core::ArcSinh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3',1,'mlx::core::ArcTan::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c',1,'mlx::core::ArcTan2::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd',1,'mlx::core::ArcTanh::eval_cpu()'],['../classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828',1,'mlx::core::ArgPartition::eval_cpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287',1,'mlx::core::ArgReduce::eval_cpu()'],['../classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa',1,'mlx::core::ArgSort::eval_cpu()'],['../classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d',1,'mlx::core::AsType::eval_cpu()'],['../classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193',1,'mlx::core::AsStrided::eval_cpu()'],['
../classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283',1,'mlx::core::BitwiseBinary::eval_cpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2',1,'mlx::core::BlockMaskedMM::eval_cpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730',1,'mlx::core::GatherMM::eval_cpu()'],['../classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780',1,'mlx::core::Broadcast::eval_cpu()'],['../classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035',1,'mlx::core::Ceil::eval_cpu()'],['../classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151',1,'mlx::core::Compiled::eval_cpu()'],['../classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258',1,'mlx::core::Concatenate::eval_cpu()'],['../classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61',1,'mlx::core::Conjugate::eval_cpu()'],['../classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336',1,'mlx::core::Contiguous::eval_cpu()'],['../classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b',1,'mlx::core::Convolution::eval_cpu()'],['../classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c',1,'mlx::core::Copy::eval_cpu()'],['../classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152',1,'mlx::core::Cos::eval_cpu()'],['../classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d',1,'mlx::core::Cosh::eval_cpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184',1,'mlx::core::CustomTransforms::eval_cpu()'],['../classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e',1,'mlx::core::Depends::eval_cpu()'],['../classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49',1,'mlx::core::Divide::eval_cpu()'],['../classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3',1,'mlx::core::DivMod::eval_cpu()'],['../classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2',1,'ml
x::core::Select::eval_cpu()'],['../classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc',1,'mlx::core::Remainder::eval_cpu()'],['../classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454',1,'mlx::core::Equal::eval_cpu()'],['../classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6',1,'mlx::core::Erf::eval_cpu()'],['../classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e',1,'mlx::core::ErfInv::eval_cpu()'],['../classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c',1,'mlx::core::Exp::eval_cpu()'],['../classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a',1,'mlx::core::Expm1::eval_cpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635',1,'mlx::core::FFT::eval_cpu()'],['../classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7',1,'mlx::core::Floor::eval_cpu()'],['../classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c',1,'mlx::core::Full::eval_cpu()'],['../classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290',1,'mlx::core::Gather::eval_cpu()'],['../classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae',1,'mlx::core::Greater::eval_cpu()'],['../classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075',1,'mlx::core::GreaterEqual::eval_cpu()'],['../classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d',1,'mlx::core::Hadamard::eval_cpu()'],['../classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829',1,'mlx::core::Imag::eval_cpu()'],['../classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef',1,'mlx::core::Less::eval_cpu()'],['../classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16',1,'mlx::core::LessEqual::eval_cpu()'],['../classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a',1,'mlx::core::Load::eval_cpu()'],['../classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f',1,'mlx::core::Log::eval_cpu()'],['../classmlx_1_1core_1_1_log1p.html#
a8192e5438de99c4cda056987935cba23',1,'mlx::core::Log1p::eval_cpu()'],['../classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3',1,'mlx::core::LogicalNot::eval_cpu()'],['../classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3',1,'mlx::core::LogicalAnd::eval_cpu()'],['../classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62',1,'mlx::core::LogicalOr::eval_cpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0',1,'mlx::core::LogAddExp::eval_cpu()'],['../classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc',1,'mlx::core::Matmul::eval_cpu()'],['../classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf',1,'mlx::core::Maximum::eval_cpu()'],['../classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e',1,'mlx::core::Minimum::eval_cpu()'],['../classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34',1,'mlx::core::Multiply::eval_cpu()'],['../classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b',1,'mlx::core::Negative::eval_cpu()'],['../classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047',1,'mlx::core::NotEqual::eval_cpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f',1,'mlx::core::NumberOfElements::eval_cpu()'],['../classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb',1,'mlx::core::Pad::eval_cpu()'],['../classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8',1,'mlx::core::Partition::eval_cpu()'],['../classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206',1,'mlx::core::Power::eval_cpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3',1,'mlx::core::QuantizedMatmul::eval_cpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c',1,'mlx::core::GatherQMM::eval_cpu()'],['../classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2',1,'mlx::core::RandomBits::e
val_cpu()'],['../classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934',1,'mlx::core::Real::eval_cpu()'],['../classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f',1,'mlx::core::Reshape::eval_cpu()'],['../classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa',1,'mlx::core::Reduce::eval_cpu()'],['../classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007',1,'mlx::core::Round::eval_cpu()'],['../classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b',1,'mlx::core::Scan::eval_cpu()'],['../classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97',1,'mlx::core::Scatter::eval_cpu()'],['../classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255',1,'mlx::core::Sigmoid::eval_cpu()'],['../classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97',1,'mlx::core::Sign::eval_cpu()'],['../classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5',1,'mlx::core::Sin::eval_cpu()'],['../classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd',1,'mlx::core::Sinh::eval_cpu()'],['../classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2',1,'mlx::core::Slice::eval_cpu()'],['../classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b',1,'mlx::core::SliceUpdate::eval_cpu()'],['../classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79',1,'mlx::core::Softmax::eval_cpu()'],['../classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd',1,'mlx::core::Sort::eval_cpu()'],['../classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4',1,'mlx::core::Split::eval_cpu()'],['../classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59',1,'mlx::core::Square::eval_cpu()'],['../classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5',1,'mlx::core::Sqrt::eval_cpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2',1,'mlx::core::StopGradient::eval_cpu()'],['../classmlx_1_1core_1_1_subtract.html#a4757425
8b6c95f8ad260c114d6d36a12',1,'mlx::core::Subtract::eval_cpu()'],['../classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9',1,'mlx::core::Tan::eval_cpu()'],['../classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5',1,'mlx::core::Tanh::eval_cpu()'],['../classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f',1,'mlx::core::Uniform::eval_cpu()'],['../classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497',1,'mlx::core::View::eval_cpu()'],['../classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8',1,'mlx::core::Transpose::eval_cpu()'],['../classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2',1,'mlx::core::QRF::eval_cpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6',1,'mlx::core::SVD::eval_cpu()'],['../classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81',1,'mlx::core::Inverse::eval_cpu()'],['../classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5',1,'mlx::core::Cholesky::eval_cpu()'],['../classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be',1,'mlx::core::Eigh::eval_cpu()']]],
+  ['eval_5fgpu_38',['eval_gpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20',1,'mlx::core::distributed::AllReduce::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a',1,'mlx::core::distributed::AllGather::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d',1,'mlx::core::distributed::Send::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e',1,'mlx::core::distributed::Recv::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca',1,'mlx::core::fast::RMSNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560',1,'mlx::core::fast::RMSNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528',1,'mlx::core::fast::LayerNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3',1,'mlx::core::fast::LayerNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2',1,'mlx::core::fast::RoPE::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ad51666e69f670e286293aff96eb435a9',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;out)'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628',1,'mlx::core::fast::AffineQuantize::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db',1,'mlx::core::fast::CustomKernel::eval_gpu()'],['../classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2',1,'mlx::core::Primitive::eval_gpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#a971fe9ad47f6569118879ce1d0f41447',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514',1,'mlx::core::Abs::eval_gpu()'],['../classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d',1,'mlx::core::Add::eval_gpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9',1,'mlx::core::AddMM::eval_gpu()'],['../classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031',1,'mlx::core::Arange::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c',1,'mlx::core::ArcCos::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc',1,'mlx::core::ArcCosh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3',1,'mlx::core::ArcSin::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac',1,'mlx::core::ArcSinh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254',1,'mlx::core::ArcTan::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50',1,'mlx::core::ArcTan2::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d',1,'mlx::core::ArcTanh::eval_gpu()'],
['../classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc',1,'mlx::core::ArgPartition::eval_gpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29',1,'mlx::core::ArgReduce::eval_gpu()'],['../classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709',1,'mlx::core::ArgSort::eval_gpu()'],['../classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b',1,'mlx::core::AsType::eval_gpu()'],['../classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed',1,'mlx::core::AsStrided::eval_gpu()'],['../classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd',1,'mlx::core::BitwiseBinary::eval_gpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9',1,'mlx::core::BlockMaskedMM::eval_gpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1',1,'mlx::core::GatherMM::eval_gpu()'],['../classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe',1,'mlx::core::Broadcast::eval_gpu()'],['../classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887',1,'mlx::core::Ceil::eval_gpu()'],['../classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5',1,'mlx::core::Compiled::eval_gpu()'],['../classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474',1,'mlx::core::Concatenate::eval_gpu()'],['../classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de',1,'mlx::core::Conjugate::eval_gpu()'],['../classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f',1,'mlx::core::Contiguous::eval_gpu()'],['../classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2',1,'mlx::core::Convolution::eval_gpu()'],['../classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1',1,'mlx::core::Copy::eval_gpu()'],['../classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060',1,'mlx::core::Cos::eval_gpu()'],['../classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f3
2559',1,'mlx::core::Cosh::eval_gpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667',1,'mlx::core::CustomTransforms::eval_gpu()'],['../classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28',1,'mlx::core::Depends::eval_gpu()'],['../classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7',1,'mlx::core::Divide::eval_gpu()'],['../classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc',1,'mlx::core::DivMod::eval_gpu()'],['../classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b',1,'mlx::core::Select::eval_gpu()'],['../classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161',1,'mlx::core::Remainder::eval_gpu()'],['../classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c',1,'mlx::core::Equal::eval_gpu()'],['../classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008',1,'mlx::core::Erf::eval_gpu()'],['../classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db',1,'mlx::core::ErfInv::eval_gpu()'],['../classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822',1,'mlx::core::Exp::eval_gpu()'],['../classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f',1,'mlx::core::Expm1::eval_gpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd',1,'mlx::core::FFT::eval_gpu()'],['../classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65',1,'mlx::core::Floor::eval_gpu()'],['../classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872',1,'mlx::core::Full::eval_gpu()'],['../classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8',1,'mlx::core::Gather::eval_gpu()'],['../classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878',1,'mlx::core::Greater::eval_gpu()'],['../classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24',1,'mlx::core::GreaterEqual::eval_gpu()'],['../classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733',1,'mlx::core::Hadamard::eval_g
pu()'],['../classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6',1,'mlx::core::Imag::eval_gpu()'],['../classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917',1,'mlx::core::Less::eval_gpu()'],['../classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac',1,'mlx::core::LessEqual::eval_gpu()'],['../classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d',1,'mlx::core::Load::eval_gpu()'],['../classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390',1,'mlx::core::Log::eval_gpu()'],['../classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431',1,'mlx::core::Log1p::eval_gpu()'],['../classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a',1,'mlx::core::LogicalNot::eval_gpu()'],['../classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f',1,'mlx::core::LogicalAnd::eval_gpu()'],['../classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a',1,'mlx::core::LogicalOr::eval_gpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a',1,'mlx::core::LogAddExp::eval_gpu()'],['../classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7',1,'mlx::core::Matmul::eval_gpu()'],['../classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7',1,'mlx::core::Maximum::eval_gpu()'],['../classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba',1,'mlx::core::Minimum::eval_gpu()'],['../classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0',1,'mlx::core::Multiply::eval_gpu()'],['../classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b',1,'mlx::core::Negative::eval_gpu()'],['../classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2',1,'mlx::core::NotEqual::eval_gpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5',1,'mlx::core::NumberOfElements::eval_gpu()'],['../classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153',1,'mlx::core::Pad::eval_gpu(
)'],['../classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef',1,'mlx::core::Partition::eval_gpu()'],['../classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11',1,'mlx::core::Power::eval_gpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3',1,'mlx::core::QuantizedMatmul::eval_gpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887',1,'mlx::core::GatherQMM::eval_gpu()'],['../classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a',1,'mlx::core::RandomBits::eval_gpu()'],['../classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2',1,'mlx::core::Real::eval_gpu()'],['../classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059',1,'mlx::core::Reshape::eval_gpu()'],['../classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f',1,'mlx::core::Reduce::eval_gpu()'],['../classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec',1,'mlx::core::Round::eval_gpu()'],['../classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde',1,'mlx::core::Scan::eval_gpu()'],['../classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678',1,'mlx::core::Scatter::eval_gpu()'],['../classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca',1,'mlx::core::Sigmoid::eval_gpu()'],['../classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b',1,'mlx::core::Sign::eval_gpu()'],['../classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e',1,'mlx::core::Sin::eval_gpu()'],['../classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75',1,'mlx::core::Sinh::eval_gpu()'],['../classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a',1,'mlx::core::Slice::eval_gpu()'],['../classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b',1,'mlx::core::SliceUpdate::eval_gpu()'],['../classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af',1,'mlx::core::Softmax::eval_gpu()'],['../classmlx_1_1c
ore_1_1_sort.html#a4141c48f0e8670c728663f3722675382',1,'mlx::core::Sort::eval_gpu()'],['../classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df',1,'mlx::core::Split::eval_gpu()'],['../classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045',1,'mlx::core::Square::eval_gpu()'],['../classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501',1,'mlx::core::Sqrt::eval_gpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89',1,'mlx::core::StopGradient::eval_gpu()'],['../classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c',1,'mlx::core::Subtract::eval_gpu()'],['../classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f',1,'mlx::core::Tan::eval_gpu()'],['../classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761',1,'mlx::core::Tanh::eval_gpu()'],['../classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0',1,'mlx::core::Uniform::eval_gpu()'],['../classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075',1,'mlx::core::View::eval_gpu()'],['../classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e',1,'mlx::core::Transpose::eval_gpu()'],['../classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9',1,'mlx::core::QRF::eval_gpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83',1,'mlx::core::SVD::eval_gpu()'],['../classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2',1,'mlx::core::Inverse::eval_gpu()'],['../classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795',1,'mlx::core::Cholesky::eval_gpu()'],['../classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2',1,'mlx::core::Eigh::eval_gpu()']]],
   ['evaluated_39',['evaluated',['../classmlx_1_1core_1_1array.html#a199726612fa8a4bcd5c2d05eadad7078a6fc3d7595445dd877584495f47535268',1,'mlx::core::array']]],
   ['event_40',['Event',['../classmlx_1_1core_1_1_event.html',1,'mlx::core::Event'],['../classmlx_1_1core_1_1_event.html#a833506419b2110ad1abd89b2dd238b4d',1,'mlx::core::Event::Event()=default'],['../classmlx_1_1core_1_1_event.html#a13e4835f2ffb2cc22e29148a448ea184',1,'mlx::core::Event::Event(const Stream &amp;steam)']]],
   ['event_41',['event',['../classmlx_1_1core_1_1array.html#a0a8e4d6e67e739a712876bb36f88f9bf',1,'mlx::core::array']]],
@@ -61,5 +61,6 @@ var searchData=
   ['expm1f_2eh_58',['expm1f.h',['../expm1f_8h.html',1,'']]],
   ['expm1f_5fscaled_5funchecked_59',['expm1f_scaled_unchecked',['../expm1f_8h.html#adf20e03405fba634ca8d01acac24592e',1,'expm1f.h']]],
   ['export_5fto_5fdot_60',['export_to_dot',['../namespacemlx_1_1core.html#a57395bdf43d9c5c134e610c169222cca',1,'mlx::core::export_to_dot(std::ostream &amp;os, const std::vector&lt; array &gt; &amp;outputs)'],['../namespacemlx_1_1core.html#a839f94dbad44f0d37333006fc876b42e',1,'mlx::core::export_to_dot(std::ostream &amp;os, Arrays &amp;&amp;... outputs)']]],
-  ['eye_61',['eye',['../group__ops.html#ga45e9e68246b0d1cf03c3cc9c9e7e6ae3',1,'mlx::core::eye(int n, int m, int k, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga2c9011310a1fa7c82f942f54102c36dd',1,'mlx::core::eye(int n, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga61657db78ef35d41112d362c869c25d2',1,'mlx::core::eye(int n, int m, StreamOrDevice s={})'],['../group__ops.html#ga908a15b42834be498a46856c99dfc779',1,'mlx::core::eye(int n, int m, int k, StreamOrDevice s={})'],['../group__ops.html#gab777fcf6d4a89172c69ec3492548dc0f',1,'mlx::core::eye(int n, StreamOrDevice s={})']]]
+  ['expsubop_61',['ExpSubOp',['../struct_exp_sub_op.html',1,'']]],
+  ['eye_62',['eye',['../group__ops.html#ga45e9e68246b0d1cf03c3cc9c9e7e6ae3',1,'mlx::core::eye(int n, int m, int k, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga2c9011310a1fa7c82f942f54102c36dd',1,'mlx::core::eye(int n, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga61657db78ef35d41112d362c869c25d2',1,'mlx::core::eye(int n, int m, StreamOrDevice s={})'],['../group__ops.html#ga908a15b42834be498a46856c99dfc779',1,'mlx::core::eye(int n, int m, int k, StreamOrDevice s={})'],['../group__ops.html#gab777fcf6d4a89172c69ec3492548dc0f',1,'mlx::core::eye(int n, StreamOrDevice s={})']]]
 ];
diff --git a/docs/build/html/search/all_6.js b/docs/build/html/search/all_6.js
index 76b0e44f0..c0265cd59 100644
--- a/docs/build/html/search/all_6.js
+++ b/docs/build/html/search/all_6.js
@@ -34,7 +34,7 @@ var searchData=
   ['float16_31',['float16',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daa098e7844282e240fdee28a9dac11c1c6',1,'mlx::core::Dtype::float16'],['../namespacemlx_1_1core.html#abf228ee9d8ec48c03bb15adcc4e1f3ec',1,'mlx::core::float16']]],
   ['float16_5ft_32',['float16_t',['../backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773',1,'float16_t:&#160;utils.h'],['../namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52',1,'mlx::core::float16_t']]],
   ['float32_33',['float32',['../structmlx_1_1core_1_1_dtype.html#ade845ef5dcebead13a37fe696436e1daad33ec2b0bbea6d471a4706cea030e1e3',1,'mlx::core::Dtype::float32'],['../namespacemlx_1_1core.html#a6894543b340321193dfb8052c438a319',1,'mlx::core::float32']]],
-  ['float_5fto_5fbfloat_5fbits_34',['float_to_bfloat_bits',['../backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1',1,'bf16.h']]],
+  ['float_5fto_5fbfloat_5fbits_34',['float_to_bfloat_bits',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1',1,'bf16.h']]],
   ['floating_35',['floating',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2da374515b23d6f106696387776a6077d17',1,'mlx::core::Dtype::floating'],['../namespacemlx_1_1core.html#ac9f9ea13cf0661e671569d37d14a128a',1,'mlx::core::floating']]],
   ['floor_36',['Floor',['../struct_floor.html',1,'Floor'],['../structmlx_1_1core_1_1detail_1_1_floor.html',1,'mlx::core::detail::Floor'],['../classmlx_1_1core_1_1_floor.html',1,'mlx::core::Floor'],['../classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340',1,'mlx::core::Floor::Floor()']]],
   ['floor_37',['floor',['../namespacemetal.html#a020790f30c28a9982c4a83deaa258277',1,'metal::floor()'],['../namespacemetal_1_1fast.html#ac012ce1701c2339914f15cce9f2c632f',1,'metal::fast::floor()'],['../namespacemetal_1_1precise.html#a66e02b028e3cecfe7c80773460dc7925',1,'metal::precise::floor()'],['../group__ops.html#ga8d656904aa2690b60955ae745aecfc30',1,'mlx::core::floor(const array &amp;a, StreamOrDevice s={})']]],
@@ -53,8 +53,8 @@ var searchData=
   ['fp16_2eh_50',['fp16.h',['../fp16_8h.html',1,'']]],
   ['fp16_5fbf16_5fbinop_5fhelper_51',['fp16_bf16_binop_helper',['../half__types_8h.html#a1f0d5d395d403bde764fffe4846617f9',1,'half_types.h']]],
   ['fract_52',['fract',['../namespacemetal.html#a6b1c15d251aeaacb1f4338a5e152ae78',1,'metal::fract()'],['../namespacemetal_1_1fast.html#aa8bb448827503e485eb649eb3edb2d4c',1,'metal::fast::fract()'],['../namespacemetal_1_1precise.html#a0f21c19332a90df1a8ff507a813b5757',1,'metal::precise::fract()']]],
-  ['frag_5fat_53',['frag_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4',1,'mlx::steel::MMATile::frag_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485',1,'mlx::steel::MMATile::frag_at(const short i, const short j) const']]],
-  ['frag_5ftype_54',['frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::frag_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef',1,'mlx::steel::MMATile::frag_type']]],
+  ['frag_5fat_53',['frag_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4',1,'mlx::steel::MMATile::frag_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485',1,'mlx::steel::MMATile::frag_at(const short i, const short j) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4',1,'mlx::steel::MMATile::frag_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485',1,'mlx::steel::MMATile::frag_at(const short i, const short j) const']]],
+  ['frag_5ftype_54',['frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::frag_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171',1,'mlx::steel::MMATile::frag_type']]],
   ['free_55',['free',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#ae963d551be646ae0e13df2c16f2beefb',1,'mlx::core::allocator::Allocator::free()'],['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#a84b50d1a3cbffa12c1a6cf0ed8c71079',1,'mlx::core::allocator::CommonAllocator::free()'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a109a0a37fb0b3be381a62dc3b1a54bf0',1,'mlx::core::metal::MetalAllocator::free()'],['../namespacemlx_1_1core_1_1allocator.html#a77f0a1215be242db6485612bcb273af5',1,'mlx::core::allocator::free()']]],
   ['frexp_56',['frexp',['../namespacemetal.html#ac89d4ef524d21a301da6c37dbd95ff9f',1,'metal::frexp()'],['../namespacemetal_1_1fast.html#a23902df22aeaa859ef673a36381387c2',1,'metal::fast::frexp()'],['../namespacemetal_1_1precise.html#a0fbb1624c308b97380f894f92fd858b4',1,'metal::precise::frexp()']]],
   ['full_57',['Full',['../classmlx_1_1core_1_1_full.html',1,'mlx::core::Full'],['../classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6',1,'mlx::core::Full::Full()']]],
diff --git a/docs/build/html/search/all_7.js b/docs/build/html/search/all_7.js
index 50f643bac..379fa3f61 100644
--- a/docs/build/html/search/all_7.js
+++ b/docs/build/html/search/all_7.js
@@ -4,7 +4,7 @@ var searchData=
   ['gather_1',['gather',['../namespacemlx_1_1core_1_1metal.html#a545de371fefba1feec2e70b7e9f4187c',1,'mlx::core::metal::gather()'],['../group__ops.html#gab6e7f655a9ff15350ca5379692f9d444',1,'mlx::core::gather(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const std::vector&lt; int &gt; &amp;axes, const std::vector&lt; int &gt; &amp;slice_sizes, StreamOrDevice s={})'],['../group__ops.html#gadb4337ca5d4f88fe9e7c083bc478158b',1,'mlx::core::gather(const array &amp;a, const array &amp;indices, int axis, const std::vector&lt; int &gt; &amp;slice_sizes, StreamOrDevice s={})']]],
   ['gather_2eh_2',['gather.h',['../gather_8h.html',1,'']]],
   ['gather_5fbias_3',['gather_bias',['../steel__gemm__fused_8h.html#aaaf17233201156be684f858bfd0f1b67',1,'steel_gemm_fused.h']]],
-  ['gather_5fimpl_4',['gather_impl',['../gather_8h.html#abdec470e1af0109563ddae3e85e6526c',1,'gather.h']]],
+  ['gather_5fimpl_4',['gather_impl',['../gather_8h.html#a767d7c5be6f2f649101f581449af5599',1,'gather.h']]],
   ['gather_5fkernels_5',['gather_kernels',['../jit_2indexing_8h.html#a1a03318128191891a84707602b57b3cf',1,'indexing.h']]],
   ['gather_5fmm_6',['gather_mm',['../group__ops.html#ga8d50480266d258cac40ff51bcb0fc6a7',1,'mlx::core']]],
   ['gather_5fqmm_7',['gather_qmm',['../group__ops.html#ga368a0dc0e5dfb76922e7aa55a95f12f0',1,'mlx::core']]],
@@ -13,87 +13,87 @@ var searchData=
   ['gemm_10',['gemm',['../namespacemlx_1_1core_1_1metal.html#ac46fd23516a61fc56d997910e4144281',1,'mlx::core::metal::gemm()'],['../steel__gemm__fused_8h.html#aa40dd40b9a0bbf20c8911032ed0c3e6d',1,'gemm(const device T *A, const device T *B, const device T *C, device T *D, const constant GEMMParams *params, const constant GEMMAddMMParams *addmm_params, const constant int *batch_shape, const constant size_t *batch_strides, const constant uint32_t *lhs_indices, const constant uint32_t *rhs_indices, const constant uint32_t *C_indices, const constant int *operand_shape, const constant size_t *operand_strides, const constant packed_int3 &amp;operand_batch_ndim, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_fused.h']]],
   ['gemm_2eh_11',['gemm.h',['../gemm_8h.html',1,'']]],
   ['gemm_5fk_5fiterations_12',['gemm_k_iterations',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a8b50863e4e2d3481c154be6c3629bf51',1,'mlx::steel::ImplicitGemmConv2DParams']]],
-  ['gemm_5fk_5fiterations_5faligned_13',['gemm_k_iterations_aligned',['../struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2',1,'MLXFastAttentionParams::gemm_k_iterations_aligned'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9',1,'mlx::steel::GEMMParams::gemm_k_iterations_aligned'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998',1,'mlx::steel::GEMMSpiltKParams::gemm_k_iterations_aligned']]],
-  ['gemm_5floop_14',['gemm_loop',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780',1,'mlx::steel::GEMMKernel']]],
-  ['gemm_5fn_5fiterations_5faligned_15',['gemm_n_iterations_aligned',['../struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803',1,'MLXFastAttentionParams']]],
-  ['gemm_5fparams_16',['gemm_params',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ab0724eb3ef52ee773b6607f6433b9f2c',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#acc778b3c0b7ec38a43e8ea943df8704c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af59f9d356c4c3ec5627dc5a263d239d4',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::gemm_params']]],
-  ['gemm_5fsplitk_17',['gemm_splitk',['../steel__gemm__splitk_8h.html#a3be6e095a0a026d3ecf57a3e67f76188',1,'steel_gemm_splitk.h']]],
-  ['gemm_5fsplitk_5faccum_18',['gemm_splitk_accum',['../steel__gemm__splitk_8h.html#abeb921bf1dc7941125188ddd390b0907',1,'steel_gemm_splitk.h']]],
-  ['gemm_5fsplitk_5faccum_5faxpby_19',['gemm_splitk_accum_axpby',['../steel__gemm__splitk_8h.html#acc33fdfaaf3eb3a0629b3d52c7043dc1',1,'steel_gemm_splitk.h']]],
-  ['gemm_5fsv_5fm_5fblock_5fiterations_20',['gemm_sv_m_block_iterations',['../struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c',1,'MLXFastAttentionParams']]],
-  ['gemmaddmmparams_21',['GEMMAddMMParams',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html',1,'mlx::steel']]],
-  ['gemmkernel_22',['GEMMKernel',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html',1,'mlx::steel']]],
-  ['gemmparams_23',['GEMMParams',['../structmlx_1_1steel_1_1_g_e_m_m_params.html',1,'mlx::steel']]],
-  ['gemmspiltkparams_24',['GEMMSpiltKParams',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html',1,'mlx::steel']]],
-  ['gemv_5fmasked_25',['gemv_masked',['../namespacemlx_1_1core_1_1metal.html#abc055b75e6a059618f279c35f8de36e7',1,'mlx::core::metal::gemv_masked()'],['../kernels_2gemv__masked_8h.html#ab3070d14cdecb1dd7dc220a551da6b7b',1,'gemv_masked(const device T *mat, const device T *in_vec, device T *out_vec, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;marix_ld, const constant int &amp;batch_ndim, const constant int *batch_shape, const constant size_t *vector_batch_stride, const constant size_t *matrix_batch_stride, const device out_mask_t *out_mask, const device op_mask_t *mat_mask, const device op_mask_t *vec_mask, const constant int *mask_strides, const constant size_t *mask_batch_strides, uint3 tid, uint3 lid, uint simd_gid, uint simd_lid):&#160;gemv_masked.h']]],
-  ['gemv_5fmasked_2eh_26',['gemv_masked.h',['../jit_2gemv__masked_8h.html',1,'(Global Namespace)'],['../kernels_2gemv__masked_8h.html',1,'(Global Namespace)']]],
-  ['gemv_5fmasked_5fkernel_27',['gemv_masked_kernel',['../jit_2gemv__masked_8h.html#a933f06c211f86c37673dee329ed6901f',1,'gemv_masked.h']]],
-  ['gemv_5ft_5fmasked_28',['gemv_t_masked',['../kernels_2gemv__masked_8h.html#a0c8d353fc453e448b2d0ed9a19431b63',1,'gemv_masked.h']]],
-  ['gemvkernel_29',['GEMVKernel',['../struct_g_e_m_v_kernel.html',1,'']]],
-  ['gemvtkernel_30',['GEMVTKernel',['../struct_g_e_m_v_t_kernel.html',1,'']]],
-  ['general_31',['General',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337a0db377921f4ce762c62526131097968f',1,'mlx::core']]],
-  ['general_5fc2r_32',['general_c2r',['../namespacepocketfft_1_1detail.html#ac8ee38e8d8bcda875c99eeaf567550fc',1,'pocketfft::detail']]],
-  ['general_5fnd_33',['general_nd',['../namespacepocketfft_1_1detail.html#ab47f52551920af5eb9f57fbbded0f4f0',1,'pocketfft::detail']]],
-  ['general_5fr2c_34',['general_r2c',['../namespacepocketfft_1_1detail.html#a055a39b0a337ca12217717196eb92fed',1,'pocketfft::detail']]],
-  ['generalcontiguousreduce_35',['GeneralContiguousReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65a540cf31fe6858115a02e789938297cdb',1,'mlx::core']]],
-  ['generalgeneral_36',['GeneralGeneral',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337a6fe62e8ce1fae1e70cb9eeaa67d29dab',1,'mlx::core']]],
-  ['generalreduce_37',['GeneralReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65a3622f95ed0ec99657f9ad8ef39ec2184',1,'mlx::core']]],
-  ['generalstridedreduce_38',['GeneralStridedReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ad999b1a8ae1d7436efb5ffdfafb1dd3d',1,'mlx::core']]],
-  ['generic_39',['generic',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2da3d517f8924ac7fd03699a29d97dc52d9',1,'mlx::core::Dtype::generic'],['../namespacemlx_1_1core.html#a34d69c4d46aa9b2a4a79dba7aba093d2',1,'mlx::core::generic']]],
-  ['get_5f2d_5fgrid_5fdims_40',['get_2d_grid_dims',['../namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c',1,'mlx::core::get_2d_grid_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides)'],['../namespacemlx_1_1core.html#a187b9a932c7b3d67ee42d9d12fcb1bb1',1,'mlx::core::get_2d_grid_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides, size_t divisor)']]],
-  ['get_5factive_5fmemory_41',['get_active_memory',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a7a3ad4e33d57a47474c98e2f88e775d7',1,'mlx::core::metal::MetalAllocator::get_active_memory()'],['../namespacemlx_1_1core_1_1metal.html#a7b75c2639016ac4d350fa6c9da386667',1,'mlx::core::metal::get_active_memory()']]],
-  ['get_5farange_5fkernel_42',['get_arange_kernel',['../namespacemlx_1_1core.html#a76f614e9956a6ca05a9be4db5a483446',1,'mlx::core']]],
-  ['get_5farchitecture_43',['get_architecture',['../classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b',1,'mlx::core::metal::Device']]],
-  ['get_5fbinary_5fkernel_44',['get_binary_kernel',['../namespacemlx_1_1core.html#a4decd4a07d91487e6903f6e3c8b7513a',1,'mlx::core']]],
-  ['get_5fbinary_5ftwo_5fkernel_45',['get_binary_two_kernel',['../namespacemlx_1_1core.html#a4e809746f48e5dcf7fa63215d3f5e33e',1,'mlx::core']]],
-  ['get_5fblock_5fdims_46',['get_block_dims',['../namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15',1,'mlx::core']]],
-  ['get_5fcache_5fmemory_47',['get_cache_memory',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#ad3cabbe638917ca4114eb74dcabe381f',1,'mlx::core::metal::MetalAllocator::get_cache_memory()'],['../namespacemlx_1_1core_1_1metal.html#a43307654f62ed7c58e014be7fb03909c',1,'mlx::core::metal::get_cache_memory()']]],
-  ['get_5fcolocated_5fmtllib_5fpath_48',['get_colocated_mtllib_path',['../namespacemlx_1_1core_1_1metal.html#a5fd6ba2040e53a254b9d71ae7ebd315f',1,'mlx::core::metal']]],
-  ['get_5fcommand_5fbuffer_49',['get_command_buffer',['../classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210',1,'mlx::core::metal::Device']]],
-  ['get_5fcommand_5fbuffer_5fops_50',['get_command_buffer_ops',['../classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8',1,'mlx::core::metal::Device']]],
-  ['get_5fcommand_5fencoder_51',['get_command_encoder',['../classmlx_1_1core_1_1metal_1_1_device.html#affa682ef612def4890f5152f81ffb7e6',1,'mlx::core::metal::Device']]],
-  ['get_5fcoord_52',['get_coord',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
-  ['get_5fcopy_5fkernel_53',['get_copy_kernel',['../namespacemlx_1_1core.html#a05a220cff45f12439fde775983c6df78',1,'mlx::core']]],
-  ['get_5fdefault_5fstream_54',['get_default_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a2366c7b888e433608e203752edc92282',1,'mlx::core::scheduler::Scheduler']]],
-  ['get_5ffft_5fkernel_55',['get_fft_kernel',['../namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f',1,'mlx::core']]],
-  ['get_5fgemv_5fmasked_5fkernel_56',['get_gemv_masked_kernel',['../namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818',1,'mlx::core']]],
-  ['get_5fkernel_57',['get_kernel',['../classmlx_1_1core_1_1metal_1_1_device.html#a6810c4dcbcfbf93fc51d42aa5ff0fc3a',1,'mlx::core::metal::Device::get_kernel(const std::string &amp;base_name, MTL::Library *mtl_lib, const std::string &amp;hash_name=&quot;&quot;, const MTLFCList &amp;func_consts={}, const std::vector&lt; MTL::Function * &gt; &amp;linked_functions={})'],['../classmlx_1_1core_1_1metal_1_1_device.html#afa0cac9d800c21a8a7f6cb224256abaf',1,'mlx::core::metal::Device::get_kernel(const std::string &amp;base_name, const std::string &amp;lib_name=&quot;mlx&quot;, const std::string &amp;hash_name=&quot;&quot;, const MTLFCList &amp;func_consts={}, const std::vector&lt; MTL::Function * &gt; &amp;linked_functions={})']]],
-  ['get_5fkernel_5fpreamble_58',['get_kernel_preamble',['../compiled__preamble_8h.html#a1dfa17a0369fb90ff615c7461f5013f3',1,'compiled_preamble.h']]],
-  ['get_5flibrary_59',['get_library',['../classmlx_1_1core_1_1metal_1_1_device.html#a75ed55e73baf48013028796518723ff0',1,'mlx::core::metal::Device']]],
-  ['get_5fmb_5fsort_5fkernel_60',['get_mb_sort_kernel',['../namespacemlx_1_1core.html#afb57825bb763050cc9a9d194aa41ac36',1,'mlx::core']]],
-  ['get_5fname_61',['get_name',['../structmlx_1_1core_1_1_node_namer.html#a1690dd38de288c0aee2bb53156eb770e',1,'mlx::core::NodeNamer']]],
-  ['get_5fpeak_5fmemory_62',['get_peak_memory',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#ac7972a3fe58e69489de775a0f152da17',1,'mlx::core::metal::MetalAllocator::get_peak_memory()'],['../namespacemlx_1_1core_1_1metal.html#a4b67d680cefa95f0ed5801f0e14e48ce',1,'mlx::core::metal::get_peak_memory()']]],
-  ['get_5fplan_63',['get_plan',['../namespacepocketfft_1_1detail.html#ab24cdb6118901f4d3c8df06ef0f8390b',1,'pocketfft::detail']]],
-  ['get_5fpool_64',['get_pool',['../namespacepocketfft_1_1detail_1_1threading.html#a7ec2b3f99232bd0f15f7b022c59d139a',1,'pocketfft::detail::threading']]],
-  ['get_5fprimitive_5fstring_65',['get_primitive_string',['../namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60',1,'mlx::core']]],
-  ['get_5fquantized_5fkernel_66',['get_quantized_kernel',['../namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e',1,'mlx::core']]],
-  ['get_5freduce_5finit_5fkernel_67',['get_reduce_init_kernel',['../namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647',1,'mlx::core']]],
-  ['get_5freduce_5fkernel_68',['get_reduce_kernel',['../namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b',1,'mlx::core']]],
-  ['get_5freduction_5fplan_69',['get_reduction_plan',['../namespacemlx_1_1core.html#ac97b5a6f009ca3d99854ce9512c20dba',1,'mlx::core']]],
-  ['get_5fscan_5fkernel_70',['get_scan_kernel',['../namespacemlx_1_1core.html#aeefaff208444d3fa61ecc0946fe1de5f',1,'mlx::core']]],
-  ['get_5fshape_71',['get_shape',['../namespacemlx_1_1core.html#aab0d8a256957984acc1e3615c65c898e',1,'mlx::core']]],
-  ['get_5fsoftmax_5fkernel_72',['get_softmax_kernel',['../namespacemlx_1_1core.html#a35a412f688d79eb47e42d20a7c8650ee',1,'mlx::core']]],
-  ['get_5fsort_5fkernel_73',['get_sort_kernel',['../namespacemlx_1_1core.html#a84ebe6275218070f0ea320f126f64e22',1,'mlx::core']]],
-  ['get_5fsteel_5fconv_5fgeneral_5fkernel_74',['get_steel_conv_general_kernel',['../namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d',1,'mlx::core']]],
-  ['get_5fsteel_5fconv_5fkernel_75',['get_steel_conv_kernel',['../namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4',1,'mlx::core']]],
-  ['get_5fsteel_5fgemm_5ffused_5fkernel_76',['get_steel_gemm_fused_kernel',['../namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b',1,'mlx::core']]],
-  ['get_5fsteel_5fgemm_5fmasked_5fkernel_77',['get_steel_gemm_masked_kernel',['../namespacemlx_1_1core.html#ab5f60614e965144b451930fdf935e08d',1,'mlx::core']]],
-  ['get_5fsteel_5fgemm_5fsplitk_5faccum_5fkernel_78',['get_steel_gemm_splitk_accum_kernel',['../namespacemlx_1_1core.html#a195b86cad5bb99aa1bcd23952305af6b',1,'mlx::core']]],
-  ['get_5fsteel_5fgemm_5fsplitk_5fkernel_79',['get_steel_gemm_splitk_kernel',['../namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5',1,'mlx::core']]],
-  ['get_5ftemplate_5fdefinition_80',['get_template_definition',['../namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032',1,'mlx::core']]],
-  ['get_5fternary_5fkernel_81',['get_ternary_kernel',['../namespacemlx_1_1core.html#a54eb3b65375022428aab5f810e40624b',1,'mlx::core']]],
-  ['get_5ftwiddle_82',['get_twiddle',['../radix_8h.html#ac5cf950316b9445296ee9ecfc56a56bd',1,'radix.h']]],
-  ['get_5ftype_5fstring_83',['get_type_string',['../namespacemlx_1_1core.html#af776fd91dd60594dcfebbafd17f19068',1,'mlx::core']]],
-  ['get_5funary_5fkernel_84',['get_unary_kernel',['../namespacemlx_1_1core.html#afbb085188b563a54606d84f87a9bf5a6',1,'mlx::core']]],
-  ['gguf_2eh_85',['gguf.h',['../gguf_8h.html',1,'']]],
-  ['gguf_5fload_5fquantized_86',['gguf_load_quantized',['../namespacemlx_1_1core.html#a65dd68163bdaef3631e3724327782498',1,'mlx::core']]],
-  ['ggufload_87',['GGUFLoad',['../namespacemlx_1_1core.html#aa5b0f7f13a941e1f41c411194e9033c7',1,'mlx::core']]],
-  ['ggufmetadata_88',['GGUFMetaData',['../namespacemlx_1_1core.html#a8c2c1b9a37aadfb48f4c3a7e806e32e3',1,'mlx::core']]],
-  ['global_5fformatter_89',['global_formatter',['../namespacemlx_1_1core.html#af5a408a78cc934717dd711ddfda58ea6',1,'mlx::core']]],
-  ['good_90',['good',['../classmlx_1_1core_1_1io_1_1_reader.html#a005d0b52c1f34866f7412b7f41dabec3',1,'mlx::core::io::Reader::good()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a0b050c2c27487007e250e2e19560ffe4',1,'mlx::core::io::Writer::good()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#ac54a2c693acc3d9e6e942412148ffcc9',1,'mlx::core::io::ParallelFileReader::good()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9ec4934b26fb358d699ddce1482b2d54',1,'mlx::core::io::FileWriter::good()']]],
-  ['good_5fsize_5fcmplx_91',['good_size_cmplx',['../structpocketfft_1_1detail_1_1util.html#a758e00d242a1b7eda8f9f0c21f35c624',1,'pocketfft::detail::util']]],
-  ['good_5fsize_5freal_92',['good_size_real',['../structpocketfft_1_1detail_1_1util.html#a173da7d5929ded86fffcebcfdc5086aa',1,'pocketfft::detail::util']]],
-  ['gpu_93',['gpu',['../structmlx_1_1core_1_1_device.html#a45ed081b56ae5d4ddd39c83a5d8a1616',1,'mlx::core::Device::gpu'],['../structmlx_1_1core_1_1_device.html#ac45b3de9b3458d8f31005136cde20fdba0aa0be2a866411d9ff03515227454947',1,'mlx::core::Device::gpu']]],
+  ['gemm_5fk_5fiterations_5faligned_13',['gemm_k_iterations_aligned',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9',1,'mlx::steel::GEMMParams::gemm_k_iterations_aligned'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998',1,'mlx::steel::GEMMSpiltKParams::gemm_k_iterations_aligned']]],
+  ['gemm_5floop_14',['gemm_loop',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780',1,'mlx::steel::GEMMKernel::gemm_loop(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780',1,'mlx::steel::GEMMKernel::gemm_loop(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})']]],
+  ['gemm_5fparams_15',['gemm_params',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ab0724eb3ef52ee773b6607f6433b9f2c',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#acc778b3c0b7ec38a43e8ea943df8704c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af59f9d356c4c3ec5627dc5a263d239d4',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::gemm_params']]],
+  ['gemm_5fsplitk_16',['gemm_splitk',['../steel__gemm__splitk_8h.html#a3be6e095a0a026d3ecf57a3e67f76188',1,'steel_gemm_splitk.h']]],
+  ['gemm_5fsplitk_5faccum_17',['gemm_splitk_accum',['../steel__gemm__splitk_8h.html#abeb921bf1dc7941125188ddd390b0907',1,'steel_gemm_splitk.h']]],
+  ['gemm_5fsplitk_5faccum_5faxpby_18',['gemm_splitk_accum_axpby',['../steel__gemm__splitk_8h.html#acc33fdfaaf3eb3a0629b3d52c7043dc1',1,'steel_gemm_splitk.h']]],
+  ['gemmaddmmparams_19',['GEMMAddMMParams',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html',1,'mlx::steel']]],
+  ['gemmkernel_20',['GEMMKernel',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html',1,'mlx::steel']]],
+  ['gemmparams_21',['GEMMParams',['../structmlx_1_1steel_1_1_g_e_m_m_params.html',1,'mlx::steel']]],
+  ['gemmspiltkparams_22',['GEMMSpiltKParams',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html',1,'mlx::steel']]],
+  ['gemv_5fmasked_23',['gemv_masked',['../namespacemlx_1_1core_1_1metal.html#abc055b75e6a059618f279c35f8de36e7',1,'mlx::core::metal::gemv_masked()'],['../kernels_2gemv__masked_8h.html#ab3070d14cdecb1dd7dc220a551da6b7b',1,'gemv_masked(const device T *mat, const device T *in_vec, device T *out_vec, const constant int &amp;in_vec_size, const constant int &amp;out_vec_size, const constant int &amp;marix_ld, const constant int &amp;batch_ndim, const constant int *batch_shape, const constant size_t *vector_batch_stride, const constant size_t *matrix_batch_stride, const device out_mask_t *out_mask, const device op_mask_t *mat_mask, const device op_mask_t *vec_mask, const constant int *mask_strides, const constant size_t *mask_batch_strides, uint3 tid, uint3 lid, uint simd_gid, uint simd_lid):&#160;gemv_masked.h']]],
+  ['gemv_5fmasked_2eh_24',['gemv_masked.h',['../jit_2gemv__masked_8h.html',1,'(Global Namespace)'],['../kernels_2gemv__masked_8h.html',1,'(Global Namespace)']]],
+  ['gemv_5fmasked_5fkernel_25',['gemv_masked_kernel',['../jit_2gemv__masked_8h.html#a933f06c211f86c37673dee329ed6901f',1,'gemv_masked.h']]],
+  ['gemv_5ft_5fmasked_26',['gemv_t_masked',['../kernels_2gemv__masked_8h.html#a0c8d353fc453e448b2d0ed9a19431b63',1,'gemv_masked.h']]],
+  ['gemvkernel_27',['GEMVKernel',['../struct_g_e_m_v_kernel.html',1,'']]],
+  ['gemvtkernel_28',['GEMVTKernel',['../struct_g_e_m_v_t_kernel.html',1,'']]],
+  ['general_29',['General',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337a0db377921f4ce762c62526131097968f',1,'mlx::core']]],
+  ['general_5fc2r_30',['general_c2r',['../namespacepocketfft_1_1detail.html#ac8ee38e8d8bcda875c99eeaf567550fc',1,'pocketfft::detail']]],
+  ['general_5fnd_31',['general_nd',['../namespacepocketfft_1_1detail.html#ab47f52551920af5eb9f57fbbded0f4f0',1,'pocketfft::detail']]],
+  ['general_5fr2c_32',['general_r2c',['../namespacepocketfft_1_1detail.html#a055a39b0a337ca12217717196eb92fed',1,'pocketfft::detail']]],
+  ['generalcontiguousreduce_33',['GeneralContiguousReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65a540cf31fe6858115a02e789938297cdb',1,'mlx::core']]],
+  ['generalgeneral_34',['GeneralGeneral',['../namespacemlx_1_1core.html#abd84ff6c5245e4e170b2ef5247594337a6fe62e8ce1fae1e70cb9eeaa67d29dab',1,'mlx::core']]],
+  ['generalreduce_35',['GeneralReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65a3622f95ed0ec99657f9ad8ef39ec2184',1,'mlx::core']]],
+  ['generalstridedreduce_36',['GeneralStridedReduce',['../namespacemlx_1_1core.html#a12412984a1cabfe1189942c898f8fe65ad999b1a8ae1d7436efb5ffdfafb1dd3d',1,'mlx::core']]],
+  ['generic_37',['generic',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2da3d517f8924ac7fd03699a29d97dc52d9',1,'mlx::core::Dtype::generic'],['../namespacemlx_1_1core.html#a34d69c4d46aa9b2a4a79dba7aba093d2',1,'mlx::core::generic']]],
+  ['get_5f2d_5fgrid_5fdims_38',['get_2d_grid_dims',['../namespacemlx_1_1core.html#a8dc169474a51a1f4f761d5752819bd7c',1,'mlx::core::get_2d_grid_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides)'],['../namespacemlx_1_1core.html#a187b9a932c7b3d67ee42d9d12fcb1bb1',1,'mlx::core::get_2d_grid_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides, size_t divisor)']]],
+  ['get_5factive_5fmemory_39',['get_active_memory',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a7a3ad4e33d57a47474c98e2f88e775d7',1,'mlx::core::metal::MetalAllocator::get_active_memory()'],['../namespacemlx_1_1core_1_1metal.html#a7b75c2639016ac4d350fa6c9da386667',1,'mlx::core::metal::get_active_memory()']]],
+  ['get_5farange_5fkernel_40',['get_arange_kernel',['../namespacemlx_1_1core.html#a76f614e9956a6ca05a9be4db5a483446',1,'mlx::core']]],
+  ['get_5farchitecture_41',['get_architecture',['../classmlx_1_1core_1_1metal_1_1_device.html#a65f64dd8bafdc704d871fc5be5e7bc0b',1,'mlx::core::metal::Device']]],
+  ['get_5fbinary_5fkernel_42',['get_binary_kernel',['../namespacemlx_1_1core.html#a4decd4a07d91487e6903f6e3c8b7513a',1,'mlx::core']]],
+  ['get_5fbinary_5ftwo_5fkernel_43',['get_binary_two_kernel',['../namespacemlx_1_1core.html#a4e809746f48e5dcf7fa63215d3f5e33e',1,'mlx::core']]],
+  ['get_5fblock_5fdims_44',['get_block_dims',['../namespacemlx_1_1core.html#a0f0f59d3ffe2d16a684e5fc093302e15',1,'mlx::core']]],
+  ['get_5fcache_5fmemory_45',['get_cache_memory',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#ad3cabbe638917ca4114eb74dcabe381f',1,'mlx::core::metal::MetalAllocator::get_cache_memory()'],['../namespacemlx_1_1core_1_1metal.html#a43307654f62ed7c58e014be7fb03909c',1,'mlx::core::metal::get_cache_memory()']]],
+  ['get_5fcolocated_5fmtllib_5fpath_46',['get_colocated_mtllib_path',['../namespacemlx_1_1core_1_1metal.html#a5fd6ba2040e53a254b9d71ae7ebd315f',1,'mlx::core::metal']]],
+  ['get_5fcommand_5fbuffer_47',['get_command_buffer',['../classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210',1,'mlx::core::metal::Device']]],
+  ['get_5fcommand_5fbuffer_5fops_48',['get_command_buffer_ops',['../classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8',1,'mlx::core::metal::Device']]],
+  ['get_5fcommand_5fencoder_49',['get_command_encoder',['../classmlx_1_1core_1_1metal_1_1_device.html#affa682ef612def4890f5152f81ffb7e6',1,'mlx::core::metal::Device']]],
+  ['get_5fcoord_50',['get_coord',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::get_coord(ushort simd_lane_id)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::get_coord(ushort simd_lane_id)']]],
+  ['get_5fcopy_5fkernel_51',['get_copy_kernel',['../namespacemlx_1_1core.html#a05a220cff45f12439fde775983c6df78',1,'mlx::core']]],
+  ['get_5fdefault_5fstream_52',['get_default_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a2366c7b888e433608e203752edc92282',1,'mlx::core::scheduler::Scheduler']]],
+  ['get_5ffft_5fkernel_53',['get_fft_kernel',['../namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f',1,'mlx::core']]],
+  ['get_5fgemv_5fmasked_5fkernel_54',['get_gemv_masked_kernel',['../namespacemlx_1_1core.html#a90c24e0d0b99b68fad9deefcf4d3e818',1,'mlx::core']]],
+  ['get_5fkernel_55',['get_kernel',['../classmlx_1_1core_1_1metal_1_1_device.html#a6810c4dcbcfbf93fc51d42aa5ff0fc3a',1,'mlx::core::metal::Device::get_kernel(const std::string &amp;base_name, MTL::Library *mtl_lib, const std::string &amp;hash_name=&quot;&quot;, const MTLFCList &amp;func_consts={}, const std::vector&lt; MTL::Function * &gt; &amp;linked_functions={})'],['../classmlx_1_1core_1_1metal_1_1_device.html#afa0cac9d800c21a8a7f6cb224256abaf',1,'mlx::core::metal::Device::get_kernel(const std::string &amp;base_name, const std::string &amp;lib_name=&quot;mlx&quot;, const std::string &amp;hash_name=&quot;&quot;, const MTLFCList &amp;func_consts={}, const std::vector&lt; MTL::Function * &gt; &amp;linked_functions={})']]],
+  ['get_5fkernel_5fpreamble_56',['get_kernel_preamble',['../compiled__preamble_8h.html#a1dfa17a0369fb90ff615c7461f5013f3',1,'compiled_preamble.h']]],
+  ['get_5flibrary_57',['get_library',['../classmlx_1_1core_1_1metal_1_1_device.html#a75ed55e73baf48013028796518723ff0',1,'mlx::core::metal::Device']]],
+  ['get_5fmb_5fsort_5fkernel_58',['get_mb_sort_kernel',['../namespacemlx_1_1core.html#afb57825bb763050cc9a9d194aa41ac36',1,'mlx::core']]],
+  ['get_5fname_59',['get_name',['../structmlx_1_1core_1_1_node_namer.html#a1690dd38de288c0aee2bb53156eb770e',1,'mlx::core::NodeNamer']]],
+  ['get_5fpeak_5fmemory_60',['get_peak_memory',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#ac7972a3fe58e69489de775a0f152da17',1,'mlx::core::metal::MetalAllocator::get_peak_memory()'],['../namespacemlx_1_1core_1_1metal.html#a4b67d680cefa95f0ed5801f0e14e48ce',1,'mlx::core::metal::get_peak_memory()']]],
+  ['get_5fplan_61',['get_plan',['../namespacepocketfft_1_1detail.html#ab24cdb6118901f4d3c8df06ef0f8390b',1,'pocketfft::detail']]],
+  ['get_5fpool_62',['get_pool',['../namespacepocketfft_1_1detail_1_1threading.html#a7ec2b3f99232bd0f15f7b022c59d139a',1,'pocketfft::detail::threading']]],
+  ['get_5fprimitive_5fstring_63',['get_primitive_string',['../namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60',1,'mlx::core']]],
+  ['get_5fquantized_5fkernel_64',['get_quantized_kernel',['../namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e',1,'mlx::core']]],
+  ['get_5freduce_5finit_5fkernel_65',['get_reduce_init_kernel',['../namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299',1,'mlx::core']]],
+  ['get_5freduce_5fkernel_66',['get_reduce_kernel',['../namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49',1,'mlx::core']]],
+  ['get_5freduction_5fplan_67',['get_reduction_plan',['../namespacemlx_1_1core.html#ac97b5a6f009ca3d99854ce9512c20dba',1,'mlx::core']]],
+  ['get_5fscan_5fkernel_68',['get_scan_kernel',['../namespacemlx_1_1core.html#aeefaff208444d3fa61ecc0946fe1de5f',1,'mlx::core']]],
+  ['get_5fshape_69',['get_shape',['../namespacemlx_1_1core.html#aab0d8a256957984acc1e3615c65c898e',1,'mlx::core']]],
+  ['get_5fsoftmax_5fkernel_70',['get_softmax_kernel',['../namespacemlx_1_1core.html#a35a412f688d79eb47e42d20a7c8650ee',1,'mlx::core']]],
+  ['get_5fsort_5fkernel_71',['get_sort_kernel',['../namespacemlx_1_1core.html#a84ebe6275218070f0ea320f126f64e22',1,'mlx::core']]],
+  ['get_5fsteel_5fconv_5fgeneral_5fkernel_72',['get_steel_conv_general_kernel',['../namespacemlx_1_1core.html#abce2b67044ee06a7bbe7a91ec7c8c48d',1,'mlx::core']]],
+  ['get_5fsteel_5fconv_5fkernel_73',['get_steel_conv_kernel',['../namespacemlx_1_1core.html#adce79d220672f5f3c65cc31d145ca9c4',1,'mlx::core']]],
+  ['get_5fsteel_5fgemm_5ffused_5fkernel_74',['get_steel_gemm_fused_kernel',['../namespacemlx_1_1core.html#a84fa8e0aee321a9d614433a0b933103b',1,'mlx::core']]],
+  ['get_5fsteel_5fgemm_5fmasked_5fkernel_75',['get_steel_gemm_masked_kernel',['../namespacemlx_1_1core.html#ab5f60614e965144b451930fdf935e08d',1,'mlx::core']]],
+  ['get_5fsteel_5fgemm_5fsplitk_5faccum_5fkernel_76',['get_steel_gemm_splitk_accum_kernel',['../namespacemlx_1_1core.html#a195b86cad5bb99aa1bcd23952305af6b',1,'mlx::core']]],
+  ['get_5fsteel_5fgemm_5fsplitk_5fkernel_77',['get_steel_gemm_splitk_kernel',['../namespacemlx_1_1core.html#af48c6f2f72b61dbd6766e4f5fea85df5',1,'mlx::core']]],
+  ['get_5ftemplate_5fdefinition_78',['get_template_definition',['../namespacemlx_1_1core.html#aae0d19f0acdef2accd2428fb84c8a032',1,'mlx::core']]],
+  ['get_5fternary_5fkernel_79',['get_ternary_kernel',['../namespacemlx_1_1core.html#a54eb3b65375022428aab5f810e40624b',1,'mlx::core']]],
+  ['get_5ftwiddle_80',['get_twiddle',['../radix_8h.html#ac5cf950316b9445296ee9ecfc56a56bd',1,'radix.h']]],
+  ['get_5ftype_5fstring_81',['get_type_string',['../namespacemlx_1_1core.html#af776fd91dd60594dcfebbafd17f19068',1,'mlx::core']]],
+  ['get_5funary_5fkernel_82',['get_unary_kernel',['../namespacemlx_1_1core.html#afbb085188b563a54606d84f87a9bf5a6',1,'mlx::core']]],
+  ['get_5fvar_83',['get_var',['../namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3',1,'mlx::core::env']]],
+  ['gguf_2eh_84',['gguf.h',['../gguf_8h.html',1,'']]],
+  ['gguf_5fload_5fquantized_85',['gguf_load_quantized',['../namespacemlx_1_1core.html#a65dd68163bdaef3631e3724327782498',1,'mlx::core']]],
+  ['ggufload_86',['GGUFLoad',['../namespacemlx_1_1core.html#aa5b0f7f13a941e1f41c411194e9033c7',1,'mlx::core']]],
+  ['ggufmetadata_87',['GGUFMetaData',['../namespacemlx_1_1core.html#a8c2c1b9a37aadfb48f4c3a7e806e32e3',1,'mlx::core']]],
+  ['global_5fformatter_88',['global_formatter',['../namespacemlx_1_1core.html#af5a408a78cc934717dd711ddfda58ea6',1,'mlx::core']]],
+  ['good_89',['good',['../classmlx_1_1core_1_1io_1_1_reader.html#a005d0b52c1f34866f7412b7f41dabec3',1,'mlx::core::io::Reader::good()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a0b050c2c27487007e250e2e19560ffe4',1,'mlx::core::io::Writer::good()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#ac54a2c693acc3d9e6e942412148ffcc9',1,'mlx::core::io::ParallelFileReader::good()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9ec4934b26fb358d699ddce1482b2d54',1,'mlx::core::io::FileWriter::good()']]],
+  ['good_5fsize_5fcmplx_90',['good_size_cmplx',['../structpocketfft_1_1detail_1_1util.html#a758e00d242a1b7eda8f9f0c21f35c624',1,'pocketfft::detail::util']]],
+  ['good_5fsize_5freal_91',['good_size_real',['../structpocketfft_1_1detail_1_1util.html#a173da7d5929ded86fffcebcfdc5086aa',1,'pocketfft::detail::util']]],
+  ['gpu_92',['gpu',['../structmlx_1_1core_1_1_device.html#a45ed081b56ae5d4ddd39c83a5d8a1616',1,'mlx::core::Device::gpu'],['../structmlx_1_1core_1_1_device.html#ac45b3de9b3458d8f31005136cde20fdba0aa0be2a866411d9ff03515227454947',1,'mlx::core::Device::gpu']]],
+  ['gqa_5ffactor_93',['gqa_factor',['../structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841',1,'mlx::steel::AttnParams']]],
   ['grad_94',['grad',['../namespacemlx_1_1core.html#a3d2b2929ed4636e9e2b86e125b2e57d9',1,'mlx::core::grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;argnums)'],['../namespacemlx_1_1core.html#af482f6c64acd77c57ef5bb4b7be9726c',1,'mlx::core::grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, int argnum=0)'],['../namespacemlx_1_1core.html#a64bc619876b0f8cc81a2637ca81c99f7',1,'mlx::core::grad(const std::function&lt; array(const array &amp;)&gt; &amp;fun)']]],
   ['graph_5futils_2eh_95',['graph_utils.h',['../graph__utils_8h.html',1,'']]],
   ['greater_96',['Greater',['../struct_greater.html',1,'Greater'],['../structmlx_1_1core_1_1detail_1_1_greater.html',1,'mlx::core::detail::Greater'],['../classmlx_1_1core_1_1_greater.html',1,'mlx::core::Greater'],['../classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b',1,'mlx::core::Greater::Greater()']]],
diff --git a/docs/build/html/search/all_8.js b/docs/build/html/search/all_8.js
index 385c5b72f..b27d6beed 100644
--- a/docs/build/html/search/all_8.js
+++ b/docs/build/html/search/all_8.js
@@ -1,27 +1,28 @@
 var searchData=
 [
-  ['h12_0',['h12',['../namespacemlx_1_1core.html#a4beeeec4413be7adcfb14feaa9cf0e2e',1,'mlx::core']]],
-  ['h20_1',['h20',['../namespacemlx_1_1core.html#a862c6b94fec384c34a699ced64d01404',1,'mlx::core']]],
-  ['h28_2',['h28',['../namespacemlx_1_1core.html#ac447ad59592dd06435adca7df37e33ad',1,'mlx::core']]],
-  ['hadamard_3',['Hadamard',['../classmlx_1_1core_1_1_hadamard.html',1,'mlx::core::Hadamard'],['../classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923',1,'mlx::core::Hadamard::Hadamard()']]],
-  ['hadamard_4',['hadamard',['../namespacemlx_1_1core_1_1metal.html#a8bd0072616087cd568c2c804e7114aa9',1,'mlx::core::metal']]],
-  ['hadamard_2eh_5',['hadamard.h',['../common_2hadamard_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2hadamard_8h.html',1,'(Global Namespace)']]],
-  ['hadamard_5fm_6',['hadamard_m',['../metal_2kernels_2hadamard_8h.html#ab0bd478f2051af35aed1869005e3370a',1,'hadamard.h']]],
-  ['hadamard_5fmatrices_7',['hadamard_matrices',['../namespacemlx_1_1core.html#a50214cf406957fab27c8bef32046f030',1,'mlx::core']]],
-  ['hadamard_5fn_8',['hadamard_n',['../metal_2kernels_2hadamard_8h.html#a63c0e8510e555cd065e1f0ddfb33ce18',1,'hadamard.h']]],
-  ['hadamard_5ftransform_9',['hadamard_transform',['../group__ops.html#ga872d2c1806e67ce2596b24d056681074',1,'mlx::core']]],
-  ['half_5fbinop_10',['half_binop',['../fp16_8h.html#af58966694c5d80f9a0241670f3128199',1,'fp16.h']]],
-  ['half_5fbinop_5fbase_11',['half_binop_base',['../fp16_8h.html#a8420acf5d2687fbdafcc9052c949f323',1,'fp16.h']]],
-  ['half_5fbinop_5fhelper_12',['half_binop_helper',['../fp16_8h.html#aa300338c53d5a9f52fbbde8fd777f13d',1,'fp16.h']]],
-  ['half_5fbitop_13',['half_bitop',['../fp16_8h.html#a2242eaa64839925fd8f586dde7a59800',1,'fp16.h']]],
-  ['half_5fcompop_14',['half_compop',['../fp16_8h.html#acec0b85a9974cbde7b270a121f382405',1,'fp16.h']]],
-  ['half_5finplace_5fbitop_15',['half_inplace_bitop',['../fp16_8h.html#a378e011e994bf62a961c3c1cd6f7c290',1,'fp16.h']]],
-  ['half_5finplace_5fop_16',['half_inplace_op',['../fp16_8h.html#a6348c00d31a50b2df1b47d18af49c4b8',1,'fp16.h']]],
-  ['half_5ftypes_2eh_17',['half_types.h',['../half__types_8h.html',1,'']]],
-  ['has_5fbatch_18',['has_batch',['../steel__gemm__fused_8h.html#adffcdc900c19ff97f1523e43f1a5a6cc',1,'steel_gemm_fused.h']]],
-  ['has_5fmul_5foperand_5fmask_19',['has_mul_operand_mask',['../struct_g_e_m_v_kernel.html#ad47223ee49b3cb7bf3746a2cec45f883',1,'GEMVKernel::has_mul_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a8db6f01f96a36b216acd801c34a96ef5',1,'GEMVTKernel::has_mul_operand_mask']]],
-  ['has_5fmul_5foutput_5fmask_20',['has_mul_output_mask',['../struct_g_e_m_v_kernel.html#a0edbf2dd6a6563e7afa6dab6b670615c',1,'GEMVKernel::has_mul_output_mask'],['../struct_g_e_m_v_t_kernel.html#a8eb06f6569e4042e24fee220b11fa10d',1,'GEMVTKernel::has_mul_output_mask']]],
-  ['has_5foperand_5fmask_21',['has_operand_mask',['../struct_g_e_m_v_kernel.html#ab00784dff1512a7b0919fcb4cfa5d50e',1,'GEMVKernel::has_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a6729d6e63e76a1e9c7c8e78d9aac4869',1,'GEMVTKernel::has_operand_mask']]],
-  ['has_5foutput_5fmask_22',['has_output_mask',['../struct_g_e_m_v_kernel.html#ab8b64c94f4c8f6f09c0777415589b487',1,'GEMVKernel::has_output_mask'],['../struct_g_e_m_v_t_kernel.html#aaefdf8f023da255bbb70a0c3e3408626',1,'GEMVTKernel::has_output_mask']]],
-  ['has_5fprimitive_23',['has_primitive',['../classmlx_1_1core_1_1array.html#aa5aceab15241e7826cbaf8b8a41440c1',1,'mlx::core::array']]]
+  ['h_0',['H',['../structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7',1,'mlx::steel::AttnParams']]],
+  ['h12_1',['h12',['../namespacemlx_1_1core.html#a4beeeec4413be7adcfb14feaa9cf0e2e',1,'mlx::core']]],
+  ['h20_2',['h20',['../namespacemlx_1_1core.html#a862c6b94fec384c34a699ced64d01404',1,'mlx::core']]],
+  ['h28_3',['h28',['../namespacemlx_1_1core.html#ac447ad59592dd06435adca7df37e33ad',1,'mlx::core']]],
+  ['hadamard_4',['Hadamard',['../classmlx_1_1core_1_1_hadamard.html',1,'mlx::core::Hadamard'],['../classmlx_1_1core_1_1_hadamard.html#abe4a0ed820b126940beec519d4239923',1,'mlx::core::Hadamard::Hadamard()']]],
+  ['hadamard_5',['hadamard',['../namespacemlx_1_1core_1_1metal.html#a8bd0072616087cd568c2c804e7114aa9',1,'mlx::core::metal']]],
+  ['hadamard_2eh_6',['hadamard.h',['../common_2hadamard_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2hadamard_8h.html',1,'(Global Namespace)']]],
+  ['hadamard_5fm_7',['hadamard_m',['../metal_2kernels_2hadamard_8h.html#ab0bd478f2051af35aed1869005e3370a',1,'hadamard.h']]],
+  ['hadamard_5fmatrices_8',['hadamard_matrices',['../namespacemlx_1_1core.html#a50214cf406957fab27c8bef32046f030',1,'mlx::core']]],
+  ['hadamard_5fn_9',['hadamard_n',['../metal_2kernels_2hadamard_8h.html#a63c0e8510e555cd065e1f0ddfb33ce18',1,'hadamard.h']]],
+  ['hadamard_5ftransform_10',['hadamard_transform',['../group__ops.html#ga872d2c1806e67ce2596b24d056681074',1,'mlx::core']]],
+  ['half_5fbinop_11',['half_binop',['../fp16_8h.html#af58966694c5d80f9a0241670f3128199',1,'fp16.h']]],
+  ['half_5fbinop_5fbase_12',['half_binop_base',['../fp16_8h.html#a8420acf5d2687fbdafcc9052c949f323',1,'fp16.h']]],
+  ['half_5fbinop_5fhelper_13',['half_binop_helper',['../fp16_8h.html#aa300338c53d5a9f52fbbde8fd777f13d',1,'fp16.h']]],
+  ['half_5fbitop_14',['half_bitop',['../fp16_8h.html#a2242eaa64839925fd8f586dde7a59800',1,'fp16.h']]],
+  ['half_5fcompop_15',['half_compop',['../fp16_8h.html#acec0b85a9974cbde7b270a121f382405',1,'fp16.h']]],
+  ['half_5finplace_5fbitop_16',['half_inplace_bitop',['../fp16_8h.html#a378e011e994bf62a961c3c1cd6f7c290',1,'fp16.h']]],
+  ['half_5finplace_5fop_17',['half_inplace_op',['../fp16_8h.html#a6348c00d31a50b2df1b47d18af49c4b8',1,'fp16.h']]],
+  ['half_5ftypes_2eh_18',['half_types.h',['../half__types_8h.html',1,'']]],
+  ['has_5fbatch_19',['has_batch',['../steel__gemm__fused_8h.html#adffcdc900c19ff97f1523e43f1a5a6cc',1,'steel_gemm_fused.h']]],
+  ['has_5fmul_5foperand_5fmask_20',['has_mul_operand_mask',['../struct_g_e_m_v_kernel.html#ad47223ee49b3cb7bf3746a2cec45f883',1,'GEMVKernel::has_mul_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a8db6f01f96a36b216acd801c34a96ef5',1,'GEMVTKernel::has_mul_operand_mask']]],
+  ['has_5fmul_5foutput_5fmask_21',['has_mul_output_mask',['../struct_g_e_m_v_kernel.html#a0edbf2dd6a6563e7afa6dab6b670615c',1,'GEMVKernel::has_mul_output_mask'],['../struct_g_e_m_v_t_kernel.html#a8eb06f6569e4042e24fee220b11fa10d',1,'GEMVTKernel::has_mul_output_mask']]],
+  ['has_5foperand_5fmask_22',['has_operand_mask',['../struct_g_e_m_v_kernel.html#ab00784dff1512a7b0919fcb4cfa5d50e',1,'GEMVKernel::has_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a6729d6e63e76a1e9c7c8e78d9aac4869',1,'GEMVTKernel::has_operand_mask']]],
+  ['has_5foutput_5fmask_23',['has_output_mask',['../struct_g_e_m_v_kernel.html#ab8b64c94f4c8f6f09c0777415589b487',1,'GEMVKernel::has_output_mask'],['../struct_g_e_m_v_t_kernel.html#aaefdf8f023da255bbb70a0c3e3408626',1,'GEMVTKernel::has_output_mask']]],
+  ['has_5fprimitive_24',['has_primitive',['../classmlx_1_1core_1_1array.html#aa5aceab15241e7826cbaf8b8a41440c1',1,'mlx::core::array']]]
 ];
diff --git a/docs/build/html/search/all_9.js b/docs/build/html/search/all_9.js
index 0c49121e5..70e12696f 100644
--- a/docs/build/html/search/all_9.js
+++ b/docs/build/html/search/all_9.js
@@ -18,7 +18,7 @@ var searchData=
   ['in_5ftracing_15',['in_tracing',['../structmlx_1_1core_1_1detail_1_1_in_tracing.html#ac52b8e2c3f808d3076c4e1ebaf9dc63d',1,'mlx::core::detail::InTracing']]],
   ['includes_2eh_16',['includes.h',['../includes_8h.html',1,'']]],
   ['increment_5fcommand_5fbuffer_5fops_17',['increment_command_buffer_ops',['../classmlx_1_1core_1_1metal_1_1_device.html#a7a33d4d601423a3d3c23d5ad7072abb6',1,'mlx::core::metal::Device']]],
-  ['index_18',['index',['../structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a',1,'looped_elem_to_loc::index'],['../structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac',1,'mlx::core::Device::index'],['../structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626',1,'mlx::core::Stream::index']]],
+  ['index_18',['index',['../struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333',1,'LoopedElemToLoc::index'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a2cd3b616739b3d5b41e5b46ae335957d',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::index'],['../structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac',1,'mlx::core::Device::index'],['../structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626',1,'mlx::core::Stream::index']]],
   ['indexing_2eh_19',['indexing.h',['../jit_2indexing_8h.html',1,'(Global Namespace)'],['../kernels_2indexing_8h.html',1,'(Global Namespace)']]],
   ['indices_20',['Indices',['../struct_indices.html',1,'']]],
   ['inexact_21',['inexact',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2dae03b116564cd944b048fde87dbd4d5c9',1,'mlx::core::Dtype::inexact'],['../namespacemlx_1_1core.html#a54c6fae21b7f2fea8e6f80011ef38534',1,'mlx::core::inexact']]],
@@ -26,7 +26,7 @@ var searchData=
   ['init_23',['init',['../struct_cum_prod_3_01bool_01_4.html#ae7a8b0ba9e6898356b87b18766e76d2c',1,'CumProd&lt; bool &gt;::init'],['../struct_cum_max.html#a16480052a2eeb4340e546838aab59cc4',1,'CumMax::init'],['../struct_cum_min.html#a8b67f739c620d0cc194b533190990ab9',1,'CumMin::init'],['../struct_less_than.html#abf97a6b0163048e4ba96460939dbd3a3',1,'LessThan::init'],['../namespacemlx_1_1core_1_1distributed.html#a33633c058c7ec82cca4f237243c6810d',1,'mlx::core::distributed::init()']]],
   ['init_5freduce_24',['init_reduce',['../reduce__init_8h.html#a0088604ac2eaa6940689ff12c4ba5fc2',1,'reduce_init.h']]],
   ['inner_25',['inner',['../group__ops.html#ga654fec16a9746b390916697a2ab2546e',1,'mlx::core']]],
-  ['inner_5flooper_26',['inner_looper',['../structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189',1,'looped_elem_to_loc']]],
+  ['inner_5flooper_26',['inner_looper',['../struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40',1,'LoopedElemToLoc']]],
   ['inp_5fjump_5fc_27',['inp_jump_c',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a78d30e843d65d1829623afb0b607f0a5',1,'mlx::steel::ImplicitGemmConv2DParams']]],
   ['inp_5fjump_5fh_28',['inp_jump_h',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a03685a4066cdb11ffb647408e2c5b122',1,'mlx::steel::ImplicitGemmConv2DParams']]],
   ['inp_5fjump_5fw_29',['inp_jump_w',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#acf168c72f4a86b72b8f5f386f07c9d8c',1,'mlx::steel::ImplicitGemmConv2DParams']]],
@@ -51,42 +51,41 @@ var searchData=
   ['intracing_48',['InTracing',['../structmlx_1_1core_1_1detail_1_1_in_tracing.html',1,'mlx::core::detail::InTracing'],['../structmlx_1_1core_1_1detail_1_1_in_tracing.html#a7a77f19391498afa5dcea3509d241a70',1,'mlx::core::detail::InTracing::InTracing()']]],
   ['inv_49',['inv',['../struct_read_writer.html#a773fa8524515bdc2ff8b0e2060741813',1,'ReadWriter::inv'],['../namespacemlx_1_1core_1_1linalg.html#ad966a0b6bff176c9f933534ed62389a2',1,'mlx::core::linalg::inv()']]],
   ['inv_5f_50',['inv_',['../backend_2metal_2kernels_2fft_8h.html#a7a83318497519ff3ff0141b7d511ed38',1,'fft.h']]],
-  ['inv_5falpha_51',['INV_ALPHA',['../struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644',1,'MLXScaledDotProductAttentionParams']]],
-  ['inverse_52',['Inverse',['../classmlx_1_1core_1_1_inverse.html',1,'mlx::core::Inverse'],['../classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad',1,'mlx::core::Inverse::Inverse()']]],
-  ['io_2eh_53',['io.h',['../io_8h.html',1,'']]],
-  ['iofs_54',['iofs',['../classpocketfft_1_1detail_1_1multi__iter.html#ad33360d4a8ab8e6d72efadc6f9cb5bfa',1,'pocketfft::detail::multi_iter::iofs(size_t i) const'],['../classpocketfft_1_1detail_1_1multi__iter.html#a97462d97bdca6419d8d2f37c2031fe83',1,'pocketfft::detail::multi_iter::iofs(size_t j, size_t i) const']]],
-  ['irfft_55',['irfft',['../namespacemlx_1_1core_1_1fft.html#aafa721d0492e9f74913a6e86b4896ad8',1,'mlx::core::fft::irfft(const array &amp;a, int n, int axis, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#afbd0035a3cf91f428838de1fcf01a3a3',1,'mlx::core::fft::irfft(const array &amp;a, int axis=-1, StreamOrDevice s={})']]],
-  ['irfft2_56',['irfft2',['../namespacemlx_1_1core_1_1fft.html#a35754b00e98d7ef37ce8230c8887a933',1,'mlx::core::fft::irfft2(const array &amp;a, const std::vector&lt; int &gt; &amp;n, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#af7c7bbbbce26c2775a77473502a8de02',1,'mlx::core::fft::irfft2(const array &amp;a, const std::vector&lt; int &gt; &amp;axes={-2, -1}, StreamOrDevice s={})']]],
-  ['irfftn_57',['irfftn',['../namespacemlx_1_1core_1_1fft.html#a33f2973ea1b621e67064e46136d2960f',1,'mlx::core::fft::irfftn(const array &amp;a, const std::vector&lt; int &gt; &amp;n, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#a1c9ad11121c5879d5c04bbde2ee238c3',1,'mlx::core::fft::irfftn(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#aaf5a7ef93b3426b94c2363a23a5a5b36',1,'mlx::core::fft::irfftn(const array &amp;a, StreamOrDevice s={})']]],
-  ['is_58',['iS',['../struct_m_l_x_conv_params.html#a72e1c3b4da0f70622cf18036bbf97fe6',1,'MLXConvParams']]],
-  ['is_5farray_5fv_59',['is_array_v',['../namespacemlx_1_1core.html#a01b0d64a75dfa2e95d6c7b5c53d708af',1,'mlx::core']]],
-  ['is_5farrays_5fv_60',['is_arrays_v',['../namespacemlx_1_1core.html#a94c1057929b390e5613304afa16dfbda',1,'mlx::core']]],
-  ['is_5favailable_61',['is_available',['../classmlx_1_1core_1_1array.html#aebed1f37c19197be76105161102a8a40',1,'mlx::core::array::is_available()'],['../namespacemlx_1_1core_1_1metal.html#a0cdf2c08c7bc0927a86070adc206987f',1,'mlx::core::metal::is_available()'],['../namespacemlx_1_1core_1_1distributed.html#a95655473cd0032c06e5fe3fca85aeef3',1,'mlx::core::distributed::is_available()']]],
-  ['is_5fdonatable_62',['is_donatable',['../classmlx_1_1core_1_1array.html#a4677a404b5d191af20b52649225de087',1,'mlx::core::array::is_donatable()'],['../namespacemlx_1_1core.html#af650e831ce21759da1ac103037d08d84',1,'mlx::core::is_donatable()']]],
-  ['is_5fempty_63',['is_empty',['../structmetal_1_1is__empty.html',1,'metal']]],
-  ['is_5fequivalent_64',['is_equivalent',['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62',1,'mlx::core::fast::ScaledDotProductAttention::is_equivalent()'],['../classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd',1,'mlx::core::Primitive::is_equivalent()'],['../classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67',1,'mlx::core::Abs::is_equivalent()'],['../classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f',1,'mlx::core::Add::is_equivalent()'],['../classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f',1,'mlx::core::AddMM::is_equivalent()'],['../classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35',1,'mlx::core::Arange::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5',1,'mlx::core::ArcCos::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee',1,'mlx::core::ArcCosh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab',1,'mlx::core::ArcSin::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f',1,'mlx::core::ArcSinh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c',1,'mlx::core::ArcTan::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc',1,'mlx::core::ArcTan2::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2',1,'mlx::core::ArcTanh::is_equivalent()'],['../classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a',1,'mlx::core::ArgPartition::is_equivalent()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97',1,'mlx::core::ArgReduce::is_equivalent()'],['../classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845',1,'mlx::core::ArgSort::is_equivalent()'],['../classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e
3a8fb38af',1,'mlx::core::AsType::is_equivalent()'],['../classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094',1,'mlx::core::AsStrided::is_equivalent()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8',1,'mlx::core::BitwiseBinary::is_equivalent()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160',1,'mlx::core::BlockMaskedMM::is_equivalent()'],['../classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b',1,'mlx::core::GatherMM::is_equivalent()'],['../classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616',1,'mlx::core::Broadcast::is_equivalent()'],['../classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52',1,'mlx::core::Ceil::is_equivalent()'],['../classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10',1,'mlx::core::Compiled::is_equivalent()'],['../classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2',1,'mlx::core::Concatenate::is_equivalent()'],['../classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e',1,'mlx::core::Conjugate::is_equivalent()'],['../classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de',1,'mlx::core::Convolution::is_equivalent()'],['../classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da',1,'mlx::core::Copy::is_equivalent()'],['../classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417',1,'mlx::core::Cos::is_equivalent()'],['../classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9',1,'mlx::core::Cosh::is_equivalent()'],['../classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650',1,'mlx::core::Divide::is_equivalent()'],['../classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a',1,'mlx::core::DivMod::is_equivalent()'],['../classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8',1,'mlx::core::Select::is_equivalent()'],['../classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814',1
,'mlx::core::Remainder::is_equivalent()'],['../classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02',1,'mlx::core::Equal::is_equivalent()'],['../classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82',1,'mlx::core::Erf::is_equivalent()'],['../classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832',1,'mlx::core::ErfInv::is_equivalent()'],['../classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357',1,'mlx::core::Exp::is_equivalent()'],['../classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06',1,'mlx::core::FFT::is_equivalent()'],['../classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94',1,'mlx::core::Floor::is_equivalent()'],['../classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792',1,'mlx::core::Full::is_equivalent()'],['../classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa',1,'mlx::core::Gather::is_equivalent()'],['../classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1',1,'mlx::core::Greater::is_equivalent()'],['../classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc',1,'mlx::core::GreaterEqual::is_equivalent()'],['../classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8',1,'mlx::core::Hadamard::is_equivalent()'],['../classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5',1,'mlx::core::Imag::is_equivalent()'],['../classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63',1,'mlx::core::Less::is_equivalent()'],['../classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af',1,'mlx::core::LessEqual::is_equivalent()'],['../classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8',1,'mlx::core::Log::is_equivalent()'],['../classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99',1,'mlx::core::LogicalNot::is_equivalent()'],['../classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be',1,'mlx::core::LogicalAnd::is_equivalent()'],['../classmlx_1_1core_1_1_logi
cal_or.html#a9c8b10a5cf5c69fdc2362390197e4e71',1,'mlx::core::LogicalOr::is_equivalent()'],['../classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4',1,'mlx::core::LogAddExp::is_equivalent()'],['../classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630',1,'mlx::core::Matmul::is_equivalent()'],['../classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46',1,'mlx::core::Maximum::is_equivalent()'],['../classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4',1,'mlx::core::Minimum::is_equivalent()'],['../classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2',1,'mlx::core::Multiply::is_equivalent()'],['../classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823',1,'mlx::core::Negative::is_equivalent()'],['../classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d',1,'mlx::core::NotEqual::is_equivalent()'],['../classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f',1,'mlx::core::NumberOfElements::is_equivalent()'],['../classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b',1,'mlx::core::Pad::is_equivalent()'],['../classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8',1,'mlx::core::Partition::is_equivalent()'],['../classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68',1,'mlx::core::Power::is_equivalent()'],['../classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1',1,'mlx::core::QuantizedMatmul::is_equivalent()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11',1,'mlx::core::GatherQMM::is_equivalent()'],['../classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6',1,'mlx::core::RandomBits::is_equivalent()'],['../classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239',1,'mlx::core::Real::is_equivalent()'],['../classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3',1,'mlx::core::Reshape::is_equivalent()'],['../classmlx_1_1core_
1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e',1,'mlx::core::Reduce::is_equivalent()'],['../classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927',1,'mlx::core::Round::is_equivalent()'],['../classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6',1,'mlx::core::Scan::is_equivalent()'],['../classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f',1,'mlx::core::Scatter::is_equivalent()'],['../classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e',1,'mlx::core::Sigmoid::is_equivalent()'],['../classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb',1,'mlx::core::Sign::is_equivalent()'],['../classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a',1,'mlx::core::Sin::is_equivalent()'],['../classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d',1,'mlx::core::Sinh::is_equivalent()'],['../classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0',1,'mlx::core::Slice::is_equivalent()'],['../classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119',1,'mlx::core::SliceUpdate::is_equivalent()'],['../classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728',1,'mlx::core::Softmax::is_equivalent()'],['../classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511',1,'mlx::core::Sort::is_equivalent()'],['../classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345',1,'mlx::core::Split::is_equivalent()'],['../classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2',1,'mlx::core::Square::is_equivalent()'],['../classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46',1,'mlx::core::Sqrt::is_equivalent()'],['../classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3',1,'mlx::core::StopGradient::is_equivalent()'],['../classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b',1,'mlx::core::Subtract::is_equivalent()'],['../classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4',1,'mlx::core::Tan::is_equival
ent()'],['../classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda',1,'mlx::core::Tanh::is_equivalent()'],['../classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b',1,'mlx::core::Uniform::is_equivalent()'],['../classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64',1,'mlx::core::View::is_equivalent()'],['../classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab',1,'mlx::core::Transpose::is_equivalent()'],['../classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381',1,'mlx::core::Eigh::is_equivalent()']]],
-  ['is_5fintegral_65',['is_integral',['../structmlx_1_1steel_1_1is__integral.html',1,'mlx::steel']]],
-  ['is_5fintegral_3c_20integral_5fconstant_3c_20t_2c_20v_20_3e_20_3e_66',['is_integral&lt; integral_constant&lt; T, v &gt; &gt;',['../structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html',1,'mlx::steel']]],
-  ['is_5fintegral_5fv_67',['is_integral_v',['../namespacemlx_1_1steel.html#a92a3465716ea7fd682d22cecc08d45fd',1,'mlx::steel']]],
-  ['is_5fmetal_5fatomic_68',['is_metal_atomic',['../atomic_8h.html#a91a8bdcae647947a83c6689d7f252d24',1,'atomic.h']]],
-  ['is_5fopen_69',['is_open',['../classmlx_1_1core_1_1io_1_1_reader.html#a780f504058bd9c80cb3d105046a9f985',1,'mlx::core::io::Reader::is_open()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a85aa36bdb0dbfb8c5b6cfd955b03417a',1,'mlx::core::io::Writer::is_open()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a653009adbcbce8248bc666df502fdbde',1,'mlx::core::io::ParallelFileReader::is_open()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#ad5d2ee671a81700cb1658c41309d6676',1,'mlx::core::io::FileWriter::is_open()']]],
-  ['is_5fpower_5fof_5f2_70',['is_power_of_2',['../namespacemlx_1_1core.html#adacbc4526e8964b267a8ec3eb1bc1a32',1,'mlx::core']]],
-  ['is_5fpower_5fof_5f2_5f_71',['is_power_of_2_',['../backend_2metal_2kernels_2fft_8h.html#a2a4df90e329b84ee6c1890ba7c265c9c',1,'fft.h']]],
-  ['is_5fready_72',['is_ready',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#ab41ecc5adb6187aa2682ca190fd920f3',1,'pocketfft::detail::threading::latch']]],
-  ['is_5fsame_5fshape_73',['is_same_shape',['../namespacemlx_1_1core.html#ad4b664de4a4abd305827b30879b9da33',1,'mlx::core']]],
-  ['is_5fscalar_74',['is_scalar',['../namespacemlx_1_1core.html#a985c60929757190e0b4ec51f57c767d0',1,'mlx::core']]],
-  ['is_5fsignaled_75',['is_signaled',['../classmlx_1_1core_1_1_event.html#a05a9a3de88185b4a89e154242b4e770a',1,'mlx::core::Event']]],
-  ['is_5fstatic_76',['is_static',['../structmetal_1_1is__static.html',1,'metal']]],
-  ['is_5fstatic_5fcast_77',['is_static_cast',['../namespacemlx_1_1core.html#afd9e740e567f9d7c28e00113caf46d5f',1,'mlx::core']]],
-  ['is_5ftracer_78',['is_tracer',['../classmlx_1_1core_1_1array.html#af9acb115019b995354d366c4ac6b968c',1,'mlx::core::array']]],
-  ['isclose_79',['isclose',['../group__ops.html#ga51eac95c04400921c54716de14b52491',1,'mlx::core']]],
-  ['isfinite_80',['isfinite',['../group__ops.html#ga725ff0789f934b1fdd54ee29e47022ff',1,'mlx::core']]],
-  ['isinf_81',['isinf',['../group__ops.html#ga8fc238d5e5d1153e69da8b36015d9844',1,'mlx::core']]],
-  ['isnan_82',['isnan',['../namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb',1,'metal::isnan()'],['../group__ops.html#ga175592792471b0ffb45196dca4711ba6',1,'mlx::core::isnan(const array &amp;a, StreamOrDevice s={})']]],
-  ['isneginf_83',['isneginf',['../group__ops.html#ga1940523da381ed7be50656a3bc465ff3',1,'mlx::core']]],
-  ['isposinf_84',['isposinf',['../group__ops.html#gad80f7c4a58c12b6cb30a8b9a73008993',1,'mlx::core']]],
-  ['issubdtype_85',['issubdtype',['../namespacemlx_1_1core.html#aca9e69b06f4212eba44bf0ce6711d5f7',1,'mlx::core::issubdtype(const Dtype &amp;a, const Dtype &amp;b)'],['../namespacemlx_1_1core.html#aca39f224c1d17bde35dfcb9088430704',1,'mlx::core::issubdtype(const Dtype::Category &amp;a, const Dtype &amp;b)'],['../namespacemlx_1_1core.html#ae9ee4a7c205df061c1caa7e62b7504e8',1,'mlx::core::issubdtype(const Dtype &amp;a, const Dtype::Category &amp;b)'],['../namespacemlx_1_1core.html#ab5b1a5a3d545a5de00c3117f76d71a1d',1,'mlx::core::issubdtype(const Dtype::Category &amp;a, const Dtype::Category &amp;b)']]],
-  ['item_86',['item',['../classmlx_1_1core_1_1array.html#a90c5afddc2fa3028c0f8099bd64c8a99',1,'mlx::core::array::item()'],['../classmlx_1_1core_1_1array.html#a8650a99a6b7549bc823b03ad92590ff7',1,'mlx::core::array::item() const']]],
-  ['itemsize_87',['itemsize',['../classmlx_1_1core_1_1array.html#af329d9432c92de87cbaa2de8454eefc0',1,'mlx::core::array']]],
-  ['iterator_5fcategory_88',['iterator_category',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a2cbf481e39164245668b3be6cbcc614d',1,'mlx::core::array::ArrayIterator']]]
+  ['inverse_51',['Inverse',['../classmlx_1_1core_1_1_inverse.html',1,'mlx::core::Inverse'],['../classmlx_1_1core_1_1_inverse.html#a71467681e523abb725724490bfeb76ad',1,'mlx::core::Inverse::Inverse()']]],
+  ['io_2eh_52',['io.h',['../io_8h.html',1,'']]],
+  ['iofs_53',['iofs',['../classpocketfft_1_1detail_1_1multi__iter.html#ad33360d4a8ab8e6d72efadc6f9cb5bfa',1,'pocketfft::detail::multi_iter::iofs(size_t i) const'],['../classpocketfft_1_1detail_1_1multi__iter.html#a97462d97bdca6419d8d2f37c2031fe83',1,'pocketfft::detail::multi_iter::iofs(size_t j, size_t i) const']]],
+  ['irfft_54',['irfft',['../namespacemlx_1_1core_1_1fft.html#aafa721d0492e9f74913a6e86b4896ad8',1,'mlx::core::fft::irfft(const array &amp;a, int n, int axis, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#afbd0035a3cf91f428838de1fcf01a3a3',1,'mlx::core::fft::irfft(const array &amp;a, int axis=-1, StreamOrDevice s={})']]],
+  ['irfft2_55',['irfft2',['../namespacemlx_1_1core_1_1fft.html#a35754b00e98d7ef37ce8230c8887a933',1,'mlx::core::fft::irfft2(const array &amp;a, const std::vector&lt; int &gt; &amp;n, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#af7c7bbbbce26c2775a77473502a8de02',1,'mlx::core::fft::irfft2(const array &amp;a, const std::vector&lt; int &gt; &amp;axes={-2, -1}, StreamOrDevice s={})']]],
+  ['irfftn_56',['irfftn',['../namespacemlx_1_1core_1_1fft.html#a33f2973ea1b621e67064e46136d2960f',1,'mlx::core::fft::irfftn(const array &amp;a, const std::vector&lt; int &gt; &amp;n, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#a1c9ad11121c5879d5c04bbde2ee238c3',1,'mlx::core::fft::irfftn(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#aaf5a7ef93b3426b94c2363a23a5a5b36',1,'mlx::core::fft::irfftn(const array &amp;a, StreamOrDevice s={})']]],
+  ['is_57',['iS',['../struct_m_l_x_conv_params.html#a72e1c3b4da0f70622cf18036bbf97fe6',1,'MLXConvParams']]],
+  ['is_5farray_5fv_58',['is_array_v',['../namespacemlx_1_1core.html#a01b0d64a75dfa2e95d6c7b5c53d708af',1,'mlx::core']]],
+  ['is_5farrays_5fv_59',['is_arrays_v',['../namespacemlx_1_1core.html#a94c1057929b390e5613304afa16dfbda',1,'mlx::core']]],
+  ['is_5favailable_60',['is_available',['../classmlx_1_1core_1_1array.html#aebed1f37c19197be76105161102a8a40',1,'mlx::core::array::is_available()'],['../namespacemlx_1_1core_1_1metal.html#a0cdf2c08c7bc0927a86070adc206987f',1,'mlx::core::metal::is_available()'],['../namespacemlx_1_1core_1_1distributed.html#a95655473cd0032c06e5fe3fca85aeef3',1,'mlx::core::distributed::is_available()']]],
+  ['is_5fdonatable_61',['is_donatable',['../classmlx_1_1core_1_1array.html#a4677a404b5d191af20b52649225de087',1,'mlx::core::array::is_donatable()'],['../namespacemlx_1_1core.html#af650e831ce21759da1ac103037d08d84',1,'mlx::core::is_donatable()']]],
+  ['is_5fempty_62',['is_empty',['../structmetal_1_1is__empty.html',1,'metal']]],
+  ['is_5fequivalent_63',['is_equivalent',['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62',1,'mlx::core::fast::ScaledDotProductAttention::is_equivalent()'],['../classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd',1,'mlx::core::Primitive::is_equivalent()'],['../classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67',1,'mlx::core::Abs::is_equivalent()'],['../classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f',1,'mlx::core::Add::is_equivalent()'],['../classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f',1,'mlx::core::AddMM::is_equivalent()'],['../classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35',1,'mlx::core::Arange::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5',1,'mlx::core::ArcCos::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee',1,'mlx::core::ArcCosh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab',1,'mlx::core::ArcSin::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f',1,'mlx::core::ArcSinh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c',1,'mlx::core::ArcTan::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc',1,'mlx::core::ArcTan2::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2',1,'mlx::core::ArcTanh::is_equivalent()'],['../classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a',1,'mlx::core::ArgPartition::is_equivalent()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97',1,'mlx::core::ArgReduce::is_equivalent()'],['../classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845',1,'mlx::core::ArgSort::is_equivalent()'],['../classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e
3a8fb38af',1,'mlx::core::AsType::is_equivalent()'],['../classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094',1,'mlx::core::AsStrided::is_equivalent()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8',1,'mlx::core::BitwiseBinary::is_equivalent()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160',1,'mlx::core::BlockMaskedMM::is_equivalent()'],['../classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b',1,'mlx::core::GatherMM::is_equivalent()'],['../classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616',1,'mlx::core::Broadcast::is_equivalent()'],['../classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52',1,'mlx::core::Ceil::is_equivalent()'],['../classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10',1,'mlx::core::Compiled::is_equivalent()'],['../classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2',1,'mlx::core::Concatenate::is_equivalent()'],['../classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e',1,'mlx::core::Conjugate::is_equivalent()'],['../classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372',1,'mlx::core::Contiguous::is_equivalent()'],['../classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de',1,'mlx::core::Convolution::is_equivalent()'],['../classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da',1,'mlx::core::Copy::is_equivalent()'],['../classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417',1,'mlx::core::Cos::is_equivalent()'],['../classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9',1,'mlx::core::Cosh::is_equivalent()'],['../classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650',1,'mlx::core::Divide::is_equivalent()'],['../classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a',1,'mlx::core::DivMod::is_equivalent()'],['../classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874
c8',1,'mlx::core::Select::is_equivalent()'],['../classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814',1,'mlx::core::Remainder::is_equivalent()'],['../classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02',1,'mlx::core::Equal::is_equivalent()'],['../classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82',1,'mlx::core::Erf::is_equivalent()'],['../classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832',1,'mlx::core::ErfInv::is_equivalent()'],['../classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357',1,'mlx::core::Exp::is_equivalent()'],['../classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06',1,'mlx::core::FFT::is_equivalent()'],['../classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94',1,'mlx::core::Floor::is_equivalent()'],['../classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792',1,'mlx::core::Full::is_equivalent()'],['../classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa',1,'mlx::core::Gather::is_equivalent()'],['../classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1',1,'mlx::core::Greater::is_equivalent()'],['../classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc',1,'mlx::core::GreaterEqual::is_equivalent()'],['../classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8',1,'mlx::core::Hadamard::is_equivalent()'],['../classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5',1,'mlx::core::Imag::is_equivalent()'],['../classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63',1,'mlx::core::Less::is_equivalent()'],['../classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af',1,'mlx::core::LessEqual::is_equivalent()'],['../classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8',1,'mlx::core::Log::is_equivalent()'],['../classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99',1,'mlx::core::LogicalNot::is_equivalent()'],['../classmlx_1_1core_1_1_logic
al_and.html#a9572c35f72e0db2f7f86bbf42438a6be',1,'mlx::core::LogicalAnd::is_equivalent()'],['../classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71',1,'mlx::core::LogicalOr::is_equivalent()'],['../classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4',1,'mlx::core::LogAddExp::is_equivalent()'],['../classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630',1,'mlx::core::Matmul::is_equivalent()'],['../classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46',1,'mlx::core::Maximum::is_equivalent()'],['../classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4',1,'mlx::core::Minimum::is_equivalent()'],['../classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2',1,'mlx::core::Multiply::is_equivalent()'],['../classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823',1,'mlx::core::Negative::is_equivalent()'],['../classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d',1,'mlx::core::NotEqual::is_equivalent()'],['../classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f',1,'mlx::core::NumberOfElements::is_equivalent()'],['../classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b',1,'mlx::core::Pad::is_equivalent()'],['../classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8',1,'mlx::core::Partition::is_equivalent()'],['../classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68',1,'mlx::core::Power::is_equivalent()'],['../classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1',1,'mlx::core::QuantizedMatmul::is_equivalent()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11',1,'mlx::core::GatherQMM::is_equivalent()'],['../classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6',1,'mlx::core::RandomBits::is_equivalent()'],['../classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239',1,'mlx::core::Real::is_equivalent()'],['../classmlx_1_
1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3',1,'mlx::core::Reshape::is_equivalent()'],['../classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e',1,'mlx::core::Reduce::is_equivalent()'],['../classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927',1,'mlx::core::Round::is_equivalent()'],['../classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6',1,'mlx::core::Scan::is_equivalent()'],['../classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f',1,'mlx::core::Scatter::is_equivalent()'],['../classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e',1,'mlx::core::Sigmoid::is_equivalent()'],['../classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb',1,'mlx::core::Sign::is_equivalent()'],['../classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a',1,'mlx::core::Sin::is_equivalent()'],['../classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d',1,'mlx::core::Sinh::is_equivalent()'],['../classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0',1,'mlx::core::Slice::is_equivalent()'],['../classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119',1,'mlx::core::SliceUpdate::is_equivalent()'],['../classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728',1,'mlx::core::Softmax::is_equivalent()'],['../classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511',1,'mlx::core::Sort::is_equivalent()'],['../classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345',1,'mlx::core::Split::is_equivalent()'],['../classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2',1,'mlx::core::Square::is_equivalent()'],['../classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46',1,'mlx::core::Sqrt::is_equivalent()'],['../classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3',1,'mlx::core::StopGradient::is_equivalent()'],['../classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b',1,'mlx::core::Subtra
ct::is_equivalent()'],['../classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4',1,'mlx::core::Tan::is_equivalent()'],['../classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda',1,'mlx::core::Tanh::is_equivalent()'],['../classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b',1,'mlx::core::Uniform::is_equivalent()'],['../classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64',1,'mlx::core::View::is_equivalent()'],['../classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab',1,'mlx::core::Transpose::is_equivalent()'],['../classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381',1,'mlx::core::Eigh::is_equivalent()']]],
+  ['is_5fintegral_64',['is_integral',['../structmlx_1_1steel_1_1is__integral.html',1,'mlx::steel']]],
+  ['is_5fintegral_3c_20integral_5fconstant_3c_20t_2c_20v_20_3e_20_3e_65',['is_integral&lt; integral_constant&lt; T, v &gt; &gt;',['../structmlx_1_1steel_1_1is__integral_3_01integral__constant_3_01_t_00_01v_01_4_01_4.html',1,'mlx::steel']]],
+  ['is_5fintegral_5fv_66',['is_integral_v',['../namespacemlx_1_1steel.html#a92a3465716ea7fd682d22cecc08d45fd',1,'mlx::steel']]],
+  ['is_5fmetal_5fatomic_67',['is_metal_atomic',['../atomic_8h.html#a91a8bdcae647947a83c6689d7f252d24',1,'atomic.h']]],
+  ['is_5fopen_68',['is_open',['../classmlx_1_1core_1_1io_1_1_reader.html#a780f504058bd9c80cb3d105046a9f985',1,'mlx::core::io::Reader::is_open()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a85aa36bdb0dbfb8c5b6cfd955b03417a',1,'mlx::core::io::Writer::is_open()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a653009adbcbce8248bc666df502fdbde',1,'mlx::core::io::ParallelFileReader::is_open()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#ad5d2ee671a81700cb1658c41309d6676',1,'mlx::core::io::FileWriter::is_open()']]],
+  ['is_5fpower_5fof_5f2_69',['is_power_of_2',['../namespacemlx_1_1core.html#adacbc4526e8964b267a8ec3eb1bc1a32',1,'mlx::core']]],
+  ['is_5fpower_5fof_5f2_5f_70',['is_power_of_2_',['../backend_2metal_2kernels_2fft_8h.html#a2a4df90e329b84ee6c1890ba7c265c9c',1,'fft.h']]],
+  ['is_5fready_71',['is_ready',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#ab41ecc5adb6187aa2682ca190fd920f3',1,'pocketfft::detail::threading::latch']]],
+  ['is_5fsame_5fshape_72',['is_same_shape',['../namespacemlx_1_1core.html#ad4b664de4a4abd305827b30879b9da33',1,'mlx::core']]],
+  ['is_5fscalar_73',['is_scalar',['../namespacemlx_1_1core.html#a985c60929757190e0b4ec51f57c767d0',1,'mlx::core']]],
+  ['is_5fsignaled_74',['is_signaled',['../classmlx_1_1core_1_1_event.html#a05a9a3de88185b4a89e154242b4e770a',1,'mlx::core::Event']]],
+  ['is_5fstatic_75',['is_static',['../structmetal_1_1is__static.html',1,'metal']]],
+  ['is_5fstatic_5fcast_76',['is_static_cast',['../namespacemlx_1_1core.html#afd9e740e567f9d7c28e00113caf46d5f',1,'mlx::core']]],
+  ['is_5ftracer_77',['is_tracer',['../classmlx_1_1core_1_1array.html#af9acb115019b995354d366c4ac6b968c',1,'mlx::core::array']]],
+  ['isclose_78',['isclose',['../group__ops.html#ga51eac95c04400921c54716de14b52491',1,'mlx::core']]],
+  ['isfinite_79',['isfinite',['../group__ops.html#ga725ff0789f934b1fdd54ee29e47022ff',1,'mlx::core']]],
+  ['isinf_80',['isinf',['../group__ops.html#ga8fc238d5e5d1153e69da8b36015d9844',1,'mlx::core']]],
+  ['isnan_81',['isnan',['../namespacemetal.html#a83320ba983d90dd1fa5847b6940dc0bb',1,'metal::isnan()'],['../group__ops.html#ga175592792471b0ffb45196dca4711ba6',1,'mlx::core::isnan(const array &amp;a, StreamOrDevice s={})']]],
+  ['isneginf_82',['isneginf',['../group__ops.html#ga1940523da381ed7be50656a3bc465ff3',1,'mlx::core']]],
+  ['isposinf_83',['isposinf',['../group__ops.html#gad80f7c4a58c12b6cb30a8b9a73008993',1,'mlx::core']]],
+  ['issubdtype_84',['issubdtype',['../namespacemlx_1_1core.html#aca9e69b06f4212eba44bf0ce6711d5f7',1,'mlx::core::issubdtype(const Dtype &amp;a, const Dtype &amp;b)'],['../namespacemlx_1_1core.html#aca39f224c1d17bde35dfcb9088430704',1,'mlx::core::issubdtype(const Dtype::Category &amp;a, const Dtype &amp;b)'],['../namespacemlx_1_1core.html#ae9ee4a7c205df061c1caa7e62b7504e8',1,'mlx::core::issubdtype(const Dtype &amp;a, const Dtype::Category &amp;b)'],['../namespacemlx_1_1core.html#ab5b1a5a3d545a5de00c3117f76d71a1d',1,'mlx::core::issubdtype(const Dtype::Category &amp;a, const Dtype::Category &amp;b)']]],
+  ['item_85',['item',['../classmlx_1_1core_1_1array.html#a90c5afddc2fa3028c0f8099bd64c8a99',1,'mlx::core::array::item()'],['../classmlx_1_1core_1_1array.html#a8650a99a6b7549bc823b03ad92590ff7',1,'mlx::core::array::item() const']]],
+  ['itemsize_86',['itemsize',['../classmlx_1_1core_1_1array.html#af329d9432c92de87cbaa2de8454eefc0',1,'mlx::core::array']]],
+  ['iterator_5fcategory_87',['iterator_category',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a2cbf481e39164245668b3be6cbcc614d',1,'mlx::core::array::ArrayIterator']]]
 ];
diff --git a/docs/build/html/search/all_a.js b/docs/build/html/search/all_a.js
index 21bfc5e15..97a0c9517 100644
--- a/docs/build/html/search/all_a.js
+++ b/docs/build/html/search/all_a.js
@@ -1,5 +1,8 @@
 var searchData=
 [
-  ['jump_5fparams_0',['jump_params',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a21b9ee9168dad4af84a611f861519e77',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::jump_params'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aa5611e9a84bebaee966d2b339c214ff5',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::jump_params']]],
-  ['jvp_1',['jvp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80',1,'mlx::core::distributed::AllReduce::jvp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913',1,'mlx::core::distributed::AllGather::jvp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584',1,'mlx::core::fast::Custom::jvp()'],['../classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2',1,'mlx::core::Primitive::jvp()'],['../classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11',1,'mlx::core::Abs::jvp()'],['../classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7',1,'mlx::core::Add::jvp()'],['../classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9',1,'mlx::core::ArcCos::jvp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7',1,'mlx::core::ArcCosh::jvp()'],['../classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4',1,'mlx::core::ArcSin::jvp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4',1,'mlx::core::ArcSinh::jvp()'],['../classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760',1,'mlx::core::ArcTan::jvp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738',1,'mlx::core::ArcTan2::jvp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a',1,'mlx::core::ArcTanh::jvp()'],['../classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595',1,'mlx::core::ArgPartition::jvp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa',1,'mlx::core::ArgReduce::jvp()'],['../classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0',1,'mlx::core::AsType::jvp()'],['../classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53',1,'mlx::core::AsStrided::jvp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d',1,'mlx::core::BitwiseBinary:
:jvp()'],['../classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece',1,'mlx::core::Broadcast::jvp()'],['../classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066',1,'mlx::core::Ceil::jvp()'],['../classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205',1,'mlx::core::Compiled::jvp()'],['../classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1',1,'mlx::core::Concatenate::jvp()'],['../classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc',1,'mlx::core::Copy::jvp()'],['../classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1',1,'mlx::core::Cos::jvp()'],['../classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863',1,'mlx::core::Cosh::jvp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720',1,'mlx::core::CustomTransforms::jvp()'],['../classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c',1,'mlx::core::Divide::jvp()'],['../classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9',1,'mlx::core::DivMod::jvp()'],['../classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6',1,'mlx::core::Select::jvp()'],['../classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79',1,'mlx::core::Remainder::jvp()'],['../classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f',1,'mlx::core::Equal::jvp()'],['../classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe',1,'mlx::core::Erf::jvp()'],['../classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be',1,'mlx::core::ErfInv::jvp()'],['../classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59',1,'mlx::core::Exp::jvp()'],['../classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1',1,'mlx::core::Expm1::jvp()'],['../classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6',1,'mlx::core::FFT::jvp()'],['../classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af',1,'mlx::core::Floor::jvp()'],['../classmlx_1_1core_1_1
_full.html#a281a865d0664596ac8d05ea8e7f26407',1,'mlx::core::Full::jvp()'],['../classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d',1,'mlx::core::Gather::jvp()'],['../classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1',1,'mlx::core::Greater::jvp()'],['../classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20',1,'mlx::core::GreaterEqual::jvp()'],['../classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a',1,'mlx::core::Hadamard::jvp()'],['../classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a',1,'mlx::core::Imag::jvp()'],['../classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce',1,'mlx::core::Less::jvp()'],['../classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f',1,'mlx::core::LessEqual::jvp()'],['../classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832',1,'mlx::core::Log::jvp()'],['../classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2',1,'mlx::core::Log1p::jvp()'],['../classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c',1,'mlx::core::LogicalNot::jvp()'],['../classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434',1,'mlx::core::LogicalAnd::jvp()'],['../classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4',1,'mlx::core::LogicalOr::jvp()'],['../classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329',1,'mlx::core::LogAddExp::jvp()'],['../classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39',1,'mlx::core::Maximum::jvp()'],['../classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038',1,'mlx::core::Minimum::jvp()'],['../classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4',1,'mlx::core::Multiply::jvp()'],['../classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979',1,'mlx::core::Negative::jvp()'],['../classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17',1,'mlx::core::NotEqual::jvp()'],['../classml
x_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72',1,'mlx::core::Pad::jvp()'],['../classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a',1,'mlx::core::Partition::jvp()'],['../classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a',1,'mlx::core::Power::jvp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23',1,'mlx::core::QuantizedMatmul::jvp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0',1,'mlx::core::GatherQMM::jvp()'],['../classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526',1,'mlx::core::Real::jvp()'],['../classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5',1,'mlx::core::Reshape::jvp()'],['../classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7',1,'mlx::core::Round::jvp()'],['../classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee',1,'mlx::core::Scan::jvp()'],['../classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934',1,'mlx::core::Scatter::jvp()'],['../classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db',1,'mlx::core::Sigmoid::jvp()'],['../classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b',1,'mlx::core::Sign::jvp()'],['../classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de',1,'mlx::core::Sin::jvp()'],['../classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c',1,'mlx::core::Sinh::jvp()'],['../classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36',1,'mlx::core::Slice::jvp()'],['../classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611',1,'mlx::core::SliceUpdate::jvp()'],['../classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f',1,'mlx::core::Softmax::jvp()'],['../classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62',1,'mlx::core::Sort::jvp()'],['../classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282',1,'mlx::core::Split::jvp()'],['../classmlx_1_1core_1_1_square.html#a822
629b93b91e2bef29959431d95e22d',1,'mlx::core::Square::jvp()'],['../classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818',1,'mlx::core::Sqrt::jvp()'],['../classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220',1,'mlx::core::Subtract::jvp()'],['../classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2',1,'mlx::core::Tan::jvp()'],['../classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a',1,'mlx::core::Tanh::jvp()'],['../classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1',1,'mlx::core::Transpose::jvp()'],['../namespacemlx_1_1core.html#a179a632200366c223d6ab56d3e032592',1,'mlx::core::jvp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;tangents)'],['../namespacemlx_1_1core.html#af38e7582db29519bb39326f6fa531d20',1,'mlx::core::jvp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;tangent)']]]
+  ['jit_5felse_0',['jit_else',['../backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3',1,'bf16.h']]],
+  ['jit_5fendif_1',['jit_endif',['../backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56',1,'bf16.h']]],
+  ['jit_5fif_2',['jit_if',['../backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2',1,'bf16.h']]],
+  ['jump_5fparams_3',['jump_params',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a21b9ee9168dad4af84a611f861519e77',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::jump_params'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aa5611e9a84bebaee966d2b339c214ff5',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::jump_params']]],
+  ['jvp_4',['jvp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80',1,'mlx::core::distributed::AllReduce::jvp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913',1,'mlx::core::distributed::AllGather::jvp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584',1,'mlx::core::fast::Custom::jvp()'],['../classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2',1,'mlx::core::Primitive::jvp()'],['../classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11',1,'mlx::core::Abs::jvp()'],['../classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7',1,'mlx::core::Add::jvp()'],['../classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9',1,'mlx::core::ArcCos::jvp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7',1,'mlx::core::ArcCosh::jvp()'],['../classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4',1,'mlx::core::ArcSin::jvp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4',1,'mlx::core::ArcSinh::jvp()'],['../classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760',1,'mlx::core::ArcTan::jvp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738',1,'mlx::core::ArcTan2::jvp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a',1,'mlx::core::ArcTanh::jvp()'],['../classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595',1,'mlx::core::ArgPartition::jvp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa',1,'mlx::core::ArgReduce::jvp()'],['../classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0',1,'mlx::core::AsType::jvp()'],['../classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53',1,'mlx::core::AsStrided::jvp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d',1,'mlx::core::BitwiseBinary:
:jvp()'],['../classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece',1,'mlx::core::Broadcast::jvp()'],['../classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066',1,'mlx::core::Ceil::jvp()'],['../classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205',1,'mlx::core::Compiled::jvp()'],['../classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1',1,'mlx::core::Concatenate::jvp()'],['../classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991',1,'mlx::core::Contiguous::jvp()'],['../classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc',1,'mlx::core::Copy::jvp()'],['../classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1',1,'mlx::core::Cos::jvp()'],['../classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863',1,'mlx::core::Cosh::jvp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720',1,'mlx::core::CustomTransforms::jvp()'],['../classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c',1,'mlx::core::Divide::jvp()'],['../classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9',1,'mlx::core::DivMod::jvp()'],['../classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6',1,'mlx::core::Select::jvp()'],['../classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79',1,'mlx::core::Remainder::jvp()'],['../classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f',1,'mlx::core::Equal::jvp()'],['../classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe',1,'mlx::core::Erf::jvp()'],['../classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be',1,'mlx::core::ErfInv::jvp()'],['../classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59',1,'mlx::core::Exp::jvp()'],['../classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1',1,'mlx::core::Expm1::jvp()'],['../classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6',1,'mlx::core::FFT::jvp()'],['../classmlx_1
_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af',1,'mlx::core::Floor::jvp()'],['../classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407',1,'mlx::core::Full::jvp()'],['../classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d',1,'mlx::core::Gather::jvp()'],['../classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1',1,'mlx::core::Greater::jvp()'],['../classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20',1,'mlx::core::GreaterEqual::jvp()'],['../classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a',1,'mlx::core::Hadamard::jvp()'],['../classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a',1,'mlx::core::Imag::jvp()'],['../classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce',1,'mlx::core::Less::jvp()'],['../classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f',1,'mlx::core::LessEqual::jvp()'],['../classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832',1,'mlx::core::Log::jvp()'],['../classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2',1,'mlx::core::Log1p::jvp()'],['../classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c',1,'mlx::core::LogicalNot::jvp()'],['../classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434',1,'mlx::core::LogicalAnd::jvp()'],['../classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4',1,'mlx::core::LogicalOr::jvp()'],['../classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329',1,'mlx::core::LogAddExp::jvp()'],['../classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39',1,'mlx::core::Maximum::jvp()'],['../classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038',1,'mlx::core::Minimum::jvp()'],['../classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4',1,'mlx::core::Multiply::jvp()'],['../classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979',1,'mlx::core::Negative::jvp()'],['../clas
smlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17',1,'mlx::core::NotEqual::jvp()'],['../classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72',1,'mlx::core::Pad::jvp()'],['../classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a',1,'mlx::core::Partition::jvp()'],['../classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a',1,'mlx::core::Power::jvp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23',1,'mlx::core::QuantizedMatmul::jvp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0',1,'mlx::core::GatherQMM::jvp()'],['../classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526',1,'mlx::core::Real::jvp()'],['../classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5',1,'mlx::core::Reshape::jvp()'],['../classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7',1,'mlx::core::Round::jvp()'],['../classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee',1,'mlx::core::Scan::jvp()'],['../classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934',1,'mlx::core::Scatter::jvp()'],['../classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db',1,'mlx::core::Sigmoid::jvp()'],['../classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b',1,'mlx::core::Sign::jvp()'],['../classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de',1,'mlx::core::Sin::jvp()'],['../classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c',1,'mlx::core::Sinh::jvp()'],['../classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36',1,'mlx::core::Slice::jvp()'],['../classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611',1,'mlx::core::SliceUpdate::jvp()'],['../classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f',1,'mlx::core::Softmax::jvp()'],['../classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62',1,'mlx::core::Sort::jvp()'],['../classmlx_1_1core_1_1_split.
html#ab8a8d30fd1ebf0891f41f3c32eabe282',1,'mlx::core::Split::jvp()'],['../classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d',1,'mlx::core::Square::jvp()'],['../classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818',1,'mlx::core::Sqrt::jvp()'],['../classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220',1,'mlx::core::Subtract::jvp()'],['../classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2',1,'mlx::core::Tan::jvp()'],['../classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a',1,'mlx::core::Tanh::jvp()'],['../classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1',1,'mlx::core::Transpose::jvp()'],['../namespacemlx_1_1core.html#a179a632200366c223d6ab56d3e032592',1,'mlx::core::jvp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;tangents)'],['../namespacemlx_1_1core.html#af38e7582db29519bb39326f6fa531d20',1,'mlx::core::jvp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;tangent)']]]
 ];
diff --git a/docs/build/html/search/all_b.js b/docs/build/html/search/all_b.js
index d9a4955de..708cda096 100644
--- a/docs/build/html/search/all_b.js
+++ b/docs/build/html/search/all_b.js
@@ -1,25 +1,28 @@
 var searchData=
 [
-  ['k_0',['K',['../struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23',1,'MLXFastAttentionParams::K'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba',1,'mlx::steel::ImplicitGemmConv2DParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff',1,'mlx::steel::GEMMParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7',1,'mlx::steel::GEMMSpiltKParams::K']]],
-  ['kcols_1',['kCols',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257',1,'mlx::steel::MMATile']]],
-  ['kdil_2',['kdil',['../struct_m_l_x_conv_params.html#a7611db8f1621c7e09fc685ed44073b14',1,'MLXConvParams']]],
-  ['kelemcols_3',['kElemCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
-  ['kelemrows_4',['kElemRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
-  ['kelemsperfrag_5',['kElemsPerFrag',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a3c34dfdc944db110f4735f1b25307cf0',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kElemsPerFrag'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6',1,'mlx::steel::MMATile::kElemsPerFrag']]],
-  ['kelemspertile_6',['kElemsPerTile',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f',1,'mlx::steel::MMATile']]],
-  ['kernelmergesort_7',['KernelMergeSort',['../struct_kernel_merge_sort.html',1,'']]],
-  ['kernelmultiblockmergesort_8',['KernelMultiBlockMergeSort',['../struct_kernel_multi_block_merge_sort.html',1,'']]],
-  ['kernels_2eh_9',['kernels.h',['../kernels_8h.html',1,'']]],
-  ['key_10',['key',['../namespacemlx_1_1core_1_1random.html#acf04b6f42de11383e86dcc7f98c67bd8',1,'mlx::core::random']]],
-  ['keysequence_11',['KeySequence',['../classmlx_1_1core_1_1random_1_1_key_sequence.html',1,'mlx::core::random::KeySequence'],['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a196eb6ce5ba1eb37cc8c67d6d1332bfe',1,'mlx::core::random::KeySequence::KeySequence()']]],
-  ['kfragcols_12',['kFragCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragCols'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906',1,'mlx::steel::MMATile::kFragCols']]],
-  ['kfragrows_13',['kFragRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragRows'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7',1,'mlx::steel::MMATile::kFragRows']]],
-  ['kfragsize_14',['kFragSize',['../structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d',1,'mlx::steel::BlockMMA']]],
-  ['kind_15',['Kind',['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715',1,'mlx::core::Dtype']]],
-  ['kindof_16',['kindof',['../namespacemlx_1_1core.html#ad527b86818823db040195785efd7d724',1,'mlx::core']]],
-  ['knumfrags_17',['kNumFrags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3',1,'mlx::steel::MMATile']]],
-  ['krows_18',['kRows',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323',1,'mlx::steel::MMATile']]],
-  ['ktilecols_19',['kTileCols',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4',1,'mlx::steel::MMATile']]],
-  ['ktilerows_20',['kTileRows',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a',1,'mlx::steel::MMATile']]],
-  ['kv_5ftiles_21',['KV_TILES',['../struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0',1,'MLXScaledDotProductAttentionParams']]]
+  ['k_0',['K',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba',1,'mlx::steel::ImplicitGemmConv2DParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff',1,'mlx::steel::GEMMParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7',1,'mlx::steel::GEMMSpiltKParams::K']]],
+  ['k_5fstrides_1',['K_strides',['../structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974',1,'mlx::steel::AttnParams']]],
+  ['kcols_2',['kCols',['../structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901',1,'mlx::steel::CShape::kCols'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257',1,'mlx::steel::MMATile::kCols']]],
+  ['kcolsperthread_3',['kColsPerThread',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c',1,'mlx::steel::MMATile']]],
+  ['kdil_4',['kdil',['../struct_m_l_x_conv_params.html#a7611db8f1621c7e09fc685ed44073b14',1,'MLXConvParams']]],
+  ['kelemcols_5',['kElemCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
+  ['kelemrows_6',['kElemRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
+  ['kelemsperfrag_7',['kElemsPerFrag',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a3c34dfdc944db110f4735f1b25307cf0',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kElemsPerFrag'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6',1,'mlx::steel::MMATile::kElemsPerFrag']]],
+  ['kelemspertile_8',['kElemsPerTile',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f',1,'mlx::steel::MMATile']]],
+  ['kernelmergesort_9',['KernelMergeSort',['../struct_kernel_merge_sort.html',1,'']]],
+  ['kernelmultiblockmergesort_10',['KernelMultiBlockMergeSort',['../struct_kernel_multi_block_merge_sort.html',1,'']]],
+  ['kernels_2eh_11',['kernels.h',['../kernels_8h.html',1,'']]],
+  ['key_12',['key',['../namespacemlx_1_1core_1_1random.html#acf04b6f42de11383e86dcc7f98c67bd8',1,'mlx::core::random']]],
+  ['keysequence_13',['KeySequence',['../classmlx_1_1core_1_1random_1_1_key_sequence.html',1,'mlx::core::random::KeySequence'],['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a196eb6ce5ba1eb37cc8c67d6d1332bfe',1,'mlx::core::random::KeySequence::KeySequence()']]],
+  ['kfragcols_14',['kFragCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragCols'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906',1,'mlx::steel::MMATile::kFragCols']]],
+  ['kfragrows_15',['kFragRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragRows'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7',1,'mlx::steel::MMATile::kFragRows']]],
+  ['kfragsize_16',['kFragSize',['../structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d',1,'mlx::steel::BlockMMA']]],
+  ['kind_17',['Kind',['../structmlx_1_1core_1_1_dtype.html#adb1ea8b45a0c53e04a0e73b168702715',1,'mlx::core::Dtype']]],
+  ['kindof_18',['kindof',['../namespacemlx_1_1core.html#ad527b86818823db040195785efd7d724',1,'mlx::core']]],
+  ['kl_19',['kL',['../structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63',1,'mlx::steel::AttnParams']]],
+  ['knumfrags_20',['kNumFrags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3',1,'mlx::steel::MMATile']]],
+  ['krows_21',['kRows',['../structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993',1,'mlx::steel::CShape::kRows'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323',1,'mlx::steel::MMATile::kRows']]],
+  ['krowsperthread_22',['kRowsPerThread',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e',1,'mlx::steel::MMATile']]],
+  ['ktilecols_23',['kTileCols',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4',1,'mlx::steel::MMATile']]],
+  ['ktilerows_24',['kTileRows',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a',1,'mlx::steel::MMATile']]]
 ];
diff --git a/docs/build/html/search/all_c.js b/docs/build/html/search/all_c.js
index fc3efb463..7377a7751 100644
--- a/docs/build/html/search/all_c.js
+++ b/docs/build/html/search/all_c.js
@@ -8,82 +8,79 @@ var searchData=
   ['layer_5fnorm_5',['layer_norm',['../namespacemlx_1_1core_1_1fast.html#a01bd533ebd0e2415c4ee30032d51d7bf',1,'mlx::core::fast']]],
   ['layernorm_6',['LayerNorm',['../classmlx_1_1core_1_1fast_1_1_layer_norm.html',1,'mlx::core::fast::LayerNorm'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5ac38d50e62850589bf51ee313303153',1,'mlx::core::fast::LayerNorm::LayerNorm()']]],
   ['layernormvjp_7',['LayerNormVJP',['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html',1,'mlx::core::fast::LayerNormVJP'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a41bc1391dbc0cf63b2c85b67956c08d9',1,'mlx::core::fast::LayerNormVJP::LayerNormVJP()']]],
-  ['lda_8',['lda',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#afec42b532ffcad32bbffd494526bef03',1,'mlx::steel::GEMMParams::lda'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a6fac3c4a7c35af7b46b53f9662f882c6',1,'mlx::steel::GEMMSpiltKParams::lda']]],
-  ['ldb_9',['ldb',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6032a081ab707c14b5f28069faa7cf62',1,'mlx::steel::GEMMParams::ldb'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a7f6f511854ccc98fa573bb560776ebed',1,'mlx::steel::GEMMSpiltKParams::ldb']]],
-  ['ldc_10',['ldc',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a888730efa5c5c8ae7ed771c3084d583c',1,'mlx::steel::GEMMSpiltKParams::ldc'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a801e2245a36632160975a784b762a4e6',1,'mlx::steel::GEMMAddMMParams::ldc']]],
-  ['ldd_11',['ldd',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6e8ae14e3f97c499ad9c39358a1855ab',1,'mlx::steel::GEMMParams']]],
-  ['ldexp_12',['ldexp',['../namespacemetal.html#a3deed001738b6f03accd3c2195586c2b',1,'metal::ldexp()'],['../namespacemetal_1_1fast.html#adb045765987e76c7ad4b511fab0c867e',1,'metal::fast::ldexp()'],['../namespacemetal_1_1precise.html#aa0462827a08a9f475fdaeb104c98b6ab',1,'metal::precise::ldexp()']]],
-  ['ldk_13',['ldk',['../struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5',1,'MLXFastAttentionParams']]],
-  ['ldo_14',['ldo',['../struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c',1,'MLXFastAttentionParams']]],
-  ['ldq_15',['ldq',['../struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58',1,'MLXFastAttentionParams']]],
-  ['lds_16',['lds',['../struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a',1,'MLXFastAttentionParams']]],
-  ['ldv_17',['ldv',['../struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b',1,'MLXFastAttentionParams']]],
-  ['left_5fshift_18',['left_shift',['../group__ops.html#ga89682bf78491761e062d4ee7bef0c829',1,'mlx::core']]],
-  ['leftshift_19',['LeftShift',['../struct_left_shift.html',1,'LeftShift'],['../structmlx_1_1core_1_1detail_1_1_left_shift.html',1,'mlx::core::detail::LeftShift'],['../classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23da986b39e75cbe29fcda1d7bf7942a65a0',1,'mlx::core::BitwiseBinary::LeftShift']]],
-  ['length_20',['length',['../classpocketfft_1_1detail_1_1pocketfft__c.html#a1fd1a2f9b3ae5ee9f00b9ca6946eb16d',1,'pocketfft::detail::pocketfft_c::length()'],['../classpocketfft_1_1detail_1_1pocketfft__r.html#a83222fdbf81a7c6d560e0841cdfca8c6',1,'pocketfft::detail::pocketfft_r::length()'],['../classpocketfft_1_1detail_1_1_t__dct1.html#ac7a04c91d507bd8f173d2266bb5bb168',1,'pocketfft::detail::T_dct1::length()'],['../classpocketfft_1_1detail_1_1_t__dst1.html#ab205d901650e38b592ff860b7978fa3e',1,'pocketfft::detail::T_dst1::length()'],['../classpocketfft_1_1detail_1_1_t__dcst23.html#a6dab012b487ff98d11b8a9418653a478',1,'pocketfft::detail::T_dcst23::length()'],['../classpocketfft_1_1detail_1_1_t__dcst4.html#af25bf28a7ccd4690ca9934e3aa79c12f',1,'pocketfft::detail::T_dcst4::length()']]],
-  ['length_5fin_21',['length_in',['../classpocketfft_1_1detail_1_1multi__iter.html#a5318b79d934cddf109dff7bf96a330c8',1,'pocketfft::detail::multi_iter']]],
-  ['length_5fout_22',['length_out',['../classpocketfft_1_1detail_1_1multi__iter.html#a93cd515d07cd479138a35da9df66bd41',1,'pocketfft::detail::multi_iter']]],
-  ['less_23',['Less',['../struct_less.html',1,'Less'],['../structmlx_1_1core_1_1detail_1_1_less.html',1,'mlx::core::detail::Less'],['../classmlx_1_1core_1_1_less.html',1,'mlx::core::Less'],['../classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7',1,'mlx::core::Less::Less()']]],
-  ['less_24',['less',['../group__ops.html#ga9142b8d717699a8abfa2a7398891ff8a',1,'mlx::core']]],
-  ['less_5fequal_25',['less_equal',['../group__ops.html#ga0d49e0c7011d0573c369c13c8f045a09',1,'mlx::core']]],
-  ['lessequal_26',['LessEqual',['../struct_less_equal.html',1,'LessEqual'],['../structmlx_1_1core_1_1detail_1_1_less_equal.html',1,'mlx::core::detail::LessEqual'],['../classmlx_1_1core_1_1_less_equal.html',1,'mlx::core::LessEqual'],['../classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc',1,'mlx::core::LessEqual::LessEqual()']]],
-  ['lessthan_27',['LessThan',['../struct_less_than.html',1,'']]],
-  ['lib_5fname_28',['lib_name',['../classmlx_1_1core_1_1_compiled.html#ae5c16cb91ac31b97e7652cc526c07439',1,'mlx::core::Compiled']]],
-  ['limits_29',['Limits',['../struct_limits.html',1,'']]],
-  ['limits_3c_20bfloat16_5ft_20_3e_30',['Limits&lt; bfloat16_t &gt;',['../struct_limits_3_01bfloat16__t_01_4.html',1,'']]],
-  ['limits_3c_20bool_20_3e_31',['Limits&lt; bool &gt;',['../struct_limits_3_01bool_01_4.html',1,'']]],
-  ['limits_3c_20complex64_5ft_20_3e_32',['Limits&lt; complex64_t &gt;',['../struct_limits_3_01complex64__t_01_4.html',1,'']]],
-  ['limits_3c_20float_20_3e_33',['Limits&lt; float &gt;',['../struct_limits_3_01float_01_4.html',1,'']]],
-  ['limits_3c_20half_20_3e_34',['Limits&lt; half &gt;',['../struct_limits_3_01half_01_4.html',1,'']]],
-  ['limits_3c_20int16_5ft_20_3e_35',['Limits&lt; int16_t &gt;',['../struct_limits_3_01int16__t_01_4.html',1,'']]],
-  ['limits_3c_20int32_5ft_20_3e_36',['Limits&lt; int32_t &gt;',['../struct_limits_3_01int32__t_01_4.html',1,'']]],
-  ['limits_3c_20int64_5ft_20_3e_37',['Limits&lt; int64_t &gt;',['../struct_limits_3_01int64__t_01_4.html',1,'']]],
-  ['limits_3c_20int8_5ft_20_3e_38',['Limits&lt; int8_t &gt;',['../struct_limits_3_01int8__t_01_4.html',1,'']]],
-  ['limits_3c_20uint16_5ft_20_3e_39',['Limits&lt; uint16_t &gt;',['../struct_limits_3_01uint16__t_01_4.html',1,'']]],
-  ['limits_3c_20uint32_5ft_20_3e_40',['Limits&lt; uint32_t &gt;',['../struct_limits_3_01uint32__t_01_4.html',1,'']]],
-  ['limits_3c_20uint64_5ft_20_3e_41',['Limits&lt; uint64_t &gt;',['../struct_limits_3_01uint64__t_01_4.html',1,'']]],
-  ['limits_3c_20uint8_5ft_20_3e_42',['Limits&lt; uint8_t &gt;',['../struct_limits_3_01uint8__t_01_4.html',1,'']]],
-  ['linalg_2eh_43',['linalg.h',['../linalg_8h.html',1,'']]],
-  ['linspace_44',['linspace',['../group__ops.html#ga968bcabed902311dcfbd903b0fb886ec',1,'mlx::core']]],
-  ['load_45',['Load',['../classmlx_1_1core_1_1_load.html',1,'mlx::core::Load'],['../classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a',1,'mlx::core::Load::Load()']]],
-  ['load_46',['load',['../struct_read_writer.html#a120eaf4b5f32e80972a18d14e82a2d75',1,'ReadWriter::load()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96',1,'mlx::steel::MMATile::load(const threadgroup U *src)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9',1,'mlx::steel::MMATile::load(const device U *src, const int ld)'],['../struct_read_writer.html#a8a97ba42db5692898ef7391db08d8fd0',1,'ReadWriter::load() const'],['../struct_read_writer.html#a2506ee61be67826ac9494efb12a81900',1,'ReadWriter::load() const'],['../namespacemlx_1_1core.html#a954de19249da7c1fa39b89bdc47368aa',1,'mlx::core::load(array &amp;out, size_t offset, const std::shared_ptr&lt; io::Reader &gt; &amp;reader, bool swap_endianess)'],['../namespacemlx_1_1core.html#abada9bfa834d7423959362386720f3db',1,'mlx::core::load(std::shared_ptr&lt; io::Reader &gt; in_stream, StreamOrDevice s={})'],['../namespacemlx_1_1core.html#ac71a08bf4c052ae3c77e9e89cbea071d',1,'mlx::core::load(std::string file, StreamOrDevice s={})']]],
-  ['load_2eh_47',['load.h',['../backend_2common_2load_8h.html',1,'(Global Namespace)'],['../io_2load_8h.html',1,'(Global Namespace)']]],
-  ['load_5fgguf_48',['load_gguf',['../namespacemlx_1_1core.html#a2aa12b351ce559deb14cda0a5292c2ce',1,'mlx::core']]],
-  ['load_5fpadded_49',['load_padded',['../struct_read_writer.html#add5bd3f647793a5a19d63197a19df73c',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#af3ce6bbb1a8dfb3bab1ae18d3eb45bc0',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#ab116f4569bb9dc6eaef0d8d08472e239',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const']]],
-  ['load_5fsafe_50',['load_safe',['../struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7',1,'GEMVKernel::load_safe()'],['../struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b',1,'QuantizedBlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d',1,'mlx::steel::BlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da',1,'mlx::steel::MMATile::load_safe()'],['../scan_8h.html#ae8eb101e538b85f8a4bcf451489ae0ac',1,'load_safe():&#160;scan.h']]],
-  ['load_5fsafetensors_51',['load_safetensors',['../namespacemlx_1_1core.html#a96cc40e1af8c4626c813ce4859f70a5c',1,'mlx::core::load_safetensors(std::shared_ptr&lt; io::Reader &gt; in_stream, StreamOrDevice s={})'],['../namespacemlx_1_1core.html#af7eea1682a38d363c56a066321e6d526',1,'mlx::core::load_safetensors(const std::string &amp;file, StreamOrDevice s={})']]],
-  ['load_5fstrided_52',['load_strided',['../struct_read_writer.html#a998ef484bade81f726b9edfc6b878197',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a3d9c8cbc582cad6b5218339d0f721559',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a795a71a8e1f154a5af415ebe1b3f0713',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a0935b946b8bf2e769427fcbf2da2f7be',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a7d45368c74a8b7c632659504b3273a13',1,'ReadWriter::load_strided(int stride, int overall_n)']]],
-  ['load_5funsafe_53',['load_unsafe',['../struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce',1,'GEMVKernel::load_unsafe()'],['../struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc',1,'QuantizedBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55',1,'mlx::steel::Conv2DWeightBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27',1,'mlx::steel::BlockLoader::load_unsafe()'],['../scan_8h.html#a9c415d07921f3961bad0a00a34f4a9a3',1,'load_unsafe(U values[N_READS], const device T *input):&#160;scan.h']]],
-  ['load_5fvector_54',['load_vector',['../quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9',1,'quantized.h']]],
-  ['load_5fvector_5fsafe_55',['load_vector_safe',['../quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7',1,'quantized.h']]],
-  ['loader_2eh_56',['loader.h',['../conv_2loader_8h.html',1,'(Global Namespace)'],['../gemm_2loader_8h.html',1,'(Global Namespace)']]],
-  ['loader_5fa_5ft_57',['loader_a_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa8a04ed74d2259f99b337d4662c64d83',1,'mlx::steel::GEMMKernel']]],
-  ['loader_5fb_5ft_58',['loader_b_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa98f32278b5fd98c93ae5483c3596395',1,'mlx::steel::GEMMKernel']]],
-  ['loader_5fchannel_5fl_2eh_59',['loader_channel_l.h',['../loader__channel__l_8h.html',1,'']]],
-  ['loader_5fchannel_5fn_2eh_60',['loader_channel_n.h',['../loader__channel__n_8h.html',1,'']]],
-  ['loader_5fgeneral_2eh_61',['loader_general.h',['../loader__general_8h.html',1,'']]],
-  ['loc_62',['loc',['../structmlx_1_1core_1_1_contiguous_iterator.html#a027b29e06d5cb467d961c019699514b1',1,'mlx::core::ContiguousIterator']]],
-  ['location_63',['location',['../structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2',1,'looped_elem_to_loc::location()'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::location()'],['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2',1,'looped_elem_to_loc&lt; 0, offset_t &gt;::location()']]],
-  ['log_64',['Log',['../struct_log.html',1,'Log'],['../structmlx_1_1core_1_1detail_1_1_log.html',1,'mlx::core::detail::Log'],['../classmlx_1_1core_1_1_log.html',1,'mlx::core::Log'],['../classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9',1,'mlx::core::Log::Log()']]],
-  ['log_65',['log',['../namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6',1,'metal::log()'],['../namespacemetal_1_1fast.html#aef942e7f9e5c2e58c58644ab1bdd58d1',1,'metal::fast::log()'],['../namespacemetal_1_1precise.html#a341c2b8c27d1bed860f85f8b355023d4',1,'metal::precise::log()'],['../group__ops.html#ga6fb22d4926133573e430fcc92f4eef31',1,'mlx::core::log()']]],
-  ['log10_66',['Log10',['../struct_log10.html',1,'Log10'],['../structmlx_1_1core_1_1detail_1_1_log10.html',1,'mlx::core::detail::Log10']]],
-  ['log10_67',['log10',['../namespacemetal.html#a042b98827baa910e9d726227cec55a80',1,'metal::log10()'],['../namespacemetal_1_1fast.html#a0d1150cf2deee5100a7ea2988b3bb39e',1,'metal::fast::log10()'],['../namespacemetal_1_1precise.html#a44239067e8e9248b1574353f98e94d72',1,'metal::precise::log10()'],['../group__ops.html#ga1fdcc7fc8819caf2e6f1c327ed4e9b9e',1,'mlx::core::log10()']]],
-  ['log1p_68',['Log1p',['../struct_log1p.html',1,'Log1p'],['../structmlx_1_1core_1_1detail_1_1_log1p.html',1,'mlx::core::detail::Log1p'],['../classmlx_1_1core_1_1_log1p.html',1,'mlx::core::Log1p'],['../classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a',1,'mlx::core::Log1p::Log1p()']]],
-  ['log1p_69',['log1p',['../backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a',1,'log1p(float x):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a3501b665c8837eabf9789ea27a7d6946',1,'log1p(bfloat16_t x):&#160;utils.h'],['../group__ops.html#ga20a1f4270c35b0fa544f5105a87a1604',1,'mlx::core::log1p()']]],
-  ['log2_70',['Log2',['../struct_log2.html',1,'Log2'],['../structmlx_1_1core_1_1detail_1_1_log2.html',1,'mlx::core::detail::Log2']]],
-  ['log2_71',['log2',['../namespacemetal.html#ae894dd5fc13799f120b55cab6267c89c',1,'metal::log2()'],['../namespacemetal_1_1fast.html#a986ef245dd433ae62af864f5cbb07118',1,'metal::fast::log2()'],['../namespacemetal_1_1precise.html#a632dbbdcc1a465cf4739a14306147573',1,'metal::precise::log2()'],['../group__ops.html#ga144228d7222d15af3a135b8b0f3fa21b',1,'mlx::core::log2()']]],
-  ['logaddexp_72',['LogAddExp',['../struct_log_add_exp.html',1,'LogAddExp'],['../structmlx_1_1core_1_1detail_1_1_log_add_exp.html',1,'mlx::core::detail::LogAddExp'],['../classmlx_1_1core_1_1_log_add_exp.html',1,'mlx::core::LogAddExp'],['../classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a',1,'mlx::core::LogAddExp::LogAddExp()']]],
-  ['logaddexp_73',['logaddexp',['../group__ops.html#gaf985df6609c6bd75a14a844655d89eaa',1,'mlx::core']]],
-  ['logical_5fand_74',['logical_and',['../group__ops.html#ga768977cda8d68cf23f464a6af9907876',1,'mlx::core']]],
-  ['logical_5fnot_75',['logical_not',['../group__ops.html#gabca78d34ce93f0de2814e62225bb2a53',1,'mlx::core']]],
-  ['logical_5for_76',['logical_or',['../group__ops.html#ga23768728e4dd070c917fbb0ed0d0c2ec',1,'mlx::core']]],
-  ['logicaland_77',['LogicalAnd',['../struct_logical_and.html',1,'LogicalAnd'],['../structmlx_1_1core_1_1detail_1_1_logical_and.html',1,'mlx::core::detail::LogicalAnd'],['../classmlx_1_1core_1_1_logical_and.html',1,'mlx::core::LogicalAnd'],['../classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3',1,'mlx::core::LogicalAnd::LogicalAnd()']]],
-  ['logicalnot_78',['LogicalNot',['../struct_logical_not.html',1,'LogicalNot'],['../structmlx_1_1core_1_1detail_1_1_logical_not.html',1,'mlx::core::detail::LogicalNot'],['../classmlx_1_1core_1_1_logical_not.html',1,'mlx::core::LogicalNot'],['../classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7',1,'mlx::core::LogicalNot::LogicalNot()']]],
-  ['logicalor_79',['LogicalOr',['../struct_logical_or.html',1,'LogicalOr'],['../structmlx_1_1core_1_1detail_1_1_logical_or.html',1,'mlx::core::detail::LogicalOr'],['../classmlx_1_1core_1_1_logical_or.html',1,'mlx::core::LogicalOr'],['../classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918',1,'mlx::core::LogicalOr::LogicalOr()']]],
-  ['logsumexp_80',['logsumexp',['../group__ops.html#gacff4eb57c085d571e722083680267ac5',1,'mlx::core::logsumexp(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga59be50b4e92f1dc20b53460cefa3910d',1,'mlx::core::logsumexp(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gae3969c7bd24c4f3ab97831df28239689',1,'mlx::core::logsumexp(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gafef5cb2159c16a60a95470cc823bdd44',1,'mlx::core::logsumexp(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['loopalignment_81',['LoopAlignment',['../structmlx_1_1steel_1_1_loop_alignment.html',1,'mlx::steel']]],
-  ['looped_5felem_5fto_5floc_82',['looped_elem_to_loc',['../structlooped__elem__to__loc.html',1,'']]],
-  ['looped_5felem_5fto_5floc_3c_200_2c_20offset_5ft_20_3e_83',['looped_elem_to_loc&lt; 0, offset_t &gt;',['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html',1,'']]],
-  ['looped_5felem_5fto_5floc_3c_201_2c_20offset_5ft_20_3e_84',['looped_elem_to_loc&lt; 1, offset_t &gt;',['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html',1,'']]],
-  ['lowest_85',['lowest',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]]
+  ['layout_8',['layout',['../structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1',1,'mlx::steel::Layout2D']]],
+  ['layout2d_9',['Layout2D',['../structmlx_1_1steel_1_1_layout2_d.html',1,'mlx::steel']]],
+  ['lda_10',['lda',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#afec42b532ffcad32bbffd494526bef03',1,'mlx::steel::GEMMParams::lda'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a6fac3c4a7c35af7b46b53f9662f882c6',1,'mlx::steel::GEMMSpiltKParams::lda']]],
+  ['ldb_11',['ldb',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6032a081ab707c14b5f28069faa7cf62',1,'mlx::steel::GEMMParams::ldb'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a7f6f511854ccc98fa573bb560776ebed',1,'mlx::steel::GEMMSpiltKParams::ldb']]],
+  ['ldc_12',['ldc',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a888730efa5c5c8ae7ed771c3084d583c',1,'mlx::steel::GEMMSpiltKParams::ldc'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a801e2245a36632160975a784b762a4e6',1,'mlx::steel::GEMMAddMMParams::ldc']]],
+  ['ldd_13',['ldd',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6e8ae14e3f97c499ad9c39358a1855ab',1,'mlx::steel::GEMMParams']]],
+  ['ldexp_14',['ldexp',['../namespacemetal.html#a3deed001738b6f03accd3c2195586c2b',1,'metal::ldexp()'],['../namespacemetal_1_1fast.html#adb045765987e76c7ad4b511fab0c867e',1,'metal::fast::ldexp()'],['../namespacemetal_1_1precise.html#aa0462827a08a9f475fdaeb104c98b6ab',1,'metal::precise::ldexp()']]],
+  ['left_5fshift_15',['left_shift',['../group__ops.html#ga89682bf78491761e062d4ee7bef0c829',1,'mlx::core']]],
+  ['leftshift_16',['LeftShift',['../struct_left_shift.html',1,'LeftShift'],['../structmlx_1_1core_1_1detail_1_1_left_shift.html',1,'mlx::core::detail::LeftShift'],['../classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23da986b39e75cbe29fcda1d7bf7942a65a0',1,'mlx::core::BitwiseBinary::LeftShift']]],
+  ['length_17',['length',['../classpocketfft_1_1detail_1_1pocketfft__c.html#a1fd1a2f9b3ae5ee9f00b9ca6946eb16d',1,'pocketfft::detail::pocketfft_c::length()'],['../classpocketfft_1_1detail_1_1pocketfft__r.html#a83222fdbf81a7c6d560e0841cdfca8c6',1,'pocketfft::detail::pocketfft_r::length()'],['../classpocketfft_1_1detail_1_1_t__dct1.html#ac7a04c91d507bd8f173d2266bb5bb168',1,'pocketfft::detail::T_dct1::length()'],['../classpocketfft_1_1detail_1_1_t__dst1.html#ab205d901650e38b592ff860b7978fa3e',1,'pocketfft::detail::T_dst1::length()'],['../classpocketfft_1_1detail_1_1_t__dcst23.html#a6dab012b487ff98d11b8a9418653a478',1,'pocketfft::detail::T_dcst23::length()'],['../classpocketfft_1_1detail_1_1_t__dcst4.html#af25bf28a7ccd4690ca9934e3aa79c12f',1,'pocketfft::detail::T_dcst4::length()']]],
+  ['length_5fin_18',['length_in',['../classpocketfft_1_1detail_1_1multi__iter.html#a5318b79d934cddf109dff7bf96a330c8',1,'pocketfft::detail::multi_iter']]],
+  ['length_5fout_19',['length_out',['../classpocketfft_1_1detail_1_1multi__iter.html#a93cd515d07cd479138a35da9df66bd41',1,'pocketfft::detail::multi_iter']]],
+  ['less_20',['Less',['../struct_less.html',1,'Less'],['../structmlx_1_1core_1_1detail_1_1_less.html',1,'mlx::core::detail::Less'],['../classmlx_1_1core_1_1_less.html',1,'mlx::core::Less'],['../classmlx_1_1core_1_1_less.html#aa55c5cfbab0ac30e1b72c080fe9525d7',1,'mlx::core::Less::Less()']]],
+  ['less_21',['less',['../group__ops.html#ga9142b8d717699a8abfa2a7398891ff8a',1,'mlx::core']]],
+  ['less_5fequal_22',['less_equal',['../group__ops.html#ga0d49e0c7011d0573c369c13c8f045a09',1,'mlx::core']]],
+  ['lessequal_23',['LessEqual',['../struct_less_equal.html',1,'LessEqual'],['../structmlx_1_1core_1_1detail_1_1_less_equal.html',1,'mlx::core::detail::LessEqual'],['../classmlx_1_1core_1_1_less_equal.html',1,'mlx::core::LessEqual'],['../classmlx_1_1core_1_1_less_equal.html#a52492a43224d47e7851beec646c27bbc',1,'mlx::core::LessEqual::LessEqual()']]],
+  ['lessthan_24',['LessThan',['../struct_less_than.html',1,'']]],
+  ['lib_5fname_25',['lib_name',['../classmlx_1_1core_1_1_compiled.html#ae5c16cb91ac31b97e7652cc526c07439',1,'mlx::core::Compiled']]],
+  ['limits_26',['Limits',['../struct_limits.html',1,'']]],
+  ['limits_3c_20bfloat16_5ft_20_3e_27',['Limits&lt; bfloat16_t &gt;',['../struct_limits_3_01bfloat16__t_01_4.html',1,'']]],
+  ['limits_3c_20bool_20_3e_28',['Limits&lt; bool &gt;',['../struct_limits_3_01bool_01_4.html',1,'']]],
+  ['limits_3c_20complex64_5ft_20_3e_29',['Limits&lt; complex64_t &gt;',['../struct_limits_3_01complex64__t_01_4.html',1,'']]],
+  ['limits_3c_20float_20_3e_30',['Limits&lt; float &gt;',['../struct_limits_3_01float_01_4.html',1,'']]],
+  ['limits_3c_20half_20_3e_31',['Limits&lt; half &gt;',['../struct_limits_3_01half_01_4.html',1,'']]],
+  ['limits_3c_20int16_5ft_20_3e_32',['Limits&lt; int16_t &gt;',['../struct_limits_3_01int16__t_01_4.html',1,'']]],
+  ['limits_3c_20int32_5ft_20_3e_33',['Limits&lt; int32_t &gt;',['../struct_limits_3_01int32__t_01_4.html',1,'']]],
+  ['limits_3c_20int64_5ft_20_3e_34',['Limits&lt; int64_t &gt;',['../struct_limits_3_01int64__t_01_4.html',1,'']]],
+  ['limits_3c_20int8_5ft_20_3e_35',['Limits&lt; int8_t &gt;',['../struct_limits_3_01int8__t_01_4.html',1,'']]],
+  ['limits_3c_20uint16_5ft_20_3e_36',['Limits&lt; uint16_t &gt;',['../struct_limits_3_01uint16__t_01_4.html',1,'']]],
+  ['limits_3c_20uint32_5ft_20_3e_37',['Limits&lt; uint32_t &gt;',['../struct_limits_3_01uint32__t_01_4.html',1,'']]],
+  ['limits_3c_20uint64_5ft_20_3e_38',['Limits&lt; uint64_t &gt;',['../struct_limits_3_01uint64__t_01_4.html',1,'']]],
+  ['limits_3c_20uint8_5ft_20_3e_39',['Limits&lt; uint8_t &gt;',['../struct_limits_3_01uint8__t_01_4.html',1,'']]],
+  ['linalg_2eh_40',['linalg.h',['../linalg_8h.html',1,'']]],
+  ['linspace_41',['linspace',['../group__ops.html#ga968bcabed902311dcfbd903b0fb886ec',1,'mlx::core']]],
+  ['load_42',['Load',['../classmlx_1_1core_1_1_load.html',1,'mlx::core::Load'],['../classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a',1,'mlx::core::Load::Load()']]],
+  ['load_43',['load',['../struct_read_writer.html#a120eaf4b5f32e80972a18d14e82a2d75',1,'ReadWriter::load()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96',1,'mlx::steel::MMATile::load(const threadgroup U *src)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9',1,'mlx::steel::MMATile::load(const device U *src, const int ld)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96',1,'mlx::steel::MMATile::load(const threadgroup U *src)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9',1,'mlx::steel::MMATile::load(const device U *src, const int ld)'],['../struct_read_writer.html#a8a97ba42db5692898ef7391db08d8fd0',1,'ReadWriter::load() const'],['../struct_read_writer.html#a2506ee61be67826ac9494efb12a81900',1,'ReadWriter::load() const'],['../namespacemlx_1_1core.html#a954de19249da7c1fa39b89bdc47368aa',1,'mlx::core::load(array &amp;out, size_t offset, const std::shared_ptr&lt; io::Reader &gt; &amp;reader, bool swap_endianess)'],['../namespacemlx_1_1core.html#abada9bfa834d7423959362386720f3db',1,'mlx::core::load(std::shared_ptr&lt; io::Reader &gt; in_stream, StreamOrDevice s={})'],['../namespacemlx_1_1core.html#ac71a08bf4c052ae3c77e9e89cbea071d',1,'mlx::core::load(std::string file, StreamOrDevice s={})']]],
+  ['load_2eh_44',['load.h',['../backend_2common_2load_8h.html',1,'(Global Namespace)'],['../io_2load_8h.html',1,'(Global Namespace)']]],
+  ['load_5fgguf_45',['load_gguf',['../namespacemlx_1_1core.html#a2aa12b351ce559deb14cda0a5292c2ce',1,'mlx::core']]],
+  ['load_5fpadded_46',['load_padded',['../struct_read_writer.html#add5bd3f647793a5a19d63197a19df73c',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#af3ce6bbb1a8dfb3bab1ae18d3eb45bc0',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#ab116f4569bb9dc6eaef0d8d08472e239',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const']]],
+  ['load_5fsafe_47',['load_safe',['../struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7',1,'GEMVKernel::load_safe()'],['../struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b',1,'QuantizedBlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d',1,'mlx::steel::BlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d',1,'mlx::steel::BlockLoaderT::load_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da',1,'mlx::steel::MMATile::load_safe()'],['../structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d',1,'mlx::steel::BlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da',1,'mlx::steel::MMATile::load_safe()'],['../scan_8h.html#ae8eb101e538b85f8a4bcf451489ae0ac',1,'load_safe():&#160;scan.h']]],
+  ['load_5fsafetensors_48',['load_safetensors',['../namespacemlx_1_1core.html#a96cc40e1af8c4626c813ce4859f70a5c',1,'mlx::core::load_safetensors(std::shared_ptr&lt; io::Reader &gt; in_stream, StreamOrDevice s={})'],['../namespacemlx_1_1core.html#af7eea1682a38d363c56a066321e6d526',1,'mlx::core::load_safetensors(const std::string &amp;file, StreamOrDevice s={})']]],
+  ['load_5fstrided_49',['load_strided',['../struct_read_writer.html#a998ef484bade81f726b9edfc6b878197',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a3d9c8cbc582cad6b5218339d0f721559',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a795a71a8e1f154a5af415ebe1b3f0713',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a0935b946b8bf2e769427fcbf2da2f7be',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a7d45368c74a8b7c632659504b3273a13',1,'ReadWriter::load_strided(int stride, int overall_n)']]],
+  ['load_5funsafe_50',['load_unsafe',['../struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce',1,'GEMVKernel::load_unsafe()'],['../struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc',1,'QuantizedBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27',1,'mlx::steel::BlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38',1,'mlx::steel::BlockLoaderT::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55',1,'mlx::steel::Conv2DWeightBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27',1,'mlx::steel::BlockLoader::load_unsafe()'],['../scan_8h.html#a9c415d07921f3961bad0a00a34f4a9a3',1,'load_unsafe(U values[N_READS], const device T *input):&#160;scan.h']]],
+  ['load_5fvector_51',['load_vector',['../quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9',1,'quantized.h']]],
+  ['load_5fvector_5fsafe_52',['load_vector_safe',['../quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7',1,'quantized.h']]],
+  ['loader_2eh_53',['loader.h',['../attn_2loader_8h.html',1,'(Global Namespace)'],['../conv_2loader_8h.html',1,'(Global Namespace)'],['../gemm_2loader_8h.html',1,'(Global Namespace)']]],
+  ['loader_5fa_5ft_54',['loader_a_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a98b6ec692580510081e2aa887a61944b',1,'mlx::steel::GEMMKernel']]],
+  ['loader_5fb_5ft_55',['loader_b_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1a115d5af0fb6e260165adba2e377635',1,'mlx::steel::GEMMKernel']]],
+  ['loader_5fchannel_5fl_2eh_56',['loader_channel_l.h',['../loader__channel__l_8h.html',1,'']]],
+  ['loader_5fchannel_5fn_2eh_57',['loader_channel_n.h',['../loader__channel__n_8h.html',1,'']]],
+  ['loader_5fgeneral_2eh_58',['loader_general.h',['../loader__general_8h.html',1,'']]],
+  ['loc_59',['loc',['../structmlx_1_1core_1_1_contiguous_iterator.html#a027b29e06d5cb467d961c019699514b1',1,'mlx::core::ContiguousIterator']]],
+  ['location_60',['location',['../struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e',1,'LoopedElemToLoc::location()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::location()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::location()']]],
+  ['log_61',['Log',['../struct_log.html',1,'Log'],['../structmlx_1_1core_1_1detail_1_1_log.html',1,'mlx::core::detail::Log'],['../classmlx_1_1core_1_1_log.html',1,'mlx::core::Log'],['../classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9',1,'mlx::core::Log::Log()']]],
+  ['log_62',['log',['../namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6',1,'metal::log()'],['../namespacemetal_1_1fast.html#aef942e7f9e5c2e58c58644ab1bdd58d1',1,'metal::fast::log()'],['../namespacemetal_1_1precise.html#a341c2b8c27d1bed860f85f8b355023d4',1,'metal::precise::log()'],['../group__ops.html#ga6fb22d4926133573e430fcc92f4eef31',1,'mlx::core::log()']]],
+  ['log10_63',['Log10',['../struct_log10.html',1,'Log10'],['../structmlx_1_1core_1_1detail_1_1_log10.html',1,'mlx::core::detail::Log10']]],
+  ['log10_64',['log10',['../namespacemetal.html#a042b98827baa910e9d726227cec55a80',1,'metal::log10()'],['../namespacemetal_1_1fast.html#a0d1150cf2deee5100a7ea2988b3bb39e',1,'metal::fast::log10()'],['../namespacemetal_1_1precise.html#a44239067e8e9248b1574353f98e94d72',1,'metal::precise::log10()'],['../group__ops.html#ga1fdcc7fc8819caf2e6f1c327ed4e9b9e',1,'mlx::core::log10()']]],
+  ['log1p_65',['Log1p',['../struct_log1p.html',1,'Log1p'],['../structmlx_1_1core_1_1detail_1_1_log1p.html',1,'mlx::core::detail::Log1p'],['../classmlx_1_1core_1_1_log1p.html',1,'mlx::core::Log1p'],['../classmlx_1_1core_1_1_log1p.html#ab0d6eb90c6f98775fce56f3446ff127a',1,'mlx::core::Log1p::Log1p()']]],
+  ['log1p_66',['log1p',['../backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a',1,'log1p(float x):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a3501b665c8837eabf9789ea27a7d6946',1,'log1p(bfloat16_t x):&#160;utils.h'],['../group__ops.html#ga20a1f4270c35b0fa544f5105a87a1604',1,'mlx::core::log1p()']]],
+  ['log2_67',['Log2',['../struct_log2.html',1,'Log2'],['../structmlx_1_1core_1_1detail_1_1_log2.html',1,'mlx::core::detail::Log2']]],
+  ['log2_68',['log2',['../namespacemetal.html#ae894dd5fc13799f120b55cab6267c89c',1,'metal::log2()'],['../namespacemetal_1_1fast.html#a986ef245dd433ae62af864f5cbb07118',1,'metal::fast::log2()'],['../namespacemetal_1_1precise.html#a632dbbdcc1a465cf4739a14306147573',1,'metal::precise::log2()'],['../group__ops.html#ga144228d7222d15af3a135b8b0f3fa21b',1,'mlx::core::log2()']]],
+  ['logaddexp_69',['LogAddExp',['../struct_log_add_exp.html',1,'LogAddExp'],['../structmlx_1_1core_1_1detail_1_1_log_add_exp.html',1,'mlx::core::detail::LogAddExp'],['../classmlx_1_1core_1_1_log_add_exp.html',1,'mlx::core::LogAddExp'],['../classmlx_1_1core_1_1_log_add_exp.html#ad8938ca90ccf1a3259973fc68902975a',1,'mlx::core::LogAddExp::LogAddExp()']]],
+  ['logaddexp_70',['logaddexp',['../group__ops.html#gaf985df6609c6bd75a14a844655d89eaa',1,'mlx::core']]],
+  ['logical_5fand_71',['logical_and',['../group__ops.html#ga768977cda8d68cf23f464a6af9907876',1,'mlx::core']]],
+  ['logical_5fnot_72',['logical_not',['../group__ops.html#gabca78d34ce93f0de2814e62225bb2a53',1,'mlx::core']]],
+  ['logical_5for_73',['logical_or',['../group__ops.html#ga23768728e4dd070c917fbb0ed0d0c2ec',1,'mlx::core']]],
+  ['logicaland_74',['LogicalAnd',['../struct_logical_and.html',1,'LogicalAnd'],['../structmlx_1_1core_1_1detail_1_1_logical_and.html',1,'mlx::core::detail::LogicalAnd'],['../classmlx_1_1core_1_1_logical_and.html',1,'mlx::core::LogicalAnd'],['../classmlx_1_1core_1_1_logical_and.html#aaf2cab8ffcf6606b8babfef60fc06fb3',1,'mlx::core::LogicalAnd::LogicalAnd()']]],
+  ['logicalnot_75',['LogicalNot',['../struct_logical_not.html',1,'LogicalNot'],['../structmlx_1_1core_1_1detail_1_1_logical_not.html',1,'mlx::core::detail::LogicalNot'],['../classmlx_1_1core_1_1_logical_not.html',1,'mlx::core::LogicalNot'],['../classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7',1,'mlx::core::LogicalNot::LogicalNot()']]],
+  ['logicalor_76',['LogicalOr',['../struct_logical_or.html',1,'LogicalOr'],['../structmlx_1_1core_1_1detail_1_1_logical_or.html',1,'mlx::core::detail::LogicalOr'],['../classmlx_1_1core_1_1_logical_or.html',1,'mlx::core::LogicalOr'],['../classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918',1,'mlx::core::LogicalOr::LogicalOr()']]],
+  ['logsumexp_77',['logsumexp',['../group__ops.html#gacff4eb57c085d571e722083680267ac5',1,'mlx::core::logsumexp(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga59be50b4e92f1dc20b53460cefa3910d',1,'mlx::core::logsumexp(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gae3969c7bd24c4f3ab97831df28239689',1,'mlx::core::logsumexp(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gafef5cb2159c16a60a95470cc823bdd44',1,'mlx::core::logsumexp(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['loopalignment_78',['LoopAlignment',['../structmlx_1_1steel_1_1_loop_alignment.html',1,'mlx::steel']]],
+  ['loopedelemtoloc_79',['LoopedElemToLoc',['../struct_looped_elem_to_loc.html',1,'LoopedElemToLoc&lt; DIM, OffsetT, General &gt;'],['../struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b',1,'LoopedElemToLoc::LoopedElemToLoc()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::LoopedElemToLoc()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::LoopedElemToLoc()']]],
+  ['loopedelemtoloc_3c_201_2c_20offsett_2c_20false_20_3e_80',['LoopedElemToLoc&lt; 1, OffsetT, false &gt;',['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html',1,'']]],
+  ['loopedelemtoloc_3c_201_2c_20offsett_2c_20true_20_3e_81',['LoopedElemToLoc&lt; 1, OffsetT, true &gt;',['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html',1,'']]],
+  ['lowest_82',['lowest',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]]
 ];
diff --git a/docs/build/html/search/all_d.js b/docs/build/html/search/all_d.js
index b1b018f5f..63018b94f 100644
--- a/docs/build/html/search/all_d.js
+++ b/docs/build/html/search/all_d.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['m_0',['M',['../struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8',1,'MLXFastAttentionParams::M'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db',1,'mlx::steel::ImplicitGemmConv2DParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694',1,'mlx::steel::GEMMParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e',1,'mlx::steel::GEMMSpiltKParams::M']]],
+  ['m_0',['M',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db',1,'mlx::steel::ImplicitGemmConv2DParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694',1,'mlx::steel::GEMMParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e',1,'mlx::steel::GEMMSpiltKParams::M']]],
   ['make_5farrays_1',['make_arrays',['../classmlx_1_1core_1_1array.html#a1173db4e23f5a8230911cb8fba45d5e6',1,'mlx::core::array']]],
   ['make_5fcontiguous_5fstrides_2',['make_contiguous_strides',['../namespacemlx_1_1core.html#a085379297e21d57f5b3aa38ae1c26070',1,'mlx::core']]],
   ['make_5fstring_3',['make_string',['../namespacemlx_1_1core.html#aed148d95e7b5221f1312473deded0d27',1,'mlx::core']]],
@@ -12,8 +12,8 @@ var searchData=
   ['mask_5fh_9',['mask_h',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0b892c1a7edb9ed20c076d8945855c19',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
   ['mask_5ft_10',['mask_t',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a270ab3da7c98a12525a59952742cc97d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
   ['mask_5fw_11',['mask_w',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a19ddba7259c3c2c02ed90f3f635557be',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
-  ['mat_5fat_12',['mat_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e',1,'mlx::steel::MMATile']]],
-  ['mat_5ftype_13',['mat_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mat_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190',1,'mlx::steel::MMATile::mat_type']]],
+  ['mat_5fat_12',['mat_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e',1,'mlx::steel::MMATile::mat_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e',1,'mlx::steel::MMATile::mat_at(const short i, const short j)']]],
+  ['mat_5ftype_13',['mat_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mat_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a',1,'mlx::steel::MMATile::mat_type']]],
   ['matmul_14',['Matmul',['../classmlx_1_1core_1_1_matmul.html',1,'mlx::core::Matmul'],['../classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7',1,'mlx::core::Matmul::Matmul()']]],
   ['matmul_15',['matmul',['../group__ops.html#ga753d59f5a9f5f2362865ee83b4dced2a',1,'mlx::core']]],
   ['matmul_2eh_16',['matmul.h',['../matmul_8h.html',1,'']]],
@@ -23,91 +23,94 @@ var searchData=
   ['max_5fdigits10_20',['max_digits10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a8d3905e6f158379a0c52682266e8d0e2',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
   ['max_5fexponent_21',['max_exponent',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a61bb136f819fa392c50bdf3c38f3aad2',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
   ['max_5fexponent10_22',['max_exponent10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a76bfb2deb0e0afc011f77bf5a6d0ed94',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['max_5foutput_5fsize_23',['MAX_OUTPUT_SIZE',['../backend_2metal_2kernels_2fft_8h.html#a28d683cf067736d76f867f30c066317e',1,'fft.h']]],
-  ['max_5fradix_24',['MAX_RADIX',['../backend_2metal_2kernels_2fft_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;fft.h'],['../readwrite_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;readwrite.h']]],
-  ['max_5freduce_5fspecialized_5fdims_25',['MAX_REDUCE_SPECIALIZED_DIMS',['../defines_8h.html#a15629f1b81a2b6f1cca26d07a2734623',1,'defines.h']]],
-  ['max_5fthreads_26',['max_threads',['../namespacepocketfft_1_1detail_1_1threading.html#a2d5c0729f0b66cf061918baea4337d70',1,'pocketfft::detail::threading']]],
-  ['maximum_27',['Maximum',['../struct_maximum.html',1,'Maximum'],['../structmlx_1_1core_1_1detail_1_1_maximum.html',1,'mlx::core::detail::Maximum'],['../classmlx_1_1core_1_1_maximum.html',1,'mlx::core::Maximum'],['../classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816',1,'mlx::core::Maximum::Maximum()']]],
-  ['maximum_28',['maximum',['../group__ops.html#ga7ade2ea305e2e4219c3609443fb5db8d',1,'mlx::core']]],
-  ['maybeinsertbarrier_29',['maybeInsertBarrier',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991',1,'mlx::core::metal::CommandEncoder']]],
-  ['mb_5fblock_5fmerge_30',['mb_block_merge',['../sort_8h.html#ab381cd57f344bc7304ab580bfdc78807',1,'sort.h']]],
-  ['mb_5fblock_5fpartition_31',['mb_block_partition',['../sort_8h.html#a32cbe4163b8b0f5cb2c97b256119a4b2',1,'sort.h']]],
-  ['mb_5fblock_5fsort_32',['mb_block_sort',['../sort_8h.html#aa48ff1aff1e9dc1301b6781aa0721d6b',1,'sort.h']]],
-  ['mean_33',['mean',['../group__ops.html#gade46e768fd46b8b640eb16f26abeecef',1,'mlx::core::mean(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga52b59fdd8e8430538e564f5bbcfa31e6',1,'mlx::core::mean(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga066161f3d3e395a1d76c638cb680d444',1,'mlx::core::mean(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga45fba73eab0e3b6e128ed3ce2f43a5da',1,'mlx::core::mean(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['median3_34',['median3',['../namespacemetal.html#aa3ff49457ce3c93fc1c0897fd1525157',1,'metal::median3()'],['../namespacemetal_1_1fast.html#a742b55f1e4369921ee7f60d70185bfbc',1,'metal::fast::median3()'],['../namespacemetal_1_1precise.html#a14555ff99c4388493fec48e070144ae2',1,'metal::precise::median3()']]],
-  ['merge_5fpartition_35',['merge_partition',['../struct_block_merge_sort.html#ab2300cbecb23f3433bad888924c831ca',1,'BlockMergeSort::merge_partition()'],['../struct_kernel_multi_block_merge_sort.html#ab15895b4233aba0e279cc44a07a201fe',1,'KernelMultiBlockMergeSort::merge_partition()']]],
-  ['merge_5fstep_36',['merge_step',['../struct_block_merge_sort.html#ab65f190edf1851b37c39ad49ce99a43c',1,'BlockMergeSort']]],
-  ['meshgrid_37',['meshgrid',['../group__ops.html#ga577c911618575314de63d1060656a26e',1,'mlx::core']]],
-  ['metal_38',['metal',['../namespacemetal.html',1,'']]],
-  ['metal_2eh_39',['metal.h',['../metal_8h.html',1,'']]],
-  ['metal_3a_3afast_40',['fast',['../namespacemetal_1_1fast.html',1,'metal']]],
-  ['metal_3a_3aprecise_41',['precise',['../namespacemetal_1_1precise.html',1,'metal']]],
-  ['metal_5fimpl_2eh_42',['metal_impl.h',['../metal__impl_8h.html',1,'']]],
-  ['metal_5fkernel_43',['metal_kernel',['../namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e',1,'mlx::core::fast']]],
-  ['metalallocator_44',['MetalAllocator',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html',1,'mlx::core::metal']]],
-  ['metalkernelfunction_45',['MetalKernelFunction',['../namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0',1,'mlx::core::fast']]],
-  ['min_46',['Min',['../struct_min.html',1,'Min&lt; U &gt;'],['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924a4f685dcd48e6614d6bb2ccda4f2686ef',1,'mlx::core::distributed::AllReduce::Min'],['../classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a0d3d1f5c94725bdc42fa692e2c074418',1,'mlx::core::Reduce::Min'],['../classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1a7d2ee8f14f2e70a9d47170fecc6da898',1,'mlx::core::Scan::Min'],['../classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613cad914e4c3475ce9858f2de4bf35dcfdbf',1,'mlx::core::Scatter::Min']]],
-  ['min_47',['min',['../struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e',1,'Limits::min'],['../struct_limits_3_01uint8__t_01_4.html#a408bd5a337e7292f06e63da81193629a',1,'Limits&lt; uint8_t &gt;::min'],['../struct_limits_3_01uint16__t_01_4.html#ae173984c3be8b6750f27daed581805fe',1,'Limits&lt; uint16_t &gt;::min'],['../struct_limits_3_01uint32__t_01_4.html#ab0c3975e02053b234c7b606ababa66e1',1,'Limits&lt; uint32_t &gt;::min'],['../struct_limits_3_01uint64__t_01_4.html#a80627f39e951398283942cefa48f4dd0',1,'Limits&lt; uint64_t &gt;::min'],['../struct_limits_3_01int8__t_01_4.html#a7a809307d2bba80382f0645d277eaa4b',1,'Limits&lt; int8_t &gt;::min'],['../struct_limits_3_01int16__t_01_4.html#adca7139647801e223c35b0abc7da5240',1,'Limits&lt; int16_t &gt;::min'],['../struct_limits_3_01int32__t_01_4.html#af336a1b22a8ed6a83a4cfb5bf8869771',1,'Limits&lt; int32_t &gt;::min'],['../struct_limits_3_01int64__t_01_4.html#a1c90fb96af515badaccaa835b08f7428',1,'Limits&lt; int64_t &gt;::min'],['../struct_limits_3_01half_01_4.html#aca7b036c257878bf1b80912fb5d4516d',1,'Limits&lt; half &gt;::min'],['../struct_limits_3_01float_01_4.html#a3225e334d372ee86128c89a440d8648f',1,'Limits&lt; float &gt;::min'],['../struct_limits_3_01bfloat16__t_01_4.html#a2fd1811b9f615b2b897904bc27d1cb49',1,'Limits&lt; bfloat16_t &gt;::min'],['../struct_limits_3_01bool_01_4.html#a139f787b57536d455490b8ef801d37cc',1,'Limits&lt; bool &gt;::min'],['../struct_limits_3_01complex64__t_01_4.html#aa67b04aa7abcd67f7af0808737ab8e14',1,'Limits&lt; complex64_t &gt;::min'],['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f',1,'metal::_numeric_limits_impl&lt; bfloat16_t 
&gt;::min()'],['../namespacemetal.html#a6653b28c9473087141eddce39878d4d3',1,'metal::min()'],['../namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61',1,'metal::fast::min()'],['../namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e',1,'metal::precise::min()'],['../group__ops.html#gab27599802617a4c8f9964ab5f4ffee12',1,'mlx::core::min(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga0140b91e9cdfc3fef0da8e332f65a9e8',1,'mlx::core::min(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga6efb83cd46436678c8f8c4af15cc00f5',1,'mlx::core::min(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga36fa315eef677f4143868f552cd26d03',1,'mlx::core::min(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['min3_48',['min3',['../namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f',1,'metal::min3()'],['../namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f',1,'metal::fast::min3()'],['../namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231',1,'metal::precise::min3()']]],
-  ['min_5fexponent_49',['min_exponent',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a13829f8c7a7c0efdc8946eff5d3c9470',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['min_5fexponent10_50',['min_exponent10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aeaed172780720e06b8731cef3177e277',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['minimum_51',['Minimum',['../struct_minimum.html',1,'Minimum'],['../structmlx_1_1core_1_1detail_1_1_minimum.html',1,'mlx::core::detail::Minimum'],['../classmlx_1_1core_1_1_minimum.html',1,'mlx::core::Minimum'],['../classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5',1,'mlx::core::Minimum::Minimum()']]],
-  ['minimum_52',['minimum',['../group__ops.html#ga49ba00c090f81f331c91b0c97040bce0',1,'mlx::core']]],
-  ['mlx_53',['mlx',['../namespacemlx.html',1,'']]],
-  ['mlx_2eh_54',['mlx.h',['../mlx_8h.html',1,'']]],
-  ['mlx_3a_3acore_55',['core',['../namespacemlx_1_1core.html',1,'mlx']]],
-  ['mlx_3a_3acore_3a_3aallocator_56',['allocator',['../namespacemlx_1_1core_1_1allocator.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3adetail_57',['detail',['../namespacemlx_1_1core_1_1detail.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3adistributed_58',['distributed',['../namespacemlx_1_1core_1_1distributed.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3adistributed_3a_3adetail_59',['detail',['../namespacemlx_1_1core_1_1distributed_1_1detail.html',1,'mlx::core::distributed']]],
-  ['mlx_3a_3acore_3a_3afast_60',['fast',['../namespacemlx_1_1core_1_1fast.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3afft_61',['fft',['../namespacemlx_1_1core_1_1fft.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3aio_62',['io',['../namespacemlx_1_1core_1_1io.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3alinalg_63',['linalg',['../namespacemlx_1_1core_1_1linalg.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3ametal_64',['metal',['../namespacemlx_1_1core_1_1metal.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3arandom_65',['random',['../namespacemlx_1_1core_1_1random.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3ascheduler_66',['scheduler',['../namespacemlx_1_1core_1_1scheduler.html',1,'mlx::core']]],
-  ['mlx_3a_3asteel_67',['steel',['../namespacemlx_1_1steel.html',1,'mlx']]],
-  ['mlx_5fatomic_68',['mlx_atomic',['../structmlx__atomic.html',1,'']]],
-  ['mlx_5fatomic_3c_20t_2c_20enable_5fif_5ft_3c_20is_5fmetal_5fatomic_3c_20t_20_3e_20_3e_20_3e_69',['mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;',['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html',1,'']]],
-  ['mlx_5fatomic_5fcompare_5fexchange_5fweak_5fexplicit_70',['mlx_atomic_compare_exchange_weak_explicit',['../atomic_8h.html#ad7f32327ff66354cfa2f0cfdac79316f',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread T *expected, T val, size_t offset):&#160;atomic.h'],['../atomic_8h.html#aa8f47b2e9b95d4b00ad51f08b070deb5',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread uint *expected, uint val, size_t offset):&#160;atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fadd_5fexplicit_71',['mlx_atomic_fetch_add_explicit',['../atomic_8h.html#aad448d9e06e001700b65ca8317216a3b',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fand_5fexplicit_72',['mlx_atomic_fetch_and_explicit',['../atomic_8h.html#a253e3c870c0ddc7c28ab2f6ca2c3eae5',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_73',['mlx_atomic_fetch_max_explicit',['../atomic_8h.html#ac480f2b459a8ad9095cee353e152d00c',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_3c_20float_20_3e_74',['mlx_atomic_fetch_max_explicit&lt; float &gt;',['../atomic_8h.html#a1dce2abfa16417122c4d2bf261129ae4',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_75',['mlx_atomic_fetch_min_explicit',['../atomic_8h.html#a2ec33dca0039bd944d73d1c2b378cc19',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_3c_20float_20_3e_76',['mlx_atomic_fetch_min_explicit&lt; float &gt;',['../atomic_8h.html#ab7d1dc49f319f239b7ee0b7c72976dd0',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmul_5fexplicit_77',['mlx_atomic_fetch_mul_explicit',['../atomic_8h.html#adfdbea60436f14f1af9ce36e2a0a77a3',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5for_5fexplicit_78',['mlx_atomic_fetch_or_explicit',['../atomic_8h.html#ab7391f197001471e4788312bdb6ab37a',1,'atomic.h']]],
-  ['mlx_5fatomic_5fload_5fexplicit_79',['mlx_atomic_load_explicit',['../atomic_8h.html#a253a4e8c2c5768a069e2791b627dfc99',1,'atomic.h']]],
-  ['mlx_5fatomic_5fstore_5fexplicit_80',['mlx_atomic_store_explicit',['../atomic_8h.html#a0ae453140b0819a4c02f265334de98c0',1,'atomic.h']]],
-  ['mlx_5flapack_5ffunc_81',['MLX_LAPACK_FUNC',['../lapack_8h.html#ae22db9704827bf013a0a61f21a47464b',1,'lapack.h']]],
-  ['mlx_5fmtl_5fconst_82',['MLX_MTL_CONST',['../kernels_2gemv__masked_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;gemv_masked.h'],['../quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;quantized.h'],['../sort_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;sort.h']]],
-  ['mlx_5fmtl_5floop_5funroll_83',['MLX_MTL_LOOP_UNROLL',['../sort_8h.html#ad34b622323cebef136669fedd7229515',1,'sort.h']]],
-  ['mlx_5fmtl_5fpragma_5funroll_84',['MLX_MTL_PRAGMA_UNROLL',['../kernels_2gemv__masked_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;gemv_masked.h'],['../backend_2metal_2kernels_2utils_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;utils.h']]],
-  ['mlxconvparams_85',['MLXConvParams',['../struct_m_l_x_conv_params.html',1,'']]],
-  ['mlxconvparams_3c_202_20_3e_86',['MLXConvParams&lt; 2 &gt;',['../struct_m_l_x_conv_params.html',1,'']]],
-  ['mlxfastattentionparams_87',['MLXFastAttentionParams',['../struct_m_l_x_fast_attention_params.html',1,'']]],
-  ['mlxscaleddotproductattentionparams_88',['MLXScaledDotProductAttentionParams',['../struct_m_l_x_scaled_dot_product_attention_params.html',1,'']]],
-  ['mma_89',['mma',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0',1,'mlx::steel::BlockMMA::mma()']]],
-  ['mma_2eh_90',['mma.h',['../mma_8h.html',1,'']]],
-  ['mma_5ft_91',['mma_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#add8c6a31011a4895667c2a94a5af3782',1,'mlx::steel::GEMMKernel']]],
-  ['mmafrag_5facc_5ft_92',['MMAFrag_acc_t',['../structmlx_1_1steel_1_1_block_m_m_a.html#ae2c42cb6d0dde785859164c195f4d13c',1,'mlx::steel::BlockMMA']]],
-  ['mmafrag_5ft_93',['MMAFrag_t',['../structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382',1,'mlx::steel::MMATile']]],
-  ['mmatile_94',['MMATile',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6',1,'mlx::steel::MMATile::MMATile()']]],
-  ['mmatile_3c_20float_2c_201_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_95',['MMATile&lt; float, 1, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
-  ['mmatile_3c_20float_2c_20tm_2c_201_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_96',['MMATile&lt; float, TM, 1, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
-  ['mmatile_3c_20float_2c_20tm_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_97',['MMATile&lt; float, TM, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
-  ['move_5fshared_5fbuffer_98',['move_shared_buffer',['../classmlx_1_1core_1_1array.html#acce00db63e0f3d80f797b02397ade836',1,'mlx::core::array::move_shared_buffer(array other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a38d7ad605f8282e5e49d0c09e0555c78',1,'mlx::core::array::move_shared_buffer(array other)']]],
-  ['moveaxis_99',['moveaxis',['../group__ops.html#ga24067d10a842db2c9d509ea48135a2c3',1,'mlx::core']]],
-  ['mpinplace_100',['MPINPLACE',['../namespacepocketfft_1_1detail.html#af5eedf3cdfc83c0a30807092c39a9ce2',1,'pocketfft::detail']]],
-  ['mtl_5fconst_101',['MTL_CONST',['../defines_8h.html#a767ed9f2604de22b259cee02c4ce1d22',1,'defines.h']]],
-  ['mtl_5fdevice_102',['mtl_device',['../classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653',1,'mlx::core::metal::Device']]],
-  ['mtl_5fresidency_5fset_103',['mtl_residency_set',['../classmlx_1_1core_1_1metal_1_1_residency_set.html#ac4bfe5ef5e2eaebc458a1ed1953d15e9',1,'mlx::core::metal::ResidencySet']]],
-  ['mtlfclist_104',['MTLFCList',['../namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54',1,'mlx::core::metal']]],
-  ['mtx_105',['mtx',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a70410c9e612f871663929f1e8441a976',1,'mlx::core::scheduler::StreamThread']]],
-  ['multi_5fiter_106',['multi_iter',['../classpocketfft_1_1detail_1_1multi__iter.html',1,'pocketfft::detail::multi_iter&lt; N &gt;'],['../classpocketfft_1_1detail_1_1multi__iter.html#a9be43bb18840202da6d17988fccc64b9',1,'pocketfft::detail::multi_iter::multi_iter()']]],
-  ['multiply_107',['Multiply',['../structmlx_1_1core_1_1detail_1_1_multiply.html',1,'mlx::core::detail::Multiply'],['../classmlx_1_1core_1_1_multiply.html',1,'mlx::core::Multiply'],['../struct_multiply.html',1,'Multiply'],['../classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c',1,'mlx::core::Multiply::Multiply()']]],
-  ['multiply_108',['multiply',['../group__ops.html#gaf57392e641640b5d06e4c99518391c38',1,'mlx::core']]],
-  ['multivariate_5fnormal_109',['multivariate_normal',['../namespacemlx_1_1core_1_1random.html#a8c37da3c1c0c561cad7499d6d9db81fb',1,'mlx::core::random']]]
+  ['max_5fops_5fper_5fbuffer_23',['max_ops_per_buffer',['../namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa',1,'mlx::core::env']]],
+  ['max_5foutput_5fsize_24',['MAX_OUTPUT_SIZE',['../backend_2metal_2kernels_2fft_8h.html#a28d683cf067736d76f867f30c066317e',1,'fft.h']]],
+  ['max_5fradix_25',['MAX_RADIX',['../backend_2metal_2kernels_2fft_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;fft.h'],['../readwrite_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;readwrite.h']]],
+  ['max_5freduce_5fspecialized_5fdims_26',['MAX_REDUCE_SPECIALIZED_DIMS',['../defines_8h.html#a15629f1b81a2b6f1cca26d07a2734623',1,'defines.h']]],
+  ['max_5fthreads_27',['max_threads',['../namespacepocketfft_1_1detail_1_1threading.html#a2d5c0729f0b66cf061918baea4337d70',1,'pocketfft::detail::threading']]],
+  ['maximum_28',['Maximum',['../struct_maximum.html',1,'Maximum'],['../structmlx_1_1core_1_1detail_1_1_maximum.html',1,'mlx::core::detail::Maximum'],['../classmlx_1_1core_1_1_maximum.html',1,'mlx::core::Maximum'],['../classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816',1,'mlx::core::Maximum::Maximum()']]],
+  ['maximum_29',['maximum',['../group__ops.html#ga7ade2ea305e2e4219c3609443fb5db8d',1,'mlx::core']]],
+  ['maxop_30',['MaxOp',['../struct_max_op.html',1,'']]],
+  ['maybeinsertbarrier_31',['maybeInsertBarrier',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991',1,'mlx::core::metal::CommandEncoder']]],
+  ['mb_5fblock_5fmerge_32',['mb_block_merge',['../sort_8h.html#ab381cd57f344bc7304ab580bfdc78807',1,'sort.h']]],
+  ['mb_5fblock_5fpartition_33',['mb_block_partition',['../sort_8h.html#a32cbe4163b8b0f5cb2c97b256119a4b2',1,'sort.h']]],
+  ['mb_5fblock_5fsort_34',['mb_block_sort',['../sort_8h.html#aa48ff1aff1e9dc1301b6781aa0721d6b',1,'sort.h']]],
+  ['mean_35',['mean',['../group__ops.html#gade46e768fd46b8b640eb16f26abeecef',1,'mlx::core::mean(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga52b59fdd8e8430538e564f5bbcfa31e6',1,'mlx::core::mean(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga066161f3d3e395a1d76c638cb680d444',1,'mlx::core::mean(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga45fba73eab0e3b6e128ed3ce2f43a5da',1,'mlx::core::mean(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['median3_36',['median3',['../namespacemetal.html#aa3ff49457ce3c93fc1c0897fd1525157',1,'metal::median3()'],['../namespacemetal_1_1fast.html#a742b55f1e4369921ee7f60d70185bfbc',1,'metal::fast::median3()'],['../namespacemetal_1_1precise.html#a14555ff99c4388493fec48e070144ae2',1,'metal::precise::median3()']]],
+  ['merge_5fpartition_37',['merge_partition',['../struct_block_merge_sort.html#ab2300cbecb23f3433bad888924c831ca',1,'BlockMergeSort::merge_partition()'],['../struct_kernel_multi_block_merge_sort.html#ab15895b4233aba0e279cc44a07a201fe',1,'KernelMultiBlockMergeSort::merge_partition()']]],
+  ['merge_5fstep_38',['merge_step',['../struct_block_merge_sort.html#ab65f190edf1851b37c39ad49ce99a43c',1,'BlockMergeSort']]],
+  ['meshgrid_39',['meshgrid',['../group__ops.html#ga577c911618575314de63d1060656a26e',1,'mlx::core']]],
+  ['metal_40',['metal',['../namespacemetal.html',1,'']]],
+  ['metal_2eh_41',['metal.h',['../metal_8h.html',1,'']]],
+  ['metal_3a_3afast_42',['fast',['../namespacemetal_1_1fast.html',1,'metal']]],
+  ['metal_3a_3aprecise_43',['precise',['../namespacemetal_1_1precise.html',1,'metal']]],
+  ['metal_5fimpl_2eh_44',['metal_impl.h',['../metal__impl_8h.html',1,'']]],
+  ['metal_5fkernel_45',['metal_kernel',['../namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e',1,'mlx::core::fast']]],
+  ['metalallocator_46',['MetalAllocator',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html',1,'mlx::core::metal']]],
+  ['metalkernelfunction_47',['MetalKernelFunction',['../namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0',1,'mlx::core::fast']]],
+  ['min_48',['Min',['../struct_min.html',1,'Min&lt; U &gt;'],['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abb4560980e5d01aed14175ce8f6fc924a4f685dcd48e6614d6bb2ccda4f2686ef',1,'mlx::core::distributed::AllReduce::Min'],['../classmlx_1_1core_1_1_reduce.html#a0848518b16ae6d4043d6be247bdf31c9a0d3d1f5c94725bdc42fa692e2c074418',1,'mlx::core::Reduce::Min'],['../classmlx_1_1core_1_1_scan.html#a47bf2ec54ead4b8f00f9f188518630f1a7d2ee8f14f2e70a9d47170fecc6da898',1,'mlx::core::Scan::Min'],['../classmlx_1_1core_1_1_scatter.html#a614d19af11dc30644b2b4941033b613cad914e4c3475ce9858f2de4bf35dcfdbf',1,'mlx::core::Scatter::Min']]],
+  ['min_49',['min',['../struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e',1,'Limits::min'],['../struct_limits_3_01uint8__t_01_4.html#a408bd5a337e7292f06e63da81193629a',1,'Limits&lt; uint8_t &gt;::min'],['../struct_limits_3_01uint16__t_01_4.html#ae173984c3be8b6750f27daed581805fe',1,'Limits&lt; uint16_t &gt;::min'],['../struct_limits_3_01uint32__t_01_4.html#ab0c3975e02053b234c7b606ababa66e1',1,'Limits&lt; uint32_t &gt;::min'],['../struct_limits_3_01uint64__t_01_4.html#a80627f39e951398283942cefa48f4dd0',1,'Limits&lt; uint64_t &gt;::min'],['../struct_limits_3_01int8__t_01_4.html#a7a809307d2bba80382f0645d277eaa4b',1,'Limits&lt; int8_t &gt;::min'],['../struct_limits_3_01int16__t_01_4.html#adca7139647801e223c35b0abc7da5240',1,'Limits&lt; int16_t &gt;::min'],['../struct_limits_3_01int32__t_01_4.html#af336a1b22a8ed6a83a4cfb5bf8869771',1,'Limits&lt; int32_t &gt;::min'],['../struct_limits_3_01int64__t_01_4.html#a1c90fb96af515badaccaa835b08f7428',1,'Limits&lt; int64_t &gt;::min'],['../struct_limits_3_01half_01_4.html#aca7b036c257878bf1b80912fb5d4516d',1,'Limits&lt; half &gt;::min'],['../struct_limits_3_01float_01_4.html#a3225e334d372ee86128c89a440d8648f',1,'Limits&lt; float &gt;::min'],['../struct_limits_3_01bfloat16__t_01_4.html#a2fd1811b9f615b2b897904bc27d1cb49',1,'Limits&lt; bfloat16_t &gt;::min'],['../struct_limits_3_01bool_01_4.html#a139f787b57536d455490b8ef801d37cc',1,'Limits&lt; bool &gt;::min'],['../struct_limits_3_01complex64__t_01_4.html#aa67b04aa7abcd67f7af0808737ab8e14',1,'Limits&lt; complex64_t &gt;::min'],['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f',1,'metal::_numeric_limits_impl&lt; bfloat16_t 
&gt;::min()'],['../namespacemetal.html#a6653b28c9473087141eddce39878d4d3',1,'metal::min()'],['../namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61',1,'metal::fast::min()'],['../namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e',1,'metal::precise::min()'],['../group__ops.html#gab27599802617a4c8f9964ab5f4ffee12',1,'mlx::core::min(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga0140b91e9cdfc3fef0da8e332f65a9e8',1,'mlx::core::min(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga6efb83cd46436678c8f8c4af15cc00f5',1,'mlx::core::min(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga36fa315eef677f4143868f552cd26d03',1,'mlx::core::min(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['min3_50',['min3',['../namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f',1,'metal::min3()'],['../namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f',1,'metal::fast::min3()'],['../namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231',1,'metal::precise::min3()']]],
+  ['min_5fexponent_51',['min_exponent',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a13829f8c7a7c0efdc8946eff5d3c9470',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['min_5fexponent10_52',['min_exponent10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aeaed172780720e06b8731cef3177e277',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['minimum_53',['Minimum',['../struct_minimum.html',1,'Minimum'],['../structmlx_1_1core_1_1detail_1_1_minimum.html',1,'mlx::core::detail::Minimum'],['../classmlx_1_1core_1_1_minimum.html',1,'mlx::core::Minimum'],['../classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5',1,'mlx::core::Minimum::Minimum()']]],
+  ['minimum_54',['minimum',['../group__ops.html#ga49ba00c090f81f331c91b0c97040bce0',1,'mlx::core']]],
+  ['mlx_55',['mlx',['../namespacemlx.html',1,'']]],
+  ['mlx_2eh_56',['mlx.h',['../mlx_8h.html',1,'']]],
+  ['mlx_3a_3acore_57',['core',['../namespacemlx_1_1core.html',1,'mlx']]],
+  ['mlx_3a_3acore_3a_3aallocator_58',['allocator',['../namespacemlx_1_1core_1_1allocator.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3adetail_59',['detail',['../namespacemlx_1_1core_1_1detail.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3adistributed_60',['distributed',['../namespacemlx_1_1core_1_1distributed.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3adistributed_3a_3adetail_61',['detail',['../namespacemlx_1_1core_1_1distributed_1_1detail.html',1,'mlx::core::distributed']]],
+  ['mlx_3a_3acore_3a_3aenv_62',['env',['../namespacemlx_1_1core_1_1env.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3afast_63',['fast',['../namespacemlx_1_1core_1_1fast.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3afft_64',['fft',['../namespacemlx_1_1core_1_1fft.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3aio_65',['io',['../namespacemlx_1_1core_1_1io.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3alinalg_66',['linalg',['../namespacemlx_1_1core_1_1linalg.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3ametal_67',['metal',['../namespacemlx_1_1core_1_1metal.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3arandom_68',['random',['../namespacemlx_1_1core_1_1random.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3ascheduler_69',['scheduler',['../namespacemlx_1_1core_1_1scheduler.html',1,'mlx::core']]],
+  ['mlx_3a_3asteel_70',['steel',['../namespacemlx_1_1steel.html',1,'mlx']]],
+  ['mlx_5fatomic_71',['mlx_atomic',['../structmlx__atomic.html',1,'']]],
+  ['mlx_5fatomic_3c_20t_2c_20enable_5fif_5ft_3c_20is_5fmetal_5fatomic_3c_20t_20_3e_20_3e_20_3e_72',['mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;',['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html',1,'']]],
+  ['mlx_5fatomic_5fcompare_5fexchange_5fweak_5fexplicit_73',['mlx_atomic_compare_exchange_weak_explicit',['../atomic_8h.html#ad7f32327ff66354cfa2f0cfdac79316f',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread T *expected, T val, size_t offset):&#160;atomic.h'],['../atomic_8h.html#aa8f47b2e9b95d4b00ad51f08b070deb5',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread uint *expected, uint val, size_t offset):&#160;atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fadd_5fexplicit_74',['mlx_atomic_fetch_add_explicit',['../atomic_8h.html#aad448d9e06e001700b65ca8317216a3b',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fand_5fexplicit_75',['mlx_atomic_fetch_and_explicit',['../atomic_8h.html#a253e3c870c0ddc7c28ab2f6ca2c3eae5',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_76',['mlx_atomic_fetch_max_explicit',['../atomic_8h.html#ac480f2b459a8ad9095cee353e152d00c',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_3c_20float_20_3e_77',['mlx_atomic_fetch_max_explicit&lt; float &gt;',['../atomic_8h.html#a1dce2abfa16417122c4d2bf261129ae4',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_78',['mlx_atomic_fetch_min_explicit',['../atomic_8h.html#a2ec33dca0039bd944d73d1c2b378cc19',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_3c_20float_20_3e_79',['mlx_atomic_fetch_min_explicit&lt; float &gt;',['../atomic_8h.html#ab7d1dc49f319f239b7ee0b7c72976dd0',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmul_5fexplicit_80',['mlx_atomic_fetch_mul_explicit',['../atomic_8h.html#adfdbea60436f14f1af9ce36e2a0a77a3',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5for_5fexplicit_81',['mlx_atomic_fetch_or_explicit',['../atomic_8h.html#ab7391f197001471e4788312bdb6ab37a',1,'atomic.h']]],
+  ['mlx_5fatomic_5fload_5fexplicit_82',['mlx_atomic_load_explicit',['../atomic_8h.html#a253a4e8c2c5768a069e2791b627dfc99',1,'atomic.h']]],
+  ['mlx_5fatomic_5fstore_5fexplicit_83',['mlx_atomic_store_explicit',['../atomic_8h.html#a0ae453140b0819a4c02f265334de98c0',1,'atomic.h']]],
+  ['mlx_5flapack_5ffunc_84',['MLX_LAPACK_FUNC',['../lapack_8h.html#ae22db9704827bf013a0a61f21a47464b',1,'lapack.h']]],
+  ['mlx_5fmtl_5fconst_85',['MLX_MTL_CONST',['../kernels_2gemv__masked_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;gemv_masked.h'],['../quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;quantized.h'],['../sort_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;sort.h']]],
+  ['mlx_5fmtl_5floop_5funroll_86',['MLX_MTL_LOOP_UNROLL',['../sort_8h.html#ad34b622323cebef136669fedd7229515',1,'sort.h']]],
+  ['mlx_5fmtl_5fpragma_5funroll_87',['MLX_MTL_PRAGMA_UNROLL',['../kernels_2gemv__masked_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;gemv_masked.h'],['../backend_2metal_2kernels_2utils_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;utils.h']]],
+  ['mlxconvparams_88',['MLXConvParams',['../struct_m_l_x_conv_params.html',1,'']]],
+  ['mlxconvparams_3c_202_20_3e_89',['MLXConvParams&lt; 2 &gt;',['../struct_m_l_x_conv_params.html',1,'']]],
+  ['mma_90',['mma',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0',1,'mlx::steel::BlockMMA::mma()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0',1,'mlx::steel::BlockMMA::mma()']]],
+  ['mma_2eh_91',['mma.h',['../attn_2mma_8h.html',1,'(Global Namespace)'],['../gemm_2mma_8h.html',1,'(Global Namespace)']]],
+  ['mma_5ft_92',['mma_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ae52eb09c9478cd4f199662346ac0c83e',1,'mlx::steel::GEMMKernel']]],
+  ['mmafrag_5facc_5ft_93',['MMAFrag_acc_t',['../structmlx_1_1steel_1_1_block_m_m_a.html#a8231b0e3475077c1381eb8f5daf62e35',1,'mlx::steel::BlockMMA']]],
+  ['mmafrag_5ft_94',['MMAFrag_t',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4',1,'mlx::steel::MMATile']]],
+  ['mmatile_95',['MMATile',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6',1,'mlx::steel::MMATile::MMATile() thread'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6',1,'mlx::steel::MMATile::MMATile() thread']]],
+  ['mmatile_3c_20float_2c_201_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_96',['MMATile&lt; float, 1, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['mmatile_3c_20float_2c_20tm_2c_201_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_97',['MMATile&lt; float, TM, 1, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['mmatile_3c_20float_2c_20tm_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_98',['MMATile&lt; float, TM, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['move_5for_5fcopy_99',['move_or_copy',['../namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2',1,'mlx::core::move_or_copy(const array &amp;in, array &amp;out)'],['../namespacemlx_1_1core.html#aae1e770954edf1f9a35d19e0de4d857a',1,'mlx::core::move_or_copy(const array &amp;in, array &amp;out, const std::vector&lt; size_t &gt; &amp;strides, array::Flags flags, size_t data_size, size_t offset=0)']]],
+  ['move_5fshared_5fbuffer_100',['move_shared_buffer',['../classmlx_1_1core_1_1array.html#acce00db63e0f3d80f797b02397ade836',1,'mlx::core::array::move_shared_buffer(array other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a38d7ad605f8282e5e49d0c09e0555c78',1,'mlx::core::array::move_shared_buffer(array other)']]],
+  ['moveaxis_101',['moveaxis',['../group__ops.html#ga24067d10a842db2c9d509ea48135a2c3',1,'mlx::core']]],
+  ['mpinplace_102',['MPINPLACE',['../namespacepocketfft_1_1detail.html#af5eedf3cdfc83c0a30807092c39a9ce2',1,'pocketfft::detail']]],
+  ['mtl_5fconst_103',['MTL_CONST',['../defines_8h.html#a767ed9f2604de22b259cee02c4ce1d22',1,'defines.h']]],
+  ['mtl_5fdevice_104',['mtl_device',['../classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653',1,'mlx::core::metal::Device']]],
+  ['mtl_5fresidency_5fset_105',['mtl_residency_set',['../classmlx_1_1core_1_1metal_1_1_residency_set.html#ac4bfe5ef5e2eaebc458a1ed1953d15e9',1,'mlx::core::metal::ResidencySet']]],
+  ['mtlfclist_106',['MTLFCList',['../namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54',1,'mlx::core::metal']]],
+  ['mtx_107',['mtx',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a70410c9e612f871663929f1e8441a976',1,'mlx::core::scheduler::StreamThread']]],
+  ['mulop_108',['MulOp',['../struct_mul_op.html',1,'']]],
+  ['multi_5fiter_109',['multi_iter',['../classpocketfft_1_1detail_1_1multi__iter.html',1,'pocketfft::detail::multi_iter&lt; N &gt;'],['../classpocketfft_1_1detail_1_1multi__iter.html#a9be43bb18840202da6d17988fccc64b9',1,'pocketfft::detail::multi_iter::multi_iter()']]],
+  ['multiply_110',['Multiply',['../structmlx_1_1core_1_1detail_1_1_multiply.html',1,'mlx::core::detail::Multiply'],['../classmlx_1_1core_1_1_multiply.html',1,'mlx::core::Multiply'],['../struct_multiply.html',1,'Multiply'],['../classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c',1,'mlx::core::Multiply::Multiply()']]],
+  ['multiply_111',['multiply',['../group__ops.html#gaf57392e641640b5d06e4c99518391c38',1,'mlx::core']]],
+  ['multivariate_5fnormal_112',['multivariate_normal',['../namespacemlx_1_1core_1_1random.html#a8c37da3c1c0c561cad7499d6d9db81fb',1,'mlx::core::random']]]
 ];
diff --git a/docs/build/html/search/all_e.js b/docs/build/html/search/all_e.js
index 4beb98b3d..908443b77 100644
--- a/docs/build/html/search/all_e.js
+++ b/docs/build/html/search/all_e.js
@@ -1,30 +1,30 @@
 var searchData=
 [
-  ['n_0',['N',['../struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167',1,'MLXFastAttentionParams::N'],['../struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932',1,'MLXConvParams::N'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83',1,'mlx::steel::ImplicitGemmConv2DParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1',1,'mlx::steel::GEMMParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86',1,'mlx::steel::GEMMSpiltKParams::N']]],
+  ['n_0',['N',['../struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932',1,'MLXConvParams::N'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83',1,'mlx::steel::ImplicitGemmConv2DParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1',1,'mlx::steel::GEMMParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86',1,'mlx::steel::GEMMSpiltKParams::N']]],
   ['n_1',['n',['../struct_read_writer.html#a655346c9ebfc33a69da3f1c1d4238dfb',1,'ReadWriter']]],
   ['n_5factive_5ftasks_2',['n_active_tasks',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a3c9fa21442974acba3409d49bb033131',1,'mlx::core::scheduler::Scheduler::n_active_tasks()'],['../namespacemlx_1_1core_1_1scheduler.html#a9bf641981df5fc16b0fb0dbacc0c3afd',1,'mlx::core::scheduler::n_active_tasks()']]],
   ['n_5fchannels_3',['n_channels',['../structmlx_1_1steel_1_1_channel_helper.html#aa476bd0fcb38494c268547fc9820fc0a',1,'mlx::steel::ChannelHelper::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a06c2fb9c93660e8f6916228cd77f9494',1,'mlx::steel::ChannelHelper&lt; 1 &gt;::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#ac66ff37bc2cf78d96667192a6cca73b5',1,'mlx::steel::ChannelHelper&lt; 2 &gt;::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a071c015713b7bab09930661165517eff',1,'mlx::steel::ChannelHelper&lt; 3 &gt;::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#a167b00a84adf93b60e3d7a943d5eb977',1,'mlx::steel::ChannelHelper&lt; 4 &gt;::n_channels']]],
-  ['n_5fkv_5fheads_4',['N_KV_HEADS',['../struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7',1,'MLXScaledDotProductAttentionParams']]],
-  ['n_5fper_5fblock_5',['N_PER_BLOCK',['../struct_kernel_merge_sort.html#a959aaf5bfb70796a525fed318f7ae8ab',1,'KernelMergeSort::N_PER_BLOCK'],['../struct_kernel_multi_block_merge_sort.html#ae5113ca5852d11999ae932439af95a5c',1,'KernelMultiBlockMergeSort::N_PER_BLOCK']]],
-  ['n_5fq_5fheads_6',['N_Q_HEADS',['../struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177',1,'MLXScaledDotProductAttentionParams']]],
-  ['n_5freads_7',['n_reads',['../struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb',1,'QuantizedBlockLoader']]],
-  ['n_5frows_8',['n_rows',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b',1,'mlx::steel::Conv2DWeightBlockLoader::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::n_rows'],['../structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5',1,'mlx::steel::BlockLoader::n_rows']]],
-  ['names_9',['names',['../structmlx_1_1core_1_1_node_namer.html#a57823f9a2cdc60b2f06f857b36019277',1,'mlx::core::NodeNamer']]],
-  ['nan_5fto_5fnum_10',['nan_to_num',['../group__ops.html#gab1467c6a9e675152e768afd6dcfb61de',1,'mlx::core']]],
-  ['nanequal_11',['NaNEqual',['../structmlx_1_1core_1_1detail_1_1_na_n_equal.html',1,'mlx::core::detail::NaNEqual'],['../struct_na_n_equal.html',1,'NaNEqual']]],
-  ['nbytes_12',['nbytes',['../classmlx_1_1core_1_1array.html#a387b67cd3ef5cfc1e749c371766c4a05',1,'mlx::core::array']]],
-  ['nd_5floop_13',['nd_loop',['../namespacemlx_1_1core.html#a9a9254ce9975ec247a2718bc02d6f201',1,'mlx::core']]],
-  ['ndarr_14',['ndarr',['../classpocketfft_1_1detail_1_1ndarr.html',1,'pocketfft::detail::ndarr&lt; T &gt;'],['../classpocketfft_1_1detail_1_1ndarr.html#a8f0037a172d96cb1ad915a5069175fa2',1,'pocketfft::detail::ndarr::ndarr()']]],
-  ['ndim_15',['ndim',['../struct_indices.html#a7dec359e91d0eb2b64e5461b54308313',1,'Indices::ndim'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051',1,'mlx::core::fast::CustomKernelShapeInfo::ndim'],['../classpocketfft_1_1detail_1_1arr__info.html#ac608c8af2a59a28a0012e308be7ee414',1,'pocketfft::detail::arr_info::ndim()'],['../classmlx_1_1core_1_1array.html#a53006e77d13d9d88b525ef577748939f',1,'mlx::core::array::ndim()']]],
-  ['needs_5ftgp_5freduction_16',['needs_tgp_reduction',['../struct_g_e_m_v_kernel.html#ae8113fddf6fb637acfd12efd978b704c',1,'GEMVKernel::needs_tgp_reduction'],['../struct_g_e_m_v_t_kernel.html#a67be7ec69c3791f02e97ccdb00ae0e03',1,'GEMVTKernel::needs_tgp_reduction']]],
-  ['negative_17',['Negative',['../structmlx_1_1core_1_1detail_1_1_negative.html',1,'mlx::core::detail::Negative'],['../classmlx_1_1core_1_1_negative.html',1,'mlx::core::Negative'],['../struct_negative.html',1,'Negative'],['../classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70',1,'mlx::core::Negative::Negative()']]],
-  ['negative_18',['negative',['../group__ops.html#ga95d9a9425533b5ed1707eb00184dffc6',1,'mlx::core']]],
-  ['new_5fqueue_19',['new_queue',['../classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67',1,'mlx::core::metal::Device']]],
-  ['new_5fscoped_5fmemory_5fpool_20',['new_scoped_memory_pool',['../namespacemlx_1_1core_1_1metal.html#a46583a1aba89449fa72e6cb3a7090981',1,'mlx::core::metal']]],
-  ['new_5fstream_21',['new_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a157c8da85fa1bddb8eacf8515a3cc879',1,'mlx::core::scheduler::Scheduler::new_stream()'],['../namespacemlx_1_1core_1_1metal.html#a8b4188f9a090a1da42d62b8a369bf106',1,'mlx::core::metal::new_stream()'],['../namespacemlx_1_1core.html#a6f7c63a9be10337b3b96d527e1db3c2f',1,'mlx::core::new_stream()']]],
-  ['next_22',['next',['../struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9',1,'QuantizedBlockLoader::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce',1,'mlx::steel::Conv2DWeightBlockLoader::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8',1,'mlx::steel::BlockLoader::next()'],['../structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca',1,'looped_elem_to_loc::next(const constant int *shape, const constant size_t *strides)'],['../structlooped__elem__to__loc.html#add610f331ef8d7d2d1917050890f82b2',1,'looped_elem_to_loc::next(int n, const constant int *shape, const constant size_t *strides)'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::next(const constant int *, const constant size_t 
*strides)'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#af2984b35f7d7300d4812e7872b3c8851',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::next(int n, const constant int *, const constant size_t *strides)'],['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0',1,'looped_elem_to_loc&lt; 0, offset_t &gt;::next(const constant int *, const constant size_t *)'],['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a1064cdfdcef779b5628ce5357a6fe4f0',1,'looped_elem_to_loc&lt; 0, offset_t &gt;::next(int, const constant int *, const constant size_t *)'],['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18',1,'mlx::core::random::KeySequence::next()'],['../backend_2metal_2allocator_8h.html#ae704ab07eac590091daa5fc4aec7bddb',1,'next:&#160;allocator.h']]],
-  ['next_5fpower_5fof_5f2_23',['next_power_of_2',['../namespacemlx_1_1core.html#a685c0530e338aabc622325685846ce93',1,'mlx::core']]],
-  ['nextafter_24',['nextafter',['../namespacemetal.html#a9547fd7b09164931986f6db4813bd72d',1,'metal::nextafter()'],['../namespacemetal_1_1fast.html#a4583e8be04fc0bd475b97b0934604f23',1,'metal::fast::nextafter()'],['../namespacemetal_1_1precise.html#ad012ceeb55b77f1533749b351331e026',1,'metal::precise::nextafter()']]],
+  ['n_5fper_5fblock_4',['N_PER_BLOCK',['../struct_kernel_merge_sort.html#a959aaf5bfb70796a525fed318f7ae8ab',1,'KernelMergeSort::N_PER_BLOCK'],['../struct_kernel_multi_block_merge_sort.html#ae5113ca5852d11999ae932439af95a5c',1,'KernelMultiBlockMergeSort::N_PER_BLOCK']]],
+  ['n_5freads_5',['n_reads',['../struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb',1,'QuantizedBlockLoader']]],
+  ['n_5frows_6',['n_rows',['../structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5',1,'mlx::steel::BlockLoader::n_rows'],['../structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc',1,'mlx::steel::BlockLoaderT::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b',1,'mlx::steel::Conv2DWeightBlockLoader::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::n_rows']]],
+  ['names_7',['names',['../structmlx_1_1core_1_1_node_namer.html#a57823f9a2cdc60b2f06f857b36019277',1,'mlx::core::NodeNamer']]],
+  ['nan_5fto_5fnum_8',['nan_to_num',['../group__ops.html#gab1467c6a9e675152e768afd6dcfb61de',1,'mlx::core']]],
+  ['nanequal_9',['NaNEqual',['../structmlx_1_1core_1_1detail_1_1_na_n_equal.html',1,'mlx::core::detail::NaNEqual'],['../struct_na_n_equal.html',1,'NaNEqual']]],
+  ['nbytes_10',['nbytes',['../classmlx_1_1core_1_1array.html#a387b67cd3ef5cfc1e749c371766c4a05',1,'mlx::core::array']]],
+  ['nd_5floop_11',['nd_loop',['../namespacemlx_1_1core.html#a9a9254ce9975ec247a2718bc02d6f201',1,'mlx::core']]],
+  ['ndarr_12',['ndarr',['../classpocketfft_1_1detail_1_1ndarr.html',1,'pocketfft::detail::ndarr&lt; T &gt;'],['../classpocketfft_1_1detail_1_1ndarr.html#a8f0037a172d96cb1ad915a5069175fa2',1,'pocketfft::detail::ndarr::ndarr()']]],
+  ['ndim_13',['ndim',['../struct_indices.html#a7dec359e91d0eb2b64e5461b54308313',1,'Indices::ndim'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051',1,'mlx::core::fast::CustomKernelShapeInfo::ndim'],['../classpocketfft_1_1detail_1_1arr__info.html#ac608c8af2a59a28a0012e308be7ee414',1,'pocketfft::detail::arr_info::ndim()'],['../classmlx_1_1core_1_1array.html#a53006e77d13d9d88b525ef577748939f',1,'mlx::core::array::ndim()']]],
+  ['needs_5ftgp_5freduction_14',['needs_tgp_reduction',['../struct_g_e_m_v_kernel.html#ae8113fddf6fb637acfd12efd978b704c',1,'GEMVKernel::needs_tgp_reduction'],['../struct_g_e_m_v_t_kernel.html#a67be7ec69c3791f02e97ccdb00ae0e03',1,'GEMVTKernel::needs_tgp_reduction']]],
+  ['negative_15',['Negative',['../structmlx_1_1core_1_1detail_1_1_negative.html',1,'mlx::core::detail::Negative'],['../classmlx_1_1core_1_1_negative.html',1,'mlx::core::Negative'],['../struct_negative.html',1,'Negative'],['../classmlx_1_1core_1_1_negative.html#aa3b73395d9fa5b7215dca488bc0d3c70',1,'mlx::core::Negative::Negative()']]],
+  ['negative_16',['negative',['../group__ops.html#ga95d9a9425533b5ed1707eb00184dffc6',1,'mlx::core']]],
+  ['new_5fqueue_17',['new_queue',['../classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67',1,'mlx::core::metal::Device']]],
+  ['new_5fscoped_5fmemory_5fpool_18',['new_scoped_memory_pool',['../namespacemlx_1_1core_1_1metal.html#a46583a1aba89449fa72e6cb3a7090981',1,'mlx::core::metal']]],
+  ['new_5fstream_19',['new_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a157c8da85fa1bddb8eacf8515a3cc879',1,'mlx::core::scheduler::Scheduler::new_stream()'],['../namespacemlx_1_1core_1_1metal.html#a8b4188f9a090a1da42d62b8a369bf106',1,'mlx::core::metal::new_stream()'],['../namespacemlx_1_1core.html#a6f7c63a9be10337b3b96d527e1db3c2f',1,'mlx::core::new_stream()']]],
+  ['next_20',['next',['../struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9',1,'QuantizedBlockLoader::next()'],['../structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8',1,'mlx::steel::BlockLoader::next()'],['../structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697',1,'mlx::steel::BlockLoaderT::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce',1,'mlx::steel::Conv2DWeightBlockLoader::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8',1,'mlx::steel::BlockLoader::next()'],['../struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205',1,'LoopedElemToLoc::next(const constant int *shape, const constant size_t *strides)'],['../struct_looped_elem_to_loc.html#a7da7bd04e79ba86f71c535b5a6ec1a2d',1,'LoopedElemToLoc::next(int n, const constant int *shape, const constant size_t 
*strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::next(const constant int *shape, const constant size_t *strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a8fe55b3a2fa8cd35af568085faed785d',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::next(int n, const constant int *shape, const constant size_t *strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::next(const constant int *, const constant size_t *strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af8f2b29946324756c09951b69e170dd8',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::next(int n, const constant int *, const constant size_t *strides)'],['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18',1,'mlx::core::random::KeySequence::next()'],['../backend_2metal_2allocator_8h.html#ae704ab07eac590091daa5fc4aec7bddb',1,'next:&#160;allocator.h']]],
+  ['next_5fpower_5fof_5f2_21',['next_power_of_2',['../namespacemlx_1_1core.html#a685c0530e338aabc622325685846ce93',1,'mlx::core']]],
+  ['nextafter_22',['nextafter',['../namespacemetal.html#a9547fd7b09164931986f6db4813bd72d',1,'metal::nextafter()'],['../namespacemetal_1_1fast.html#a4583e8be04fc0bd475b97b0934604f23',1,'metal::fast::nextafter()'],['../namespacemetal_1_1precise.html#ad012ceeb55b77f1533749b351331e026',1,'metal::precise::nextafter()']]],
+  ['nk_23',['NK',['../structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e',1,'mlx::steel::AttnParams']]],
+  ['nk_5faligned_24',['NK_aligned',['../structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58',1,'mlx::steel::AttnParams']]],
   ['no_5ffuse_25',['no_fuse',['../namespacemlx_1_1core.html#adb15ff2b1ca5207fd4f6e631e2c3bcb4ada8df7fd43da6073fec4fe5666b03dbb',1,'mlx::core']]],
   ['no_5fsimplify_26',['no_simplify',['../namespacemlx_1_1core.html#adb15ff2b1ca5207fd4f6e631e2c3bcb4a8e5611dfddbae6e68624c59aa3e4e3e2',1,'mlx::core']]],
   ['nodenamer_27',['NodeNamer',['../structmlx_1_1core_1_1_node_namer.html',1,'mlx::core']]],
@@ -37,8 +37,10 @@ var searchData=
   ['notequal_34',['NotEqual',['../structmlx_1_1core_1_1detail_1_1_not_equal.html',1,'mlx::core::detail::NotEqual'],['../classmlx_1_1core_1_1_not_equal.html',1,'mlx::core::NotEqual'],['../struct_not_equal.html',1,'NotEqual'],['../classmlx_1_1core_1_1_not_equal.html#ac568397bd17b5d9f25ad1a0ebadedbb9',1,'mlx::core::NotEqual::NotEqual()']]],
   ['notify_5fnew_5ftask_35',['notify_new_task',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ae8aa34a9be8bc73508dd500000421173',1,'mlx::core::scheduler::Scheduler::notify_new_task()'],['../namespacemlx_1_1core_1_1scheduler.html#a6b7289e33cef665178fe614aac75c1b2',1,'mlx::core::scheduler::notify_new_task()']]],
   ['notify_5ftask_5fcompletion_36',['notify_task_completion',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#abbb2b1c2f8bae2b9c7cc51db65f18a3b',1,'mlx::core::scheduler::Scheduler::notify_task_completion()'],['../namespacemlx_1_1core_1_1scheduler.html#a1d06ffdbab36790b78deb6e34adc737f',1,'mlx::core::scheduler::notify_task_completion()']]],
-  ['num_5fthreads_37',['num_threads',['../namespacepocketfft_1_1detail_1_1threading.html#af5432c2e25aed679a73fe7b29534c833',1,'pocketfft::detail::threading']]],
-  ['number_38',['number',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2dab1bc248a7ff2b2e95569f56de68615df',1,'mlx::core::Dtype::number'],['../namespacemlx_1_1core.html#a069c0aab6b36aef34419534ec4a4310d',1,'mlx::core::number']]],
-  ['number_5fof_5felements_39',['number_of_elements',['../group__ops.html#ga6d5f5f72362488b956cdc4615ef6c636',1,'mlx::core']]],
-  ['numberofelements_40',['NumberOfElements',['../classmlx_1_1core_1_1_number_of_elements.html',1,'mlx::core::NumberOfElements'],['../classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06',1,'mlx::core::NumberOfElements::NumberOfElements()']]]
+  ['nq_37',['NQ',['../structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1',1,'mlx::steel::AttnParams']]],
+  ['nq_5faligned_38',['NQ_aligned',['../structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe',1,'mlx::steel::AttnParams']]],
+  ['num_5fthreads_39',['num_threads',['../namespacepocketfft_1_1detail_1_1threading.html#af5432c2e25aed679a73fe7b29534c833',1,'pocketfft::detail::threading']]],
+  ['number_40',['number',['../structmlx_1_1core_1_1_dtype.html#ac091c39cbd6686ef69aa1e5a2425aa2dab1bc248a7ff2b2e95569f56de68615df',1,'mlx::core::Dtype::number'],['../namespacemlx_1_1core.html#a069c0aab6b36aef34419534ec4a4310d',1,'mlx::core::number']]],
+  ['number_5fof_5felements_41',['number_of_elements',['../group__ops.html#ga6d5f5f72362488b956cdc4615ef6c636',1,'mlx::core']]],
+  ['numberofelements_42',['NumberOfElements',['../classmlx_1_1core_1_1_number_of_elements.html',1,'mlx::core::NumberOfElements'],['../classmlx_1_1core_1_1_number_of_elements.html#ac64d7c40ae29d687f8b7d2fa33e13b06',1,'mlx::core::NumberOfElements::NumberOfElements()']]]
 ];
diff --git a/docs/build/html/search/all_f.js b/docs/build/html/search/all_f.js
index b03a5aeec..68df8ad47 100644
--- a/docs/build/html/search/all_f.js
+++ b/docs/build/html/search/all_f.js
@@ -1,44 +1,44 @@
 var searchData=
 [
   ['o_0',['O',['../struct_m_l_x_conv_params.html#ad55ff586d30072d8154865f9dfe92d97',1,'MLXConvParams']]],
-  ['offset_1',['offset',['../structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0',1,'looped_elem_to_loc::offset'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a7aebc0b0656e3a55d0dbca27a57d600e',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::offset']]],
-  ['offset_5fneg_5fidx_2',['offset_neg_idx',['../kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df',1,'indexing.h']]],
-  ['ofs_3',['ofs',['../classpocketfft_1_1detail_1_1simple__iter.html#ab59481ad9c8f04addb907c3ebb89f8fa',1,'pocketfft::detail::simple_iter::ofs()'],['../classpocketfft_1_1detail_1_1rev__iter.html#a78c3b4ad19edf9d20cab40ad109e9dd1',1,'pocketfft::detail::rev_iter::ofs()']]],
-  ['ones_4',['ones',['../group__ops.html#ga54eeed455321a54c8e72e16552a978f2',1,'mlx::core::ones(const std::vector&lt; int &gt; &amp;shape, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga6cf4b5e8580e4436302c519d05897dab',1,'mlx::core::ones(const std::vector&lt; int &gt; &amp;shape, StreamOrDevice s={})']]],
-  ['ones_5flike_5',['ones_like',['../group__ops.html#ga94f8d3b1906fee99da9cbe39f7be7d42',1,'mlx::core']]],
-  ['oofs_6',['oofs',['../classpocketfft_1_1detail_1_1multi__iter.html#aae63e67caac095d474ddd32daa5ffa34',1,'pocketfft::detail::multi_iter::oofs(size_t i) const'],['../classpocketfft_1_1detail_1_1multi__iter.html#a9236047e7419e5d21379cbf95eb3a78e',1,'pocketfft::detail::multi_iter::oofs(size_t j, size_t i) const']]],
-  ['op_7',['Op',['../classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23d',1,'mlx::core::BitwiseBinary']]],
-  ['op_8',['op',['../structmlx_1_1core_1_1_default_strided_reduce.html#ac871f55a7ddd205574974cb4492a240b',1,'mlx::core::DefaultStridedReduce::op'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#a1928f07db988715cc177999e386f4830',1,'mlx::core::DefaultContiguousReduce::op'],['../common_2binary_8h.html#a70228731d29946574b238d21fb4b360c',1,'op:&#160;binary.h']]],
-  ['operations_9',['Core array operations',['../group__ops.html',1,'']]],
-  ['operator_20bool_10',['operator bool',['../struct___no_mask.html#ad3723c1e70e46beefd283ce6317416cb',1,'_NoMask::operator bool()'],['../struct___no_mask.html#aafbf8a3201e1cc1abf74dd1f1b7272cd',1,'_NoMask::operator bool() const threadgroup'],['../struct___no_mask.html#a73e9612a619885cbc97cbd8f40df71e7',1,'_NoMask::operator bool() const device'],['../struct___no_mask.html#a4bf336d472bc677028250f76b9cdc08c',1,'_NoMask::operator bool() const constant'],['../struct___no_mask.html#ad3723c1e70e46beefd283ce6317416cb',1,'_NoMask::operator bool()'],['../struct___no_mask.html#aafbf8a3201e1cc1abf74dd1f1b7272cd',1,'_NoMask::operator bool() const threadgroup'],['../struct___no_mask.html#a73e9612a619885cbc97cbd8f40df71e7',1,'_NoMask::operator bool() const device'],['../struct___no_mask.html#a4bf336d472bc677028250f76b9cdc08c',1,'_NoMask::operator bool() const constant']]],
-  ['operator_20dtype_11',['operator Dtype',['../structmlx_1_1core_1_1_type_to_dtype.html#aefdd0fd6a5bbf0197a3996ccd4adea13',1,'mlx::core::TypeToDtype']]],
-  ['operator_20float_12',['operator float',['../structmlx_1_1core_1_1___m_l_x___b_float16.html#aaae72e5340ce91325f1925be36ba46cb',1,'mlx::core::_MLX_BFloat16::operator float()'],['../structmlx_1_1core_1_1complex128__t.html#a3e2faf180c0b785646a0e4296f709a5e',1,'mlx::core::complex128_t::operator float()'],['../structmlx_1_1core_1_1complex64__t.html#a90d224dd37308345086bb9cc882ef6fc',1,'mlx::core::complex64_t::operator float()'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a363de5054f3673bddc90293fc3c9bb99',1,'mlx::core::_MLX_Float16::operator float()']]],
-  ['operator_20t_13',['operator T',['../struct___m_l_x___b_float16.html#aa7dfefdf0d15e102d2b8258c9ab01836',1,'_MLX_BFloat16::operator T() const thread'],['../struct___m_l_x___b_float16.html#a2546a8afa77e14ed5b3c5da79a281260',1,'_MLX_BFloat16::operator T() const threadgroup'],['../struct___m_l_x___b_float16.html#a1d523f87740fcb852db6ab57896c245a',1,'_MLX_BFloat16::operator T() const device'],['../struct___m_l_x___b_float16.html#a95acd29283024d7093a0bc58c9468a0a',1,'_MLX_BFloat16::operator T() const constant'],['../structcomplex64__t.html#a70e9b16031eeaff3baa601f400023fcd',1,'complex64_t::operator T() const thread'],['../structcomplex64__t.html#a4f3beea7ab6001189b782a74d1746b67',1,'complex64_t::operator T() const threadgroup'],['../structcomplex64__t.html#a9f4f7eca89ffe6c8d126a4145df6d9f2',1,'complex64_t::operator T() const device'],['../structcomplex64__t.html#ac33e2e5263fec76a4fb4418c6e1d8d14',1,'complex64_t::operator T() const constant']]],
-  ['operator_20val_14',['operator Val',['../structmlx_1_1core_1_1_dtype.html#a3b3bc059be5836476da3cb88a4f5e9fd',1,'mlx::core::Dtype']]],
-  ['operator_20value_5ftype_15',['operator value_type',['../structmlx_1_1steel_1_1integral__constant.html#a0c11203bed44a6a2c387b365134dcd64',1,'mlx::steel::integral_constant']]],
-  ['operator_21_3d_16',['operator!=',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a971aa511ab2e7ae1caae09556643a0bd',1,'mlx::core::array::ArrayIterator::operator!='],['../backend_2metal_2kernels_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55',1,'operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a6baa722c22d66c7510786bb275cb8cc2',1,'operator!=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa8d9f01582a0a9f01a666d110c74db2a',1,'operator!=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa504a474ab6e00ebe2b1b7ed2f7d1ffb',1,'operator!=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abf5f3040227f021a5b84cf2eda248b2f',1,'operator!=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a347c9bbf816bad2e9e5e91aa448f8b65',1,'operator!=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a33ea086b561c652f25833a5e1ded34dd',1,'operator!=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2bbdcece13148826d3fe33af727bb79b',1,'operator!=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aeb1efa47c5f22cc0b35d49ccce73c406',1,'operator!=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa6b99cde403405df1865c989e4ce845a',1,'operator!=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a204d13a881ae8d337f6efbb98673790c',1,'operator!=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3602117b4c61d5cd4fd72fb8e5f68bd6',1,'operator!=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2721c088adfc9d73cde442d6badd2a6c',1,'operator!=(uint64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa4364eda56525cf7576ff00e550175e6',1,'mlx::steel::operator!=()'],['../namespacemlx_1_1core.html#a94d00a1b7f8a4717ab3f26f45e4da655',1,'mlx::core::operator!=(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#ga0ac483d85f23252ca8757e9926d5a3c5',1,'mlx::core::operator!=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga3fecba9f3cb9a19afd8ca492cf509ce0',1,'mlx::core::operator!=(T a, const array &amp;b)'],['../group__ops.html#gaebbf1cfde388c7480159a03c92c9a385',1,'mlx::core::operator!=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a164f109bc19c927b2b3bcc47a5021419',1,'mlx::core::operator!=(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#ad2f9e1c230ec35d5c406dd616e8f4dea',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af5899b4d5644682cb0ac2a488f630d55',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a72ac8edd190601d7a46782582cedecd8',1,'mlx::core::operator!=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8084162ba2dd3f9b89195d2bebc3fbb0',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a514263e63f6825b490203ca586864687',1,'mlx::core::operator!=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a1c482bb3d9f9d4c62dee5865892c1f96',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a0030fe7ad09837c670cdfb7d51279519',1,'mlx::core::operator!=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ade3791bc723b8f10fbab22eadb0f705a',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ad78c664f242cd36247c13868547e3dd4',1,'mlx::core::operator!=(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab0743a1a1dcb92d40f41ca42d36f242c',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#ae7a0f810e546a166c7d05849b5d41f30',1,'mlx::core::operator!=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a676a40637a563f013c725d24fa33fdc8',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9fcb662b1561e4136bac0106cfb63b6c',1,'mlx::core::operator!=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abcca7fd43590c4347e0f5df8f134030c',1,'mlx::core::operator!=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af3ede3688a2e3b3ba8cb2da180ffe151',1,'mlx::core::operator!=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a54f48469fabd1414bef5097bcded0002',1,'mlx::core::operator!=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af8c648e892cbc6973de535aa17dc2cfe',1,'mlx::core::operator!=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abc855e1c0584b64d7d995e33211361ab',1,'mlx::core::operator!=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad3684d660d18a54505c759ab286bd936',1,'mlx::core::operator!=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a8afdda14b14262ab5ce0a00c7745d7e8',1,'mlx::core::operator!=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7ccc479be236f2bf3f7725729c5ba201',1,'mlx::core::operator!=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a26a721b8111fce3a1dec9bf724034cd4',1,'mlx::core::operator!=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad5f8c221a53a89e8095aa39fd1f61867',1,'mlx::core::operator!=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a017b52ecf30b33da4aa8da35ccc43220',1,'mlx::core::operator!=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a43c10ca5fb05ee7d0ee63ba56f8a08a3',1,'mlx::core::operator!=(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a81284b6ac737f91a8d1ffbbbbf938fe5',1,'mlx::core::operator!=(uint64_t lhs, _MLX_Float16 
rhs)']]],
-  ['operator_25_17',['operator%',['../backend_2metal_2kernels_2complex_8h.html#aaf53122a07c8eca858b5a8e38ae280e0',1,'operator%():&#160;complex.h'],['../group__ops.html#gab3bfbf82b1e4de7b00bbcf1a2255fbde',1,'mlx::core::operator%(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga50817666f0b82afcbf4a123486af9908',1,'mlx::core::operator%(T a, const array &amp;b)'],['../group__ops.html#ga46c01daa07433542a477d216e13a8480',1,'mlx::core::operator%(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a8723d145dd49021bfcb8e6c99e1c91a5',1,'mlx::core::operator%(complex64_t a, complex64_t b)']]],
-  ['operator_26_18',['operator&amp;',['../group__ops.html#gaf0d232de4cbfffda1e2c838f8afdf6ff',1,'mlx::core::operator&amp;(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#a9ee95f97bbd69262d99d7bea3bf77631',1,'mlx::core::operator&amp;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0fefc3ae4f1350ebe05ec6098fd6bae3',1,'mlx::core::operator&amp;(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a1e4cb758ccfe5c267baed9aeb0044834',1,'mlx::core::operator&amp;(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab9d0f9910070231695d61de08cadb930',1,'mlx::core::operator&amp;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a889d401f425db79d1868aa3beea4829b',1,'mlx::core::operator&amp;(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a76dcd1fa3c68b386bc1d1d899a68a120',1,'mlx::core::operator&amp;(uint16_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_26_26_19',['operator&amp;&amp;',['../namespacemlx_1_1steel.html#a6353bf11881842e25c46b56f92b7044f',1,'mlx::steel::operator&amp;&amp;()'],['../group__ops.html#gaee1d774bb0843601d7a0a4257d616ae3',1,'mlx::core::operator&amp;&amp;(const array &amp;a, const array &amp;b)']]],
-  ['operator_26_3d_20',['operator&amp;=',['../namespacemlx_1_1core.html#a60c263ef46e552c3954688869734b513',1,'mlx::core::operator&amp;=(_MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af9670fc8088339669c54c68b3a320e25',1,'mlx::core::operator&amp;=(_MLX_BFloat16 &amp;lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#ad1f96f0a02024f347b4c4431629407fc',1,'mlx::core::operator&amp;=(_MLX_Float16 &amp;lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae0540f16c4e7bd55d0e86a88495e4967',1,'mlx::core::operator&amp;=(_MLX_Float16 &amp;lhs, uint16_t rhs)']]],
-  ['operator_28_29_21',['operator()',['../structpocketfft_1_1detail_1_1_exec_c2_c.html#a4fd637f1a6d335826789af28ac089ecb',1,'pocketfft::detail::ExecC2C::operator()()'],['../structpocketfft_1_1detail_1_1_exec_hartley.html#a67c98b38d12440781053552b9a33bba1',1,'pocketfft::detail::ExecHartley::operator()()'],['../structpocketfft_1_1detail_1_1_exec_dcst.html#a67f4f56e3574c491695f8cb8a1e983d8',1,'pocketfft::detail::ExecDcst::operator()()'],['../structpocketfft_1_1detail_1_1_exec_r2_r.html#acdba1650962714e6afff51e9ca456970',1,'pocketfft::detail::ExecR2R::operator()()'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a0d657bc9a381dca1b5860b9a1b5a5702',1,'mlx::core::detail::Abs::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a564232db7d32811e2ae126c86de104f0',1,'mlx::core::detail::Abs::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a5fac7e6c8277d8706535a52820503c9d',1,'mlx::core::detail::Abs::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#af2c3723e648bd5ed2fe558cc20b7f5eb',1,'mlx::core::detail::Abs::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a57312cd8530dd0ede3b8037f9c401883',1,'mlx::core::detail::Abs::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#ab3b5e3853ed56bfbfa577d965c21112e',1,'mlx::core::detail::Abs::operator()(bool 
x)'],['../structmlx_1_1core_1_1detail_1_1_arc_cos.html#a04b4c9d1fc0160973aa28b1f809b9d51',1,'mlx::core::detail::ArcCos::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_cosh.html#a767d354bec863942822ee0b9b6742a88',1,'mlx::core::detail::ArcCosh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_sin.html#ac69091929815e5317308b4088f5c2f46',1,'mlx::core::detail::ArcSin::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_sinh.html#ac7bf9bac66fef917f75494b2345e6aaf',1,'mlx::core::detail::ArcSinh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tan.html#aee87bf10c278a70ca788085d1b499afe',1,'mlx::core::detail::ArcTan::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tan2.html#a9040b7afcdb4969924aa782fa67f03ac',1,'mlx::core::detail::ArcTan2::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tanh.html#a601e8c52bb938eb3a616756a35419e8b',1,'mlx::core::detail::ArcTanh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a672f65e47d65e4e8d88be252bce0164b',1,'mlx::core::detail::Ceil::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a72a2cab2728fb5e1cc6329a539e5d573',1,'mlx::core::detail::Ceil::operator()(int8_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#aa34590f6a41331be92988558a90dc6fa',1,'mlx::core::detail::Ceil::operator()(int16_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af14120f3dd98f6198ea257d75be223f7',1,'mlx::core::detail::Ceil::operator()(int32_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af263ce7743cf7319387baba616c375b5',1,'mlx::core::detail::Ceil::operator()(int64_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a48f00affcd5c2ea1f81d821e019fec29',1,'mlx::core::detail::Ceil::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#ad4d24a44e8a328948393701dacb0ceac',1,'mlx::core::detail::Ceil::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a70da19b5c9c69f04b9f196bdf266f93c',1,'mlx::core::detail::Ceil::operator()(uint32_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af0e7e806b73c664ada837476f9d4d43b',1,'mlx::core::detail::Ceil::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#acc1bfc84a9b91f6e9764234cbe3b9687',1,'mlx::core::detail::Ceil::operator()(bool x)'],['../structmlx_1_1core_1_1detail_1_1_conjugate.html#a7e662d05c6998bd6ced8ad9c187324a5',1,'mlx::core::detail::Conjugate::operator()()'],['../structmlx_1_1core_1_1detail_1_1_cos.html#ad4caef573f9d9071f8945a8efed231ad',1,'mlx::core::detail::Cos::operator()()'],['../structmlx_1_1core_1_1detail_1_1_cosh.html#a63591f49776d9aadc02200036ae38317',1,'mlx::core::detail::Cosh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_erf.html#a168f8ccc6c8053b05dd1a48904ca8fd4',1,'mlx::core::detail::Erf::operator()()'],['../structmlx_1_1core_1_1detail_1_1_erf_inv.html#acc93c0511141404208b35f302f8c1fcb',1,'mlx::core::detail::ErfInv::operator()()'],['../structmlx_1_1core_1_1detail_1_1_exp.html#a0846300cee28315e5b42f74acafbd1a1',1,'mlx::core::detail::Exp::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_exp.html#af247c0d19d59f3310fd0a081eb92cf8b',1,'mlx::core::detail::Exp::operator()(complex64_t x)'],['../structmlx_1_1core_1_1detail_1_1_expm1.html#abf7e61b8387521e9d44334ce88d833a0',1,'mlx::core::detail::Expm1::operator()()'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a16c13cfe736098bffc81d655e172294a',1,'mlx::core::detail::Floor::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a9b6c4c34b6594b8c413abe31f34a73df',1,'mlx::core::detail::Floor::operator()(int8_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#aca4c71204b3ceeca6329f7ea2b041f4c',1,'mlx::core::detail::Floor::operator()(int16_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a3c3ab9e00d1fbd124802517e8c35fe02',1,'mlx::core::detail::Floor::operator()(int32_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a4b5954ffc59c741dd7b86bafda69d5cc',1,'mlx::core::detail::Floor::operator()(int64_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a2e33b10bd5b04551054a87c601440bc7',1,'mlx::core::detail::Floor::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a2500f971100919a694f78669a5e4f438',1,'mlx::core::detail::Floor::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a23df818301d68389e6e12f5a9ec1fbd7',1,'mlx::core::detail::Floor::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#ac988b4f265cf46c68609c9c8787c15fb',1,'mlx::core::detail::Floor::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a7f936e3fd53772bc189d845c73b53202',1,'mlx::core::detail::Floor::operator()(bool x)'],['../structmlx_1_1core_1_1detail_1_1_imag.html#a5bd82e2185f3779e398c179d42a3e782',1,'mlx::core::detail::Imag::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log.html#a0012a4e1744dbe9a28c3b5652be6e1c6',1,'mlx::core::detail::Log::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log2.html#a467bd4c995674721ff5fff6df33aead8',1,'mlx::core::detail::Log2::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log10.html#a2633c5b772bbc9f8b66cffd4a3e01a3f',1,'mlx::core::detail::Log10::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log1p.html#a3220de8c6090c44aa2070b1fbb2dc340',1,'mlx::core::detail::Log1p::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_not.html#a79799668ea5c364b0b4e2bc330e76253',1,'mlx::core::detail::LogicalNot::operator()()'],['../structmlx_1_1core_1_1detail_1_1_negative.html#afc4595c70ef7196df374cf4b2cc5e526',1,'mlx::core::detail::Negative::operator()()'],['../structmlx_1_1core_1_1detail_1_1_real.html#ae84a939fdb5916257a7731cda66d4d61',1,'mlx::core::detail::Real::operator()()'],['../structmlx_1_1core_1_1detail_1_1_round.html#a653f29c059bbfa6192378732a8a23351',1,'mlx::core::detail::Round::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_round.html#a82a984f13568051009e257fe85227da6',1,'mlx::core::detail::Round::operator()(complex64_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_sigmoid.html#a64b72561bfaf758632167f00648f4c89',1,'mlx::core::detail::Sigmoid::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a64ed5013cee7ff18c7fe70bc04737e7b',1,'mlx::core::detail::Sign::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a7106ed1f2f98a365fcb3e6ee39084748',1,'mlx::core::detail::Sign::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a7163e8c068dcc460600ed04014dc9945',1,'mlx::core::detail::Sign::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#ae8f56c7134721c846240830169424c22',1,'mlx::core::detail::Sign::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a10ae519e9a74a327fc72c410e9ab2936',1,'mlx::core::detail::Sign::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a91be4e273f6c7ea5d44cfab380b77603',1,'mlx::core::detail::Sign::operator()(complex64_t x)'],['../structmlx_1_1core_1_1detail_1_1_sin.html#ae95671816529cc2188389af37a2f1a13',1,'mlx::core::detail::Sin::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sinh.html#a9663ddf0fa4c0003576b48f3d5385f00',1,'mlx::core::detail::Sinh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_square.html#a54e9e3c0d0896e142289e8282eab1099',1,'mlx::core::detail::Square::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sqrt.html#aa5a4830b3ef7efab20ea88a110667efd',1,'mlx::core::detail::Sqrt::operator()()'],['../structmlx_1_1core_1_1detail_1_1_rsqrt.html#a9af247be16bab83243038aac54446b79',1,'mlx::core::detail::Rsqrt::operator()()'],['../structmlx_1_1core_1_1detail_1_1_tan.html#aba397cd7ac05bbe06dfa9e3a64bdb05f',1,'mlx::core::detail::Tan::operator()()'],['../structmlx_1_1core_1_1detail_1_1_tanh.html#a1749ba1edfd53095ed7d45c0e53bab61',1,'mlx::core::detail::Tanh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_add.html#a2d6011c35768b5fcd2bb75747b944353',1,'mlx::core::detail::Add::operator()()'],['../structmlx_1_1core_1_1detail_1_1_divide.html#a5e0d22e20
84c4ca81bec0d457a46c662',1,'mlx::core::detail::Divide::operator()()'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a3bdaf1095ad883ecc0fecc455f02cbf3',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a52c3a2ba86fccb24d37d218ae8328954',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a0b0dd6ef5b08585fdf8355770da8d747',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a68fe542084fb94d9a5abd740fe07832b',1,'mlx::core::detail::Remainder::operator()(complex64_t numerator, complex64_t denominator)'],['../structmlx_1_1core_1_1detail_1_1_equal.html#a2994cf1884e7126e76d0a20b215fe3ab',1,'mlx::core::detail::Equal::operator()()'],['../structmlx_1_1core_1_1detail_1_1_na_n_equal.html#a073b20b0d8d41ec8364b7c477421b9bf',1,'mlx::core::detail::NaNEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_greater.html#aa3844c2bae3c7a981739f642aa0dd094',1,'mlx::core::detail::Greater::operator()()'],['../structmlx_1_1core_1_1detail_1_1_greater_equal.html#a3b005f85522ad0e4b57044eed930ac30',1,'mlx::core::detail::GreaterEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_less.html#a0b4032dff1ad2b387745cb000aabdcbb',1,'mlx::core::detail::Less::operator()()'],['../structmlx_1_1core_1_1detail_1_1_less_equal.html#a31e70f8830a07557697541301555a7a7',1,'mlx::core::detail::LessEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_maximum.html#a3eb37abec8426ebc42b8c685075c523a',1,'mlx::core::detail::Maximum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_maximum.html#af99345c7c8bc95ccab1b22c0792ac6fd',1,'mlx::core::detail::Maximum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_minimum.html#afca0861556416a8547dd8574528feb69',1,'mlx::core::detail::Minimum::operator()(T x, T 
y)'],['../structmlx_1_1core_1_1detail_1_1_minimum.html#a64b2eecfbc56aaef7deb939423bac3f8',1,'mlx::core::detail::Minimum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_log_add_exp.html#ad1663fd809acaa4038f90666436599e5',1,'mlx::core::detail::LogAddExp::operator()()'],['../structmlx_1_1core_1_1detail_1_1_multiply.html#a898b090966b047723513224b8d3b22f1',1,'mlx::core::detail::Multiply::operator()()'],['../structmlx_1_1core_1_1detail_1_1_not_equal.html#a23d662b5fd968dc17d3bee2595b5f99d',1,'mlx::core::detail::NotEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_power.html#a2c047e1b488e6525447a224975a75db8',1,'mlx::core::detail::Power::operator()(T base, T exp)'],['../structmlx_1_1core_1_1detail_1_1_power.html#a9967db24b8f67d54b6aa3810e274f28c',1,'mlx::core::detail::Power::operator()(T base, T exp)'],['../structmlx_1_1core_1_1detail_1_1_subtract.html#a72ef05830615a2d5d9662926ed82672a',1,'mlx::core::detail::Subtract::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_and.html#a046536c1f2f9367983f052a213d7b7d8',1,'mlx::core::detail::LogicalAnd::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_or.html#afb134dbab79307d4ba597843c61d0b1a',1,'mlx::core::detail::LogicalOr::operator()()'],['../structmlx_1_1core_1_1detail_1_1_select.html#a930f9da2e6b3453e04f21382435a2cfb',1,'mlx::core::detail::Select::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_and.html#ae0bed77f95fe2b2f0b594addddd04700',1,'mlx::core::detail::BitwiseAnd::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_or.html#a5ab05734c5000b454975de6647a08d20',1,'mlx::core::detail::BitwiseOr::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_xor.html#a0989e3bcd064ae06c33f660696a869a0',1,'mlx::core::detail::BitwiseXor::operator()()'],['../structmlx_1_1core_1_1detail_1_1_left_shift.html#a9385f580830a6ad163dd9bb8c4905e7a',1,'mlx::core::detail::LeftShift::operator()()'],['../structmlx_1_1core_1_1detail_1_1_right_shift.html#a154528ba50e89a4c532a181f135b1620'
,1,'mlx::core::detail::RightShift::operator()()'],['../structmlx_1_1core_1_1_default_strided_reduce.html#a024682ab93b84e544a07e3a9c3c51fba',1,'mlx::core::DefaultStridedReduce::operator()()'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#a08144c7a3cdf10af5e47f4575da3694f',1,'mlx::core::DefaultContiguousReduce::operator()()'],['../struct_add.html#ac5c66b63d63a222d3ae0ab8cc7c90eb5',1,'Add::operator()()'],['../struct_floor_divide.html#a2b328e4d768e718fa439f955c524666a',1,'FloorDivide::operator()(T x, T y)'],['../struct_floor_divide.html#afc16a2b2a745225e0bc95640f3fc0219',1,'FloorDivide::operator()(float x, float y)'],['../struct_floor_divide.html#ae91719a15f7e643d552129f476089c6a',1,'FloorDivide::operator()(half x, half y)'],['../struct_floor_divide.html#a4aa9f858626583e02bd79f747229bbca',1,'FloorDivide::operator()(bfloat16_t x, bfloat16_t y)'],['../struct_divide.html#a0a16b9194abc2ab7c61129f81a9bbb3d',1,'Divide::operator()()'],['../struct_remainder.html#ab7875512ff4341c580c6dc372e64fc58',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#a18150b5f4425e30b95ffabc6bb25cede',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#ab3b75f54b56fd357c9755daadb2cafc2',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#ae918ce0e246937d4fe04e2ea36e4b2c1',1,'Remainder::operator()(complex64_t x, complex64_t y)'],['../struct_equal.html#aa498087080900d4428ba428a6496a769',1,'Equal::operator()()'],['../struct_na_n_equal.html#a00220898e02db656d21dde9e9354a8dc',1,'NaNEqual::operator()(T x, T y)'],['../struct_na_n_equal.html#a6185e4554dce5b4659d21673c576be51',1,'NaNEqual::operator()(complex64_t x, complex64_t 
y)'],['../struct_greater.html#a98d7d8ee360cd0f469c6eb9a017560f5',1,'Greater::operator()()'],['../struct_greater_equal.html#ae69a3bccc567a46506cf0d296294ce80',1,'GreaterEqual::operator()()'],['../struct_less.html#a5ee0b31b2d9123dc4504f2979a5854d3',1,'Less::operator()()'],['../struct_less_equal.html#ae9f9a1b2eae548977139704f0044acfe',1,'LessEqual::operator()()'],['../struct_log_add_exp.html#ab32417f18e8ff68c15f78aceeb624edf',1,'LogAddExp::operator()()'],['../struct_maximum.html#a3ea0f42bc4cd80b68a98f189f9fa859c',1,'Maximum::operator()(T x, T y)'],['../struct_maximum.html#a0bc8fadc87f2c49fc440d625bfc97ca6',1,'Maximum::operator()(T x, T y)'],['../struct_maximum.html#a907e8793900be5927625377dab199644',1,'Maximum::operator()(complex64_t x, complex64_t y)'],['../struct_minimum.html#aa6113dfac3986c0f571fa53f65c5330e',1,'Minimum::operator()(T x, T y)'],['../struct_minimum.html#a0c939921de87ab9c6959238aac81a059',1,'Minimum::operator()(T x, T y)'],['../struct_minimum.html#a800fba087280f79c2f7e9aff75bed093',1,'Minimum::operator()(complex64_t x, complex64_t y)'],['../struct_multiply.html#a1327fc5a0713931afe997b0d4d2988e0',1,'Multiply::operator()()'],['../struct_not_equal.html#af008d73a5d9cde0b8309b7e8ee7438b2',1,'NotEqual::operator()(T x, T y)'],['../struct_not_equal.html#a14de494cea4e4869351202cad1149f17',1,'NotEqual::operator()(complex64_t x, complex64_t y)'],['../struct_power.html#a2b6df2a9e48155ff9734caca8504a79f',1,'Power::operator()(T base, T exp)'],['../struct_power.html#a36829163d42973034a1f8a7ecc57a1de',1,'Power::operator()(T base, T exp)'],['../struct_power.html#a27cdfb313c4e82b63bdcdaee923cbbef',1,'Power::operator()(complex64_t x, complex64_t 
y)'],['../struct_subtract.html#ae0856cd8d449074ca287baa7e460f68a',1,'Subtract::operator()()'],['../struct_logical_and.html#a8bc6bdabc0ea0678a46e2cf6217cb3a6',1,'LogicalAnd::operator()()'],['../struct_logical_or.html#ade6a931324a604a3119d2220d6f5460d',1,'LogicalOr::operator()()'],['../struct_bitwise_and.html#afb48af090b01dd0200963bc12d842e36',1,'BitwiseAnd::operator()()'],['../struct_bitwise_or.html#a41f847463daafa99ee56f4035578390f',1,'BitwiseOr::operator()()'],['../struct_bitwise_xor.html#a3a3e8a56caab739d40262d9349c9c485',1,'BitwiseXor::operator()()'],['../struct_left_shift.html#aa729747784c38bfdbba34794fcf5175b',1,'LeftShift::operator()()'],['../struct_right_shift.html#a2cc59b400c68342b0e43050431323c17',1,'RightShift::operator()()'],['../struct_arc_tan2.html#ac9b7729753e13be293ab700231d061ac',1,'ArcTan2::operator()()'],['../struct_div_mod.html#a8b5758f2ea18d4c903b462331b25abfe',1,'DivMod::operator()()'],['../struct_cum_prod_3_01bool_01_4.html#ad634be0b139d10ce6d21332eef0d936b',1,'CumProd&lt; bool &gt;::operator()()'],['../struct_cum_max.html#a781b9b955c5412466da6af6c70d73c06',1,'CumMax::operator()()'],['../struct_cum_min.html#ae0b8c3761e04fa538d304ca842281a66',1,'CumMin::operator()()'],['../struct_less_than.html#a2798eb377b411c93a4ed30cf35caade2',1,'LessThan::operator()()'],['../struct_select.html#adb51692aae3038de07dd745891bf9848',1,'Select::operator()()'],['../struct_abs.html#a9e7481dfcc162509769852026ff4a344',1,'Abs::operator()(T x)'],['../struct_abs.html#a0ca113fd036151c443df3f83cc667f28',1,'Abs::operator()(uint8_t x)'],['../struct_abs.html#adaeab32a7e377dc990077ab15f3dc4c2',1,'Abs::operator()(uint16_t x)'],['../struct_abs.html#a99d2a2f37a6cddd3168b0224f2a9b963',1,'Abs::operator()(uint32_t x)'],['../struct_abs.html#ac9cbc02422d930479303f240a7ea6c71',1,'Abs::operator()(uint64_t x)'],['../struct_abs.html#ac30835b27784d451bd2e4524c8eb9e11',1,'Abs::operator()(bool x)'],['../struct_abs.html#ab82917d6b30a2c579e7eb879d305c5fc',1,'Abs::operator()(complex64_t 
x)'],['../struct_arc_cos.html#a5553cecf58511e24e76ac97f2d90b9ac',1,'ArcCos::operator()()'],['../struct_arc_cosh.html#a5c9e7712c14c97298b23ec48e19abc58',1,'ArcCosh::operator()()'],['../struct_arc_sin.html#a0343872f2da93bae2bb0baadf49da022',1,'ArcSin::operator()()'],['../struct_arc_sinh.html#a3066fb7dc7c3180100fb55ff94af6a7a',1,'ArcSinh::operator()()'],['../struct_arc_tan.html#af3a0aec6acec8ae8f5e4c4d5cf8c91ba',1,'ArcTan::operator()()'],['../struct_arc_tanh.html#a37dc3e01ec2830de7e82ed6c6363ac88',1,'ArcTanh::operator()()'],['../struct_ceil.html#a5e2a4ef1b012f5d352064489156e5e44',1,'Ceil::operator()(T x)'],['../struct_ceil.html#a455cd8083ba859993077f2e078ae165b',1,'Ceil::operator()(int8_t x)'],['../struct_ceil.html#a2acb61bc658c7a216795e7f76ebcf98a',1,'Ceil::operator()(int16_t x)'],['../struct_ceil.html#aef8c37f7a8ee3fc80700d605a09891fb',1,'Ceil::operator()(int32_t x)'],['../struct_ceil.html#a93d0110511ad5dd200e12d37a3d7d6e3',1,'Ceil::operator()(int64_t x)'],['../struct_ceil.html#aa335b745fa26e0f443cdb36298105484',1,'Ceil::operator()(uint8_t x)'],['../struct_ceil.html#ade17e13b7f30f5c590fae1581a2013ac',1,'Ceil::operator()(uint16_t x)'],['../struct_ceil.html#a411c75cc35cdc088402e176a1defd22d',1,'Ceil::operator()(uint32_t x)'],['../struct_ceil.html#a9ac660ca29eef7a7429fceb7b917a68a',1,'Ceil::operator()(uint64_t x)'],['../struct_ceil.html#a40de367e62f06ebd7e1330afa93a9ad9',1,'Ceil::operator()(bool x)'],['../struct_cos.html#ae222f8710f6b8254c471ebd475aa5bda',1,'Cos::operator()(T x)'],['../struct_cos.html#a5f26feb1dcc4bec5f59a9ff511c5b163',1,'Cos::operator()(complex64_t x)'],['../struct_cosh.html#a5847ebeebb236fdc926798ddc16475ba',1,'Cosh::operator()(T x)'],['../struct_cosh.html#aefdd91298dac16d528d29ee47e2f7252',1,'Cosh::operator()(complex64_t 
x)'],['../struct_conjugate.html#acb0a2694285f1f57c7654b371ce8cbd8',1,'Conjugate::operator()()'],['../struct_erf.html#a80719402ad7f7d418859a6677d7b604d',1,'Erf::operator()()'],['../struct_erf_inv.html#afbf3668d1a512e889f093a0bc7673309',1,'ErfInv::operator()()'],['../struct_exp.html#a5ef395868e055348c0802fd5fe45669c',1,'Exp::operator()(T x)'],['../struct_exp.html#a2b341ac400c4d145397950eb60734336',1,'Exp::operator()(complex64_t x)'],['../struct_expm1.html#a4b834d42cf0b84daf03fec62c222091a',1,'Expm1::operator()()'],['../struct_floor.html#ace3551f28429081e9f3a3dab0c84212b',1,'Floor::operator()(T x)'],['../struct_floor.html#a10d7fd05b4c224c9f135451246d13014',1,'Floor::operator()(int8_t x)'],['../struct_floor.html#a2865a04a492e3590302f4bd3215a10d7',1,'Floor::operator()(int16_t x)'],['../struct_floor.html#a41012343ff0463ec44b4d06196f41182',1,'Floor::operator()(int32_t x)'],['../struct_floor.html#aae3181d15856796aa0628cf30c92aa2e',1,'Floor::operator()(int64_t x)'],['../struct_floor.html#ac6cf38d82c8e270911afdca4c69ad51b',1,'Floor::operator()(uint8_t x)'],['../struct_floor.html#a78969b9e2b53ae248e72a67259eea5d8',1,'Floor::operator()(uint16_t x)'],['../struct_floor.html#a959009320ed622ed45b39becab1d5b98',1,'Floor::operator()(uint32_t x)'],['../struct_floor.html#a7d04b83c3345cd867315cae2d7ff68ab',1,'Floor::operator()(uint64_t x)'],['../struct_floor.html#abea845fe5e8e6b93bd4bca8717337e0b',1,'Floor::operator()(bool 
x)'],['../struct_imag.html#a3b29e9f8a46c194d683f6a9938314400',1,'Imag::operator()()'],['../struct_log.html#a32a383cb6be06e616a75f23bf49089c3',1,'Log::operator()()'],['../struct_log2.html#ac1e067ecdcbdbffb6106e789c2b98b64',1,'Log2::operator()()'],['../struct_log10.html#ac596a74c1642a00f3eced07ee3334122',1,'Log10::operator()()'],['../struct_log1p.html#a4464c6e7bdbe55ffd7d961c695cd13ce',1,'Log1p::operator()()'],['../struct_logical_not.html#a8a620bac957ab8c09ac85adfddd96708',1,'LogicalNot::operator()()'],['../struct_negative.html#af6879b374314a559faa321e8cce3d710',1,'Negative::operator()()'],['../struct_real.html#a85b9c5b9e65297994fa26ff68e19e809',1,'Real::operator()()'],['../struct_round.html#aa06a0195867e2ceb679c403b6909a1c4',1,'Round::operator()(T x)'],['../struct_round.html#ad3a08f2276ff1033900bc0a7da812655',1,'Round::operator()(complex64_t x)'],['../struct_sigmoid.html#a75a24cd75cb4d4c9a072811b2d70ad55',1,'Sigmoid::operator()()'],['../struct_sign.html#aa3304c6b43bcad53061614b741d8403c',1,'Sign::operator()(T x)'],['../struct_sign.html#ac48992b675b8b28be1e27e1f2ec5d2f7',1,'Sign::operator()(uint32_t x)'],['../struct_sign.html#ae07a4249e1b61419a3b9ca6c337b7bb5',1,'Sign::operator()(complex64_t x)'],['../struct_sin.html#a7caf98c777521fa5d5c6ddaaa3b779fd',1,'Sin::operator()(T x)'],['../struct_sin.html#aa510cf4595b6d49065ab6b602d8fcb14',1,'Sin::operator()(complex64_t x)'],['../struct_sinh.html#a02cf32bcf560657b9ee34fb1affed8e2',1,'Sinh::operator()(T x)'],['../struct_sinh.html#a1f8ba1858d352ee68861cd6ea861af43',1,'Sinh::operator()(complex64_t x)'],['../struct_square.html#afde739fc544e45dd30964c02dca94310',1,'Square::operator()()'],['../struct_sqrt.html#ab9b16d2b9b03a1c54190f4479a56a4ad',1,'Sqrt::operator()()'],['../struct_rsqrt.html#ae16699fd829e40416436247a39233fda',1,'Rsqrt::operator()()'],['../struct_tan.html#a1e6fb8c691621c69cb9bd393de4f6e78',1,'Tan::operator()(T x)'],['../struct_tan.html#a2ef120c9f92b0d2e9cec8389eda05724',1,'Tan::operator()(complex64_t 
x)'],['../struct_tanh.html#adce11a7ad33226c6ecff34f46f5c45d7',1,'Tanh::operator()(T x)'],['../struct_tanh.html#aa8423b43c725bb4b88965a11e8cf20f6',1,'Tanh::operator()(complex64_t x)']]],
-  ['operator_2a_22',['operator*',['../structpocketfft_1_1detail_1_1cmplx.html#a26bf3d709a58f06228e502af6db8e5ac',1,'pocketfft::detail::cmplx::operator*(const T2 &amp;other) const -&gt; cmplx&lt; decltype(r *other)&gt;'],['../structpocketfft_1_1detail_1_1cmplx.html#ad9c591ef8ae976293f207937d273e9a1',1,'pocketfft::detail::cmplx::operator*(const cmplx&lt; T2 &gt; &amp;other) const -&gt; cmplx&lt; decltype(r+other.r)&gt;'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a153756072fda6d3e53bcca11b46a1238',1,'mlx::core::array::ArrayIterator::operator*()'],['../backend_2metal_2kernels_2bf16_8h.html#a8f06316063fc91747533105f256b55b5',1,'operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7b3bce3f6f17089d87e13e91f580a581',1,'operator*(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a54ae7216b82c5cea362f6b83e1df3a9b',1,'operator*(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a852689073c17596de4fb545bc046b380',1,'operator*(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a168300bbd04d8e97c5e4218cb14ae378',1,'operator*(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a6278bd2e0e2805090b33ef666bf7f6bb',1,'operator*(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aecf703522d9ce32dfeefe1e6e903db06',1,'operator*(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7cd44d27fa9a4f13df39894c34fdb348',1,'operator*(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aee64dc1890abb6d1035361cb8c751f96',1,'operator*(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad1a559ab88dbbb4fd2c7509d2c94e55b',1,'operator*(_MLX_BFloat16 lhs, int64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a495ae2d9be5d97c4c6448fc4e50a03e1',1,'operator*(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a87ab4b7a502430da664ccb8abd383058',1,'operator*(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5f997839cf49c24ab594a0dff486a7bc',1,'operator*(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385',1,'operator*(complex64_t a, complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#aa0c2d29950926ae579adf6337fbea64b',1,'mlx::steel::operator*()'],['../group__ops.html#ga26c33f5cdb6fc10d272acd6e208034e0',1,'mlx::core::operator*(const array &amp;a, const array &amp;b)'],['../group__ops.html#gac22a67f7de797b1ae59029843cbdcab6',1,'mlx::core::operator*(T a, const array &amp;b)'],['../group__ops.html#ga6f2369ed5fae8ff9b1528670a004dde2',1,'mlx::core::operator*(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0cc824d6318f97f7058918ab64ddfc25',1,'mlx::core::operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a81e1c727c3fc48910b030cb65a9e7afa',1,'mlx::core::operator*(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a861d948220d8f48d46c68d2ddb16a096',1,'mlx::core::operator*(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13d16561812679b36e68185dc4b2d04d',1,'mlx::core::operator*(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a5287610200ff573730c9c92413f48881',1,'mlx::core::operator*(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a377ccc6b4ef36767abca102dca56dc10',1,'mlx::core::operator*(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a5d696b63635ce6967526d6a410f7f6b1',1,'mlx::core::operator*(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abe90e9527bfa3e1c813d41df4a2372e7',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int32_t 
rhs)'],['../namespacemlx_1_1core.html#a5f14963c77f96bcb5a3bef5661a86ba4',1,'mlx::core::operator*(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#acfb06fe9f5fee01dbb5a2b23bccfd0d3',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#afc9a87f1fccbac05242b91bfbb35c24d',1,'mlx::core::operator*(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0b9678af9b487900cacf6639a4693de0',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad5950619081389e6ed7512f38358d33d',1,'mlx::core::operator*(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a65d25d082374761c05b056e1046d1d4e',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a759191fb984e7737f0ef529c2053ad73',1,'mlx::core::operator*(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3a52675c3d4552b319dd9707844abdec',1,'mlx::core::operator*(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45d67f5d80fba4d42e34c682a8d22beb',1,'mlx::core::operator*(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad25880c67bbcbfafbe54dc16418bf736',1,'mlx::core::operator*(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a63c836e1141e07ae72cee770bad01200',1,'mlx::core::operator*(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a265a37b8ee4a97390213e9ec49693e66',1,'mlx::core::operator*(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab5a457da04dcb157a0b5172c4b2244b6',1,'mlx::core::operator*(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#aa56a8bda08be9ef3711496e216a75c95',1,'mlx::core::operator*(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af89612098dd355b1eefb841c753b36ab',1,'mlx::core::operator*(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4552687a0637f710b5d55bb6378fcabe',1,'mlx::core::operator*(int32_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af69db7def588d7da430434a69456e29c',1,'mlx::core::operator*(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a00af6e5095888f00791ee0ab6d993ad6',1,'mlx::core::operator*(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab48feddc1aa304383e5493923506ad7a',1,'mlx::core::operator*(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a0367b582e85162b4180e086f725e49e9',1,'mlx::core::operator*(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45f0479526fbccdb00bc73ea7f3b7625',1,'mlx::core::operator*(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a394797646010ba9ef2a1f9b9a4b8ddd9',1,'mlx::core::operator*(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acaaa86b59c7ceb2e092ac07f2a75225c',1,'mlx::core::operator*(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a067d47823a322b88043cce7ce4a3ec78',1,'mlx::core::operator*(bfloat16_t lhs, float16_t rhs)']]],
-  ['operator_2a_3d_23',['operator*=',['../structpocketfft_1_1detail_1_1cmplx.html#a683fd490182c9189fa2c05b1823edd93',1,'pocketfft::detail::cmplx::operator*=(T2 other)'],['../structpocketfft_1_1detail_1_1cmplx.html#a06f2c26c6fc4722e61b44da4c242ed87',1,'pocketfft::detail::cmplx::operator*=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419',1,'operator*=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ade65ebca11e38d56408c512df89b99f4',1,'operator*=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af4348ce3425dd99d069e8fdf06e25a3c',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2c3c5f793b3d957d7295d7f1faabebee',1,'operator*=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac66657077d55e94197b52b63acb50b7d',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a383165ea838cc3feeee4d9cf54aa77cc',1,'operator*=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab706af260b61f735b28464877d02137c',1,'operator*=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a979374b1dd4e0eaf602326fa901336d1',1,'operator*=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac815eec2c1b15a47b1c6ea6790e77d24',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8110fae7bcc34a0de5927546b24aa935',1,'operator*=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae4acef3e7ae7dfe359422503f894e885',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, half 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adc268cdbc30500f3009f5de2b2f0f67a',1,'operator*=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08c1f916302eb9d48c93f8b7260538fe',1,'operator*=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adc8e82b8f593b12c6d405e2250ab0f62',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4611728172afea51860a77fdb06cafa0',1,'operator*=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0b8736e2ae24758b6e24ea72668df5b4',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad920df9579603f0b0ee2689eba330617',1,'operator*=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3ff4ff59f411010ac8502cfabda4bd6f',1,'operator*=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abd3d82e2dec1847e97eb8fc3bab2985a',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a738078eb7d5ff94ff48156a555d763a5',1,'operator*=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a435f2f4256aadb1b57fd62bb7f733cf7',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0e4377b120d6305335d296e031ee5b30',1,'operator*=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a917354f77eac26189da8a2f610a00074',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af725f935bfa0405e5ff17ede3ac47283',1,'operator*=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7c56980c234a04260b8b19298085e526',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab840ff9de0cdd0e9afffb8baa2a850a3',1,'operator*=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a73416a7415f3fe31525e33419e5e8aab',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a16978f4b16d954ef4d4cf0f32f6c0b94',1,'operator*=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2179abbc91ce8763e96e39e1917bfa6e',1,'operator*=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab070ea4676d10a10ff3e9379a4068a57',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0197e039d4c65bf49649a6f250c2d436',1,'operator*=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad3565cc6fd1e088d052b1108aa065851',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a711693988c437c2fb4d7da505982fe21',1,'operator*=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7dbf0c75df4817cb4ef8b60c417a89d0',1,'operator*=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a323a80492cd17a49e2c3dd18f8c8b5cc',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adb465776d3868bda0525d632ffc4d129',1,'operator*=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a12a98d71d670b409b8065e0d61672d55',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5d00eb2ec2b0e15b2753d100694c45ae',1,'operator*=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4126fb7ed5bbb27a2332c543cf56a337',1,'operator*=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab092d9790ef20fc0386707530aee89db',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abff1fd2439e31e6e64a3d2fdee3c7821',1,'operator*=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a625dcb133f1f953f263e6200399866c6',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08b6071245513e1726ec68e3b63edc53',1,'operator*=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91',1,'operator*=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3796dcf819adb1ef8152f57ba63ff6b1',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, 
_MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aaab79d0b4c9e9bdc059ace6ec58c5b00',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a0dd3893abc8986901872c8365ab1509d',1,'mlx::core::operator*=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a3cc5c154e4ad9a83ad43da8513146fdc',1,'mlx::core::operator*=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a600e77dbc72e78207b5f5dbf4b298781',1,'mlx::core::operator*=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a54833be1d44bc3adfc9ea218fc3685bd',1,'mlx::core::operator*=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_2b_24',['operator+',['../structpocketfft_1_1detail_1_1cmplx.html#a76447ef141c8732d57421749fc81b236',1,'pocketfft::detail::cmplx::operator+()'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae2adde594b5a4853f6bc78263a957d85',1,'mlx::core::array::ArrayIterator::operator+()'],['../backend_2metal_2kernels_2bf16_8h.html#a09c1a797eb7f43742578680899932f50',1,'operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a551b970f73bb4a3b287653021d000b60',1,'operator+(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a43a225e7e548bb041f3a5d844faaf0da',1,'operator+(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8b6c3fd9d068a2159084359df8b9b449',1,'operator+(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0a5bfe15d95ba540795f4c25ebfa4f07',1,'operator+(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa415ce182fe7582d885fe633fc3527ce',1,'operator+(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a62f891b7dbba0000749cf338f594bedb',1,'operator+(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab43932322f81bf322aa1b0deeee9a987',1,'operator+(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acd15d46ea5827a2a39898ccbb8352eb8',1,'operator+(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a006763fae6e0577fc168ec9446f0f747',1,'operator+(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a12a47e8ac0be788edff57ae0a96d7830',1,'operator+(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af87dfa2122e9c76042dc41fb7f338a87',1,'operator+(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af2737d09c887ee8cd43fdeabceddbe82',1,'operator+(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189',1,'operator+(complex64_t a, complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a12ff4f38aa8474bf76770c7b8e3e18cb',1,'mlx::steel::operator+()'],['../group__ops.html#ga26e5a043eaaaf066d1400adac9c11d0c',1,'mlx::core::operator+(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7d0ec8d01e7cefa6a6b25f11876761b5',1,'mlx::core::operator+(T a, const array &amp;b)'],['../group__ops.html#ga7cc080a4f9d4a667f2099aa0dbfefadd',1,'mlx::core::operator+(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac14b984970cafd8fbe24d080949515cc',1,'mlx::core::operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab076069c6f0047c548a8dc29d35dd36a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aab9d96b0a168f4d05146000a6212b5d8',1,'mlx::core::operator+(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac4e6f03d7e4ae701b4eefa784f36185b',1,'mlx::core::operator+(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a4cabd600a5271b0d416c91e8d31dd9c1',1,'mlx::core::operator+(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af26df9dc279d71b7cc10892c72162b58',1,'mlx::core::operator+(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#ac3b97eecec9bd8efb313f8f201560343',1,'mlx::core::operator+(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2e3bb121cbde30c2e6d806df0d41ff59',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac87ecce4b44b0826e666a169ddc6f878',1,'mlx::core::operator+(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aed3d9cd32698ef0fe65b1280f103b3f5',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint32_t 
rhs)'],['../namespacemlx_1_1core.html#a6fa13b9359cf3f575fbda5260e6e035d',1,'mlx::core::operator+(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af240a6471ff827819192808bffeb857a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ac25a05679f312b724c406d8b282803c9',1,'mlx::core::operator+(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a54863a54f258acf2b5c734950618e4e1',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9f81f5ea8909db9660197217612ee446',1,'mlx::core::operator+(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13e26c38da0a4e332e0ae4eb0aed9cb8',1,'mlx::core::operator+(const std::complex&lt; float &gt; &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a59bb13a0bb7f748c8de34415b248bc57',1,'mlx::core::operator+(const complex64_t &amp;x, const std::complex&lt; float &gt; &amp;y)'],['../namespacemlx_1_1core.html#a38a44c412c8be4c8b952d3082cc7db74',1,'mlx::core::operator+(const complex64_t &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a011dbdbd2413e59e744cf82b05431340',1,'mlx::core::operator+(bool x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a230e3b7c479add1b171fa0aaa3a8b13c',1,'mlx::core::operator+(const complex64_t &amp;x, bool y)'],['../namespacemlx_1_1core.html#a3a6f43c2485f0d42293184f1aecbeaee',1,'mlx::core::operator+(uint32_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a766157c5d5d00fdf3da95eb7cb2981b9',1,'mlx::core::operator+(const complex64_t &amp;x, uint32_t y)'],['../namespacemlx_1_1core.html#a64dceec2bb03eee963a2a1bc1ac69284',1,'mlx::core::operator+(uint64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#ae36badb78a17cd7d13663a69645fc328',1,'mlx::core::operator+(const complex64_t &amp;x, uint64_t y)'],['../namespacemlx_1_1core.html#ac1afa5d4c856e4b58109eff086e70ffd',1,'mlx::core::operator+(int32_t x, const complex64_t 
&amp;y)'],['../namespacemlx_1_1core.html#a8978def3c2cfe2a96314d564613b80db',1,'mlx::core::operator+(const complex64_t &amp;x, int32_t y)'],['../namespacemlx_1_1core.html#a5b8af5ca4c0e37aba0b7530542bd64c2',1,'mlx::core::operator+(int64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a3eaa72850205c18450c3af9a01cda219',1,'mlx::core::operator+(const complex64_t &amp;x, int64_t y)'],['../namespacemlx_1_1core.html#ad38b38a3faf050735d45eed4438ee27a',1,'mlx::core::operator+(float16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a358e66ff205bda3e8542427b6d2edadc',1,'mlx::core::operator+(const complex64_t &amp;x, float16_t y)'],['../namespacemlx_1_1core.html#af56d4b85e329e39a825c01a50e3a2522',1,'mlx::core::operator+(bfloat16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a806a495a129ebaab69cc57ca7db831d6',1,'mlx::core::operator+(const complex64_t &amp;x, bfloat16_t y)'],['../namespacemlx_1_1core.html#a09fc6ebda917969383783a112a8547e7',1,'mlx::core::operator+(float x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a7ed0e2cdb65612f54e67166762cb6408',1,'mlx::core::operator+(const complex64_t &amp;x, float y)'],['../namespacemlx_1_1core.html#af7577c91b8c43682f0ebc9eb9758aae4',1,'mlx::core::operator+(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#abe36af9951afd8dd3ffe90ceedeb7f2b',1,'mlx::core::operator+(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#afb9f780dd056a4f975518f71a3b021ee',1,'mlx::core::operator+(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6a8e093b24c4c789b7cd160f7e7f7de9',1,'mlx::core::operator+(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#af3a603690fd3de9e4f7f2035a4d25621',1,'mlx::core::operator+(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afa2a4bccfeea9688ac922cb638341511',1,'mlx::core::operator+(_MLX_Float16 lhs, bool 
rhs)'],['../namespacemlx_1_1core.html#a6111e94d51de12391e5d68b765f28fc3',1,'mlx::core::operator+(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7c7dd6d346e0cdf398a896f2c6958258',1,'mlx::core::operator+(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a00872a443f462b0ae0a30c84fb001bc0',1,'mlx::core::operator+(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4f5d80d03bae6d8d90455d3c47a8c116',1,'mlx::core::operator+(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a78f1f388f9d81ed93f60311f4645d8d0',1,'mlx::core::operator+(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aa43e1d6958c5d5a6fa9a625a1660e741',1,'mlx::core::operator+(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ae877e1d5e3cf57734da8b49535fe3fb3',1,'mlx::core::operator+(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a9a5ae769f67f886d59c8e292a8218550',1,'mlx::core::operator+(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a058878237ce50baa4c909d8d15448d7e',1,'mlx::core::operator+(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a95fd207028f125eefbafe9e0522407fe',1,'mlx::core::operator+(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#abc6425a3fbb386f5ea5964b42507e989',1,'mlx::core::operator+(bfloat16_t lhs, float16_t rhs)']]],
-  ['operator_2b_2b_25',['operator++',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a3efe69356a84d0d4438f033992fcbd9d',1,'mlx::core::array::ArrayIterator']]],
-  ['operator_2b_3d_26',['operator+=',['../structpocketfft_1_1detail_1_1cmplx.html#ad4e69dcd89bdb7764c9c5807168f911e',1,'pocketfft::detail::cmplx::operator+=(const cmplx &amp;other)'],['../structpocketfft_1_1detail_1_1cmplx.html#affa618d8850a7c232793b7c61db6d184',1,'pocketfft::detail::cmplx::operator+=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400',1,'operator+=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a251780ac4592cc2b1a543e417ff57770',1,'operator+=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a24381d991c2d570aa953694f396a69b5',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7595740d4cc12924905d6bd1b99ee4da',1,'operator+=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac1498acb8c3623b5f412f70ab6a6528b',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abce5ab327110c164f054b43ed47f79a0',1,'operator+=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae0c70198e236ffe1a98f79987c686419',1,'operator+=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a13b3338935440ae51ecc4a356093efc5',1,'operator+=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5a0cb8544b4ebd2906ba8e7f2868e8de',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7b134429ea0c8493800ff8b465410f9c',1,'operator+=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4154f90ab7857ca856f9e15fe1bf5acf',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, half 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab9ae6a51e2027b02cac9966e05f3ba68',1,'operator+=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab93ce536eb7998bee00de4af868e31a9',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad0ae9e2b4874f991a2c853e1c1fe735d',1,'operator+=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a194a6670cc25ade35a24b566f31af785',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3d0d689516c99003659c5d026847bd2e',1,'operator+=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a007f58508b98bb79e5c323ed0dec89b6',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa7198e580e2a83c1fd01a4b6fdf86a80',1,'operator+=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a15573fefd880adefbba079b1c1bd8082',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a104cf94cb9e359d1b6ef92ced2ce0c27',1,'operator+=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa04cfcb52191fd23205a1a3572b46ae0',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad684bc2ae1a2a627cd3e4a4c641e2d77',1,'operator+=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad1e28448e35f4934075b397c34ba3d66',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8ad16afd7f1711de83c0cec5af868f76',1,'operator+=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac45e9ca0c7155caebe3d0f7261518077',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3c62ac679d6aa515144d40ebafe4a188',1,'operator+=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9ff5ab3aef1057fa083b53a65c8aba03',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae74bb0a3c12cd1a23f3d29ce307d6fb1',1,'operator+=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac188bd19f236b098d603b0d8acd08921',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aef9fa600d107b509f2e3df7d6b080e01',1,'operator+=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af5713afb3a62967a02c3c20661951ee4',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7f1b84352a3ed6171444a43da1fc7e92',1,'operator+=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af1983edd26245e6e51c6e47354095e32',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8cd55d1a579540eb450e12a8a8a950be',1,'operator+=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a588ef0f7e03f306758524d378278976f',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a74751abec7086f85f4f26ced44f1ca1f',1,'operator+=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4dd3cf0e5aa116ff330352a50c18cde7',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afb9a0e18c0e40c77e6143fb7d84ebfba',1,'operator+=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adf0cfd9a608a6fb3d57933e32e7d81d2',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4bd92db6c8b9b5dc96332c7ae3eff8c7',1,'operator+=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5d628a5bc4fa755610392f47a523a1f1',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7c790442f77f2437b482c4a55e224fc3',1,'operator+=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a77bab4481b41be50297b257e95058706',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7816a97d16b1d2f8a90227bb1da2f6ac',1,'operator+=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac244d140c6149726ea44174d3e836ca3',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af802541c4c65ee4442acd495de4d27fe',1,'operator+=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac06eb2fea47a09a8a8abdaa1aa9b4603',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5263b2463fecdc97f9521d00bffea059',1,'operator+=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b',1,'operator+=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aee1bdf0ab2e445293708b476e8cfde3b',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, 
_MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a190e27077f0fba642a86f5c8f488bcc2',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a9f2c9d2f21fbf9fbbacd940c6967c9d1',1,'mlx::core::operator+=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a0b1b3c48afc0a785282e43435bba8418',1,'mlx::core::operator+=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7b763db8194e6fcb1b87eab143dfa47a',1,'mlx::core::operator+=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a827167f6a1ae55428fd218ddd51ec3b6',1,'mlx::core::operator+=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_2d_27',['operator-',['../structpocketfft_1_1detail_1_1cmplx.html#a460da5db36d1c72fb1ed3496fd3abde4',1,'pocketfft::detail::cmplx::operator-()'],['../backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855',1,'operator-(_MLX_BFloat16 x):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85',1,'operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a891aa4bf46c20a26a55061736aba25f1',1,'operator-(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7ad7ff44a3200853711869f7a577d931',1,'operator-(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af69ef8f1d8ecae0e6f755bf1c46cf075',1,'operator-(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5bd875a54b79b2dcedf674807c3e53c5',1,'operator-(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab02f8646b47806e1d2038f248df03f06',1,'operator-(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab27b26182c7c6e08af37e6d511fd9253',1,'operator-(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5868c85c988ec3432cf86d7df40e464d',1,'operator-(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad03ef47e6cc7521bbfb45740dee20f88',1,'operator-(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab789f8a400512ff27e36b3373170f0c5',1,'operator-(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7f601b22ecc480132d82ad782e5363bf',1,'operator-(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a152366ab4e2ccc867e919af6c74ced91',1,'operator-(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a42bead8ef0beb9f3452128d64cd4df9d',1,'operator-(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b',1,'operator-(complex64_t x):&#160;complex.h'],['../backend_2metal_2kernels_2complex_8h.html#af5608264cf920688607059b4e8cd3117',1,'operator-(complex64_t a, complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#aca8ef21c16984ccb329b3bd0c1e4be48',1,'mlx::steel::operator-()'],['../group__ops.html#gade2eea48989f4caaf36e89f7bd2a8816',1,'mlx::core::operator-(const array &amp;a)'],['../group__ops.html#ga0c7f3cb36d4ca516c7a33142f88b9181',1,'mlx::core::operator-(const array &amp;a, const array &amp;b)'],['../group__ops.html#gae68d3d0691ba951501218e98439f3465',1,'mlx::core::operator-(T a, const array &amp;b)'],['../group__ops.html#gaf5e5d882c51ad0a0ea315c274d5439b2',1,'mlx::core::operator-(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a622ce842fe44e4b6a95e03242341b459',1,'mlx::core::operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af32a99d930d49e9b178472d7a65531ab',1,'mlx::core::operator-(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3555a2b31fc0925850d3240e85e03ec5',1,'mlx::core::operator-(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a46080889fd9e5c3f9916508e97dff5ad',1,'mlx::core::operator-(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a9ca27fd1e512c8ed126342e565da12ae',1,'mlx::core::operator-(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3803f8d36558d32bb7dd6e580ea683b4',1,'mlx::core::operator-(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#af5d865528989ca66b3d357e5ce4e0300',1,'mlx::core::operator-(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#afb784b960f55aeb4edd7f567fa74d443',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int32_t 
rhs)'],['../namespacemlx_1_1core.html#a29cbacf4b399c24728fb0808fad498f9',1,'mlx::core::operator-(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aececc0e451237aa6c0d1a2c3d828c86e',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a383a26cc2689c98fd6c4435ade8dc669',1,'mlx::core::operator-(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad6311ef8df59bdfb212b5cf8169246b2',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a23b7329bc1c93c8ac0a1f576565fefb0',1,'mlx::core::operator-(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad8d650bf63998abd716ee0ca28e1cbb9',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a7339b33201254e9119d99d3a728ded72',1,'mlx::core::operator-(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a064318b7a16e5cb6d0a6407501b5c7dc',1,'mlx::core::operator-(_MLX_BFloat16 lhs)'],['../namespacemlx_1_1core.html#a7bae3ff296d9a60ff3c7e448f7fbc6bd',1,'mlx::core::operator-(const complex64_t &amp;v)'],['../namespacemlx_1_1core.html#afb5069ecebdfd9d388c26f83df12c93c',1,'mlx::core::operator-(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d126e3f3fa9f8c1c1ae1b09f94df487',1,'mlx::core::operator-(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad04f1ccd2cd7c487a2f2aaa055939f64',1,'mlx::core::operator-(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a15eb2ea76508ff823fa0591e811d0b7d',1,'mlx::core::operator-(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a96d9577db38d6809d022893e32feeda1',1,'mlx::core::operator-(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5d9c02765c1672930757416411567bf2',1,'mlx::core::operator-(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a6105d3b5266666b7c6bb9469285a9ec3',1,'mlx::core::operator-(bool lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#a777aa772dfb205b25d26f3180d98a2f6',1,'mlx::core::operator-(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a085eb092f4ada47f8169de62886cff90',1,'mlx::core::operator-(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab25e5d211e2c8785b45c3a81a6282e2b',1,'mlx::core::operator-(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#abf5d09561a81b0f0b32d59d77e32e16f',1,'mlx::core::operator-(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4ce6867dbb4d1631d1870dac14022dbb',1,'mlx::core::operator-(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a8a049e646e0442064cfe9e202d7047c5',1,'mlx::core::operator-(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a78e2a1cfc65453185bcca13bd4f523cf',1,'mlx::core::operator-(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af143cf68673e06390d4bb2ec2892bd22',1,'mlx::core::operator-(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a46d502dfe0b027955950d4e716c2eb26',1,'mlx::core::operator-(_MLX_Float16 lhs)'],['../namespacemlx_1_1core.html#a2631e78c6f0a602f6754ac577ec75f83',1,'mlx::core::operator-(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a73d79cbd75d543d0837b8a51bf103f9e',1,'mlx::core::operator-(bfloat16_t lhs, float16_t rhs)']]],
-  ['operator_2d_3d_28',['operator-=',['../structpocketfft_1_1detail_1_1cmplx.html#a12441ff423274bd1b54245933d69ad7e',1,'pocketfft::detail::cmplx::operator-=()'],['../backend_2metal_2kernels_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca',1,'operator-=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac2f1e1f2365cfa531b1519aa9ff67695',1,'operator-=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a513501355a5912a1263fd8b10864142b',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab4f4ecd62c3d8b3363d02019573dc9f1',1,'operator-=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a92d1348f201d78fcd474f75d5b23ef68',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3eefe9a7f5fb226335ea687012f32d5c',1,'operator-=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aef62c7e3e494b6a511a7833c0d942a60',1,'operator-=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad30726cc8b69fd300d33c2a46e123c28',1,'operator-=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8859b5b8dc241e4f58243c85d2630cc8',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7003e1e5881e3d106257f22b6a3e59fe',1,'operator-=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3165e37d393be50c2cfa9ddcba153684',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a76f5bd895b7214cbc3cea3440992718a',1,'operator-=(threadgroup half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7167343d90eb70e5a0d5fa9ec5398e94',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9b31c363ebc93d592b6fa0e27b00335a',1,'operator-=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a967a1d7b5664f616e5b6f2d257367f0c',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aff19193e1b2cee29a8737318e95cc74a',1,'operator-=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aede0cc4179507b739849948f1a2fed4b',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7e1a6056f9c96f3c89fe204dbf103be5',1,'operator-=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9d06cceea5c179bcc608452188bd7d6a',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0aa9ffe056f49fda181bbacbd60556ea',1,'operator-=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ada5685d99c2d6708d1c4ef826d68e879',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a726cecf778b8584b6f7c37db1b064576',1,'operator-=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3816a35f8468156d59c239256c12dcf3',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa332fae098e7c6dc23b98bc0026f1070',1,'operator-=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afb3cd302e0b78902c62111dce4494fe8',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abb884888f14086cc674657677cb4b8bc',1,'operator-=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a38bb89f925eca4f9c042f6ee7a2c0193',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac30c580713f354916088a7dc049ae4cd',1,'operator-=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a715c824ee8c87e0256114a85624d9949',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7bc91aaaf476a37063264d1d53d862cc',1,'operator-=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab155f418f15cabd86ff942c6f9472ddb',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aaa66dc6d7b2c5efbfaa97ca9c7872bd8',1,'operator-=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a696978d9401e09200045b2d8aad045c2',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae998d8f423a9fb73405cfbd4b836bc72',1,'operator-=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a279d09ab8542f1c1a8dc8173b65946b6',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a491dadfae957cd7cc0c36188d910f6f6',1,'operator-=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9a837c3b9c4e42f53d7cd1ed0d266e2f',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acf7af2284269544064b68e807064bba4',1,'operator-=(device uint32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a28d297705e29009197418546ef435393',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a948579a4d9ba276523190b03b09578fb',1,'operator-=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5a4b98a0a11db5b77cf9168df37c8bc7',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a31a3d8f2ff8038f7e0d717845c039808',1,'operator-=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1dac193d9f1c8c0eb4473441895f8c58',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad2817d53fdd4b112babfb6f0b38c8f39',1,'operator-=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa705d87cf4b78e9d7c6b07dd0c66cac6',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a542affc376726840647a6e93acf2c1a7',1,'operator-=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#add18cfe4c0d38e95c6dff6bab3e7a932',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab1de7e7e7304ff3598925d2e69134764',1,'operator-=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c',1,'operator-=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adda64cae388baac1f138b06dc8595237',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af20874a61c6c3f4c3fd045a96e806644',1,'operator-=(threadgroup _MLX_BFloat16 
&amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a8b8a55690df46d97fcfc2a60120783af',1,'mlx::core::operator-=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab03949b1f60fa035ce454a894cd73ae9',1,'mlx::core::operator-=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adaf70bbfb3667df0d08fd3c99896e20a',1,'mlx::core::operator-=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a321c98e5a78621d3c9a3895f707f2f1c',1,'mlx::core::operator-=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_2d_3e_29',['operator-&gt;',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966',1,'mlx::core::metal::CommandEncoder']]],
-  ['operator_2f_30',['operator/',['../backend_2metal_2kernels_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c',1,'operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aacaedf12f862c76457133336dd6fc446',1,'operator/(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a584a513596de20663dad951a5b81695e',1,'operator/(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad8f7b11669736fbd6ed2e28211d877d4',1,'operator/(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a59515695ebc48844345fa5120511aed1',1,'operator/(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8c8ac6736440fdca366ebdefe2a12b9f',1,'operator/(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad6859b04680d0d26d75fd6c4dd74ee24',1,'operator/(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4720cc79ab2b8e39952ea9ef20e51250',1,'operator/(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a72d10ec0e62949247da129eb3a83fb9b',1,'operator/(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad6399ba2b8708899739b4cdbb44add8d',1,'operator/(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a998b1ba877a606aedf722ab46b290403',1,'operator/(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa3277ae33976c70f7bd937ddff027b72',1,'operator/(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa708a970a200822c99c0489f389469fa',1,'operator/(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35',1,'operator/(complex64_t a, complex64_t 
b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a6bde717aca2051499f73a3eee199bfdd',1,'mlx::steel::operator/()'],['../group__ops.html#gaeedf77f722b394429f1a7f6c367883bf',1,'mlx::core::operator/(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7366ec7f453be2a4dc449f0faa1bf554',1,'mlx::core::operator/(double a, const array &amp;b)'],['../group__ops.html#gadfb324ae9b4feb2c7ea0ac6ade639f38',1,'mlx::core::operator/(const array &amp;a, double b)'],['../namespacemlx_1_1core.html#a7573ac3b93ddecd69e9c88a26fc84ba9',1,'mlx::core::operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a40e868dad70401d9aa9ee9c32235c315',1,'mlx::core::operator/(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a7587c28fbd2023b134e5fc12bb0dde23',1,'mlx::core::operator/(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a92cdd377c408becf4cf83c1ee9b7085d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef89566301cb133d98c8e7bdd2b7bec6',1,'mlx::core::operator/(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a94e7b51185590492b46916685641276f',1,'mlx::core::operator/(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a04584788c08180835219d0ea1e2b97b1',1,'mlx::core::operator/(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad5af96e2ff09d207eb1e1980fe3e7c2d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac2217bf760038cd011781158923149ed',1,'mlx::core::operator/(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aea414c04bddc4b9b609262e97398f1b4',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a27fe23230cd082c0363b9451b731ce6b',1,'mlx::core::operator/(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abdd9bb8fb4411e5924f3eb7ef1bb52f8',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a50bae338a7353f8b0ed3441071bb0cf6',1,'mlx::core::operator/(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aab26a3284dd3ac7d47c8b5b3a3290ce3',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a749f48db01de38f259a0c6750a97fa77',1,'mlx::core::operator/(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a32a6a08a2a4652975b0a1bd1fcf3eafd',1,'mlx::core::operator/(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4b66fb38ddc5cc0c2489583d5c499602',1,'mlx::core::operator/(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a45726f1905b709cf8253e6efa046027b',1,'mlx::core::operator/(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afd4170c1e364384f30e6bae341146fa6',1,'mlx::core::operator/(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef85739d150b9d5609973da8a3f1086a',1,'mlx::core::operator/(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af52a941f8ed9b25eec91402c7b9e281f',1,'mlx::core::operator/(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a477cade78296bc85894170f62db68870',1,'mlx::core::operator/(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a22f5a2257e11423fc2fe18e2dce91590',1,'mlx::core::operator/(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a640d3574dfe6ad934c720ae8bdd78bfa',1,'mlx::core::operator/(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6f65d8fd0cdddc96fc01f6af95804873',1,'mlx::core::operator/(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a517019d42d4e426b7b98e1c719bb47ce',1,'mlx::core::operator/(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0beb7a223c542015a4eff4aed814a9dd',1,'mlx::core::operator/(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#abc9b1bd5018d46514bc19d23db2e5063',1,'mlx::core::operator/(int64_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af22937df654ddbd6e398ef12764d18c0',1,'mlx::core::operator/(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a775aed5f49b530c57e71cbac81404d45',1,'mlx::core::operator/(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a97efcd96d6be666e5608034ae77289ef',1,'mlx::core::operator/(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a899851f85dbddd96f9d36319b82542a0',1,'mlx::core::operator/(bfloat16_t lhs, float16_t rhs)']]],
-  ['operator_2f_3d_31',['operator/=',['../backend_2metal_2kernels_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095',1,'operator/=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a90a1c5130db515db48624d8587edbb91',1,'operator/=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a65f30a2dc199134e35bc7c5d431b2263',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7172d84db640e6c49dff0d08dd64b53e',1,'operator/=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acf7cb9927bf09022088401923f2e1916',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a86b2a001cbec0d3a8d762a3c7ff47b0b',1,'operator/=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a744f72ba83522fe3cc2a49a007b42543',1,'operator/=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a77c678665b34df7652dcde053ca73185',1,'operator/=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae0614b6b199d8a65ae95d4621b118b82',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa846fde89c7d2d18b18ef180a8a9c8a3',1,'operator/=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08e778be18e4a291c108fcc528b981d3',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a6b9e49ad9ea256d2d0220c0d81552602',1,'operator/=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab933bc3cdf9adfea10ab9dba5292c812',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int16_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a25e7c5d2ecf3375756d59074f333858f',1,'operator/=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4ae4a80fde67eea9a0a37b2803946544',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a912393b7208fa45bd1e87f30b218b68b',1,'operator/=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a18963246f2b640874bef6dca7049f64d',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0e2c2c2cb50b3a55ff213f18978aca35',1,'operator/=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a64f1136b17006f168ef837e17240814f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae46d75b8046d557452d74513f1106710',1,'operator/=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08d2460e259b9106d90d889481ad60d5',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0f7fd418408806ef498745c6fdb2c062',1,'operator/=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac703495cb370b52526a5a2d36ae26038',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4ca11d43174baf0a729f93b35eabcbea',1,'operator/=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9f835a0a80c411580c97b65fdc5bdfd3',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a17f47ec9cff60f8e1b3477a2793b7ac0',1,'operator/=(device int64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5be23e296bbed3a885586a6424b1666e',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afba39221eb54e272aae79910b3cd7ef5',1,'operator/=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac057d95a2bf087575584aa6f9a2c6bf5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab986ae2cec780a1f494b7b4468b7ba11',1,'operator/=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a44522c2304c6396bbe6b9d32000f4b6f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aef8e7e499ea9d432aa743d83c076f945',1,'operator/=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3a0a3edbf1ba2314551454059c3f422b',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acb9f0aef9fbdfde8a4f46e33b0d6c52f',1,'operator/=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a303dfcc81ffd355f866f863d7d9f0fa5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a359edd4bcb8776861ceb26a3005624c0',1,'operator/=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adc9f32cc6f40768df4285fba2e4783c7',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae71f66d814a03f6377c9d86cf0a2b5d7',1,'operator/=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad0125b6baba3065a87a174ec27aa9a61',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5cc74ad3e522d7104e6e2117751151ad',1,'operator/=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab3b594321fb42b0c2da99954d1e0976c',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4a0023e2fd08875156cd6ef747fbb5cd',1,'operator/=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4358ee606e66ba2081fcf94f9c3b5915',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad1e7ef6f065695d4b1d017547b60ef62',1,'operator/=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a284dfc702f0f67b9c233b87162eeabdd',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab8f211ea896fc5190004f3ad6ad8932f',1,'operator/=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7e1bcf3bc06cbcbc304c0cdf729802bc',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abbe42648a46092137b303ccd08f7df86',1,'operator/=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c',1,'operator/=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a94686039356dfa9aa45608a8b0562fdc',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa251d6483d3b099d1b5311fbe6f0bce2',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a045ff27257cb6d8ab7a94771ba5a17e6',1,'mlx::core::operator/=(_MLX_BFloat16 &amp;lhs, 
const float &amp;rhs)'],['../namespacemlx_1_1core.html#a58112951a56a0f9f8c90b60fe74f9508',1,'mlx::core::operator/=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae736defc89a04fbaf7627ad2695bb838',1,'mlx::core::operator/=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab1f260710251256ef737dd59be9e143c',1,'mlx::core::operator/=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_3c_32',['operator&lt;',['../backend_2metal_2kernels_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25',1,'operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aab02c65bc38ea66335b2192ead4095a8',1,'operator&lt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae91686513e284bcc9635833744bbdda1',1,'operator&lt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2486f3b5de85b0d57f458d8f21f82b42',1,'operator&lt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a435a2aec4c777b4b184ff5d24992e8a1',1,'operator&lt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abdd04257e6a73883b5f56f1186d0e906',1,'operator&lt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a69984aaa05ae1d4fccccf7f57e8ecb4a',1,'operator&lt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a501cc01d5bf15d9f03aa28545f9624ea',1,'operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1b029e4ca72125a5f9471f582c819705',1,'operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0736a76f56578d26ba1422dc8b744a18',1,'operator&lt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a24b1fa8998c892f90f8dde7c34fb10a5',1,'operator&lt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af80ff2020ec2c4b406c5fdae3fe55e63',1,'operator&lt;(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac03f6eefb836373d37dc280b0d813d78',1,'operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058',1,'operator&lt;(complex64_t a, complex64_t 
b):&#160;complex.h'],['../namespacemlx_1_1steel.html#adb5f24b57d98214fc215a06475f21412',1,'mlx::steel::operator&lt;()'],['../group__ops.html#gaee41e2b8f61d563200ff03575ac1d6c3',1,'mlx::core::operator&lt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga1ef8ea11cf15ce628c54201fa42748ef',1,'mlx::core::operator&lt;(T a, const array &amp;b)'],['../group__ops.html#ga95e72226dc7a79c40b3d16f990922050',1,'mlx::core::operator&lt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a987d631e1508e8df55d98ddd57e4d086',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad3fb46370cd8f0992866fad9e2c64a3c',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3026691bf7ee5095243a8611bf3411aa',1,'mlx::core::operator&lt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0d42d6c1d5f77a96e2f296b8ebd79ee6',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab5ce08a7de0a0ca00d61f7a7f8ea3ab4',1,'mlx::core::operator&lt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abce8b7f24b61e5ec0f9a3afe20845caf',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#aff97612627ae1ed260c43c0a7af0d306',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a9119e518234df7923cae2b3802d59bf2',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#aefb9b05ce8864ada99a920ab32017b89',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abc55f3676c2d112a6e9ab276bd6b1796',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#afe6581a2c45f24d7fab1e4006c1e3c70',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aca1d50cdd9506481dcc4cd1ad4a4f734',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#a310720f513b6a2490e9df80c65f1bfb3',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a29e457a170b6cefb6ba1e394c96c6f7b',1,'mlx::core::operator&lt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#afd4519985b6b207ec41ad8530d1036df',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae1e41ca94022e43a00cdfc5845102daa',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac80f4022bffd95b57526685ce8e1cbc1',1,'mlx::core::operator&lt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3a8f6f0af477788c4f0aa98abfc5f1ab',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a3728ed9b6cbd152bf675251a0501b466',1,'mlx::core::operator&lt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5b9ad811a5e1358100c5423dd70ea387',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a5c77e1db83995d3e06a8a26265bce5d6',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab8a0a3f70664049b35ce1887bd8ff5c2',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6652d93bfb2d426e261a1712a181a4d2',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03758b8d13da2de07cc4f4fc45d2854b',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a325161b81a9ff179fd37d949780a17ba',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a92eca79fce8233e4299343eee3996511',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#adb016662b8f7eb680abfe1a421eabe72',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['o_5fstrides_1',['O_strides',['../structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4',1,'mlx::steel::AttnParams']]],
+  ['offset_2',['offset',['../struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791',1,'LoopedElemToLoc::offset'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a3a18944c158e2747a6ddebb420299a3b',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::offset'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af792b1fd4e8286f97b9b863c127a2d9a',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::offset']]],
+  ['offset_5fneg_5fidx_3',['offset_neg_idx',['../kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8',1,'indexing.h']]],
+  ['ofs_4',['ofs',['../classpocketfft_1_1detail_1_1simple__iter.html#ab59481ad9c8f04addb907c3ebb89f8fa',1,'pocketfft::detail::simple_iter::ofs()'],['../classpocketfft_1_1detail_1_1rev__iter.html#a78c3b4ad19edf9d20cab40ad109e9dd1',1,'pocketfft::detail::rev_iter::ofs()']]],
+  ['ones_5',['ones',['../group__ops.html#ga54eeed455321a54c8e72e16552a978f2',1,'mlx::core::ones(const std::vector&lt; int &gt; &amp;shape, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga6cf4b5e8580e4436302c519d05897dab',1,'mlx::core::ones(const std::vector&lt; int &gt; &amp;shape, StreamOrDevice s={})']]],
+  ['ones_5flike_6',['ones_like',['../group__ops.html#ga94f8d3b1906fee99da9cbe39f7be7d42',1,'mlx::core']]],
+  ['oofs_7',['oofs',['../classpocketfft_1_1detail_1_1multi__iter.html#aae63e67caac095d474ddd32daa5ffa34',1,'pocketfft::detail::multi_iter::oofs(size_t i) const'],['../classpocketfft_1_1detail_1_1multi__iter.html#a9236047e7419e5d21379cbf95eb3a78e',1,'pocketfft::detail::multi_iter::oofs(size_t j, size_t i) const']]],
+  ['op_8',['Op',['../classmlx_1_1core_1_1_bitwise_binary.html#a6f8b5d455d0c1770428a6bef1608f23d',1,'mlx::core::BitwiseBinary']]],
+  ['op_9',['op',['../structmlx_1_1core_1_1_default_strided_reduce.html#ac871f55a7ddd205574974cb4492a240b',1,'mlx::core::DefaultStridedReduce::op'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#a1928f07db988715cc177999e386f4830',1,'mlx::core::DefaultContiguousReduce::op'],['../common_2binary_8h.html#a70228731d29946574b238d21fb4b360c',1,'op:&#160;binary.h']]],
+  ['operations_10',['Core array operations',['../group__ops.html',1,'']]],
+  ['operator_20bool_11',['operator bool',['../struct___no_mask.html#ad3723c1e70e46beefd283ce6317416cb',1,'_NoMask::operator bool()'],['../struct___no_mask.html#aafbf8a3201e1cc1abf74dd1f1b7272cd',1,'_NoMask::operator bool() const threadgroup'],['../struct___no_mask.html#a73e9612a619885cbc97cbd8f40df71e7',1,'_NoMask::operator bool() const device'],['../struct___no_mask.html#a4bf336d472bc677028250f76b9cdc08c',1,'_NoMask::operator bool() const constant'],['../struct___no_mask.html#ad3723c1e70e46beefd283ce6317416cb',1,'_NoMask::operator bool()'],['../struct___no_mask.html#aafbf8a3201e1cc1abf74dd1f1b7272cd',1,'_NoMask::operator bool() const threadgroup'],['../struct___no_mask.html#a73e9612a619885cbc97cbd8f40df71e7',1,'_NoMask::operator bool() const device'],['../struct___no_mask.html#a4bf336d472bc677028250f76b9cdc08c',1,'_NoMask::operator bool() const constant']]],
+  ['operator_20dtype_12',['operator Dtype',['../structmlx_1_1core_1_1_type_to_dtype.html#aefdd0fd6a5bbf0197a3996ccd4adea13',1,'mlx::core::TypeToDtype']]],
+  ['operator_20float_13',['operator float',['../structmlx_1_1core_1_1___m_l_x___b_float16.html#aaae72e5340ce91325f1925be36ba46cb',1,'mlx::core::_MLX_BFloat16::operator float()'],['../structmlx_1_1core_1_1complex128__t.html#a3e2faf180c0b785646a0e4296f709a5e',1,'mlx::core::complex128_t::operator float()'],['../structmlx_1_1core_1_1complex64__t.html#a90d224dd37308345086bb9cc882ef6fc',1,'mlx::core::complex64_t::operator float()'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a363de5054f3673bddc90293fc3c9bb99',1,'mlx::core::_MLX_Float16::operator float()']]],
+  ['operator_20t_14',['operator T',['../structcomplex64__t.html#a70e9b16031eeaff3baa601f400023fcd',1,'complex64_t::operator T() const thread'],['../structcomplex64__t.html#a4f3beea7ab6001189b782a74d1746b67',1,'complex64_t::operator T() const threadgroup'],['../structcomplex64__t.html#a9f4f7eca89ffe6c8d126a4145df6d9f2',1,'complex64_t::operator T() const device'],['../structcomplex64__t.html#ac33e2e5263fec76a4fb4418c6e1d8d14',1,'complex64_t::operator T() const constant'],['../struct___m_l_x___b_float16.html#aa7dfefdf0d15e102d2b8258c9ab01836',1,'_MLX_BFloat16::operator T() const thread'],['../struct___m_l_x___b_float16.html#a2546a8afa77e14ed5b3c5da79a281260',1,'_MLX_BFloat16::operator T() const threadgroup'],['../struct___m_l_x___b_float16.html#a1d523f87740fcb852db6ab57896c245a',1,'_MLX_BFloat16::operator T() const device'],['../struct___m_l_x___b_float16.html#a95acd29283024d7093a0bc58c9468a0a',1,'_MLX_BFloat16::operator T() const constant']]],
+  ['operator_20val_15',['operator Val',['../structmlx_1_1core_1_1_dtype.html#a3b3bc059be5836476da3cb88a4f5e9fd',1,'mlx::core::Dtype']]],
+  ['operator_20value_5ftype_16',['operator value_type',['../structmlx_1_1steel_1_1integral__constant.html#a0c11203bed44a6a2c387b365134dcd64',1,'mlx::steel::integral_constant']]],
+  ['operator_21_3d_17',['operator!=',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a971aa511ab2e7ae1caae09556643a0bd',1,'mlx::core::array::ArrayIterator::operator!='],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55',1,'operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6baa722c22d66c7510786bb275cb8cc2',1,'operator!=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa8d9f01582a0a9f01a666d110c74db2a',1,'operator!=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa504a474ab6e00ebe2b1b7ed2f7d1ffb',1,'operator!=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abf5f3040227f021a5b84cf2eda248b2f',1,'operator!=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a347c9bbf816bad2e9e5e91aa448f8b65',1,'operator!=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a33ea086b561c652f25833a5e1ded34dd',1,'operator!=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2bbdcece13148826d3fe33af727bb79b',1,'operator!=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeb1efa47c5f22cc0b35d49ccce73c406',1,'operator!=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa6b99cde403405df1865c989e4ce845a',1,'operator!=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a204d13a881ae8d337f6efbb98673790c',1,'operator!=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3602117b4c61d5cd4fd72fb8e5f68bd6',1,'operator!=(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2721c088adfc9d73cde442d6badd2a6c',1,'operator!=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa4364eda56525cf7576ff00e550175e6',1,'mlx::steel::operator!=()'],['../namespacemlx_1_1core.html#a94d00a1b7f8a4717ab3f26f45e4da655',1,'mlx::core::operator!=(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#ga0ac483d85f23252ca8757e9926d5a3c5',1,'mlx::core::operator!=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga3fecba9f3cb9a19afd8ca492cf509ce0',1,'mlx::core::operator!=(T a, const array &amp;b)'],['../group__ops.html#gaebbf1cfde388c7480159a03c92c9a385',1,'mlx::core::operator!=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a164f109bc19c927b2b3bcc47a5021419',1,'mlx::core::operator!=(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#ad2f9e1c230ec35d5c406dd616e8f4dea',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af5899b4d5644682cb0ac2a488f630d55',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a72ac8edd190601d7a46782582cedecd8',1,'mlx::core::operator!=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8084162ba2dd3f9b89195d2bebc3fbb0',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a514263e63f6825b490203ca586864687',1,'mlx::core::operator!=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a1c482bb3d9f9d4c62dee5865892c1f96',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a0030fe7ad09837c670cdfb7d51279519',1,'mlx::core::operator!=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ade3791bc723b8f10fbab22eadb0f705a',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ad78c664f242cd36247c13868547e3dd4',1,'mlx::core::operator!=(uint32_t lhs, 
_MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab0743a1a1dcb92d40f41ca42d36f242c',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ae7a0f810e546a166c7d05849b5d41f30',1,'mlx::core::operator!=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a676a40637a563f013c725d24fa33fdc8',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9fcb662b1561e4136bac0106cfb63b6c',1,'mlx::core::operator!=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abcca7fd43590c4347e0f5df8f134030c',1,'mlx::core::operator!=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af3ede3688a2e3b3ba8cb2da180ffe151',1,'mlx::core::operator!=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a54f48469fabd1414bef5097bcded0002',1,'mlx::core::operator!=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af8c648e892cbc6973de535aa17dc2cfe',1,'mlx::core::operator!=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abc855e1c0584b64d7d995e33211361ab',1,'mlx::core::operator!=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad3684d660d18a54505c759ab286bd936',1,'mlx::core::operator!=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a8afdda14b14262ab5ce0a00c7745d7e8',1,'mlx::core::operator!=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7ccc479be236f2bf3f7725729c5ba201',1,'mlx::core::operator!=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a26a721b8111fce3a1dec9bf724034cd4',1,'mlx::core::operator!=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad5f8c221a53a89e8095aa39fd1f61867',1,'mlx::core::operator!=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a017b52ecf30b33da4aa8da35ccc43220',1,'mlx::core::operator!=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a43c10ca5fb05ee7d0ee63ba56f8a08a3',1,'mlx::core::operator!=(_MLX_Float16 lhs, 
uint64_t rhs)'],['../namespacemlx_1_1core.html#a81284b6ac737f91a8d1ffbbbbf938fe5',1,'mlx::core::operator!=(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_25_18',['operator%',['../backend_2metal_2kernels_2complex_8h.html#aaf53122a07c8eca858b5a8e38ae280e0',1,'operator%():&#160;complex.h'],['../group__ops.html#gab3bfbf82b1e4de7b00bbcf1a2255fbde',1,'mlx::core::operator%(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga50817666f0b82afcbf4a123486af9908',1,'mlx::core::operator%(T a, const array &amp;b)'],['../group__ops.html#ga46c01daa07433542a477d216e13a8480',1,'mlx::core::operator%(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a8723d145dd49021bfcb8e6c99e1c91a5',1,'mlx::core::operator%(complex64_t a, complex64_t b)']]],
+  ['operator_26_19',['operator&amp;',['../group__ops.html#gaf0d232de4cbfffda1e2c838f8afdf6ff',1,'mlx::core::operator&amp;(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#a9ee95f97bbd69262d99d7bea3bf77631',1,'mlx::core::operator&amp;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0fefc3ae4f1350ebe05ec6098fd6bae3',1,'mlx::core::operator&amp;(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a1e4cb758ccfe5c267baed9aeb0044834',1,'mlx::core::operator&amp;(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab9d0f9910070231695d61de08cadb930',1,'mlx::core::operator&amp;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a889d401f425db79d1868aa3beea4829b',1,'mlx::core::operator&amp;(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a76dcd1fa3c68b386bc1d1d899a68a120',1,'mlx::core::operator&amp;(uint16_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_26_26_20',['operator&amp;&amp;',['../namespacemlx_1_1steel.html#a6353bf11881842e25c46b56f92b7044f',1,'mlx::steel::operator&amp;&amp;()'],['../group__ops.html#gaee1d774bb0843601d7a0a4257d616ae3',1,'mlx::core::operator&amp;&amp;(const array &amp;a, const array &amp;b)']]],
+  ['operator_26_3d_21',['operator&amp;=',['../namespacemlx_1_1core.html#a60c263ef46e552c3954688869734b513',1,'mlx::core::operator&amp;=(_MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af9670fc8088339669c54c68b3a320e25',1,'mlx::core::operator&amp;=(_MLX_BFloat16 &amp;lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#ad1f96f0a02024f347b4c4431629407fc',1,'mlx::core::operator&amp;=(_MLX_Float16 &amp;lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae0540f16c4e7bd55d0e86a88495e4967',1,'mlx::core::operator&amp;=(_MLX_Float16 &amp;lhs, uint16_t rhs)']]],
+  ['operator_28_29_22',['operator()',['../structpocketfft_1_1detail_1_1_exec_c2_c.html#a4fd637f1a6d335826789af28ac089ecb',1,'pocketfft::detail::ExecC2C::operator()()'],['../structpocketfft_1_1detail_1_1_exec_hartley.html#a67c98b38d12440781053552b9a33bba1',1,'pocketfft::detail::ExecHartley::operator()()'],['../structpocketfft_1_1detail_1_1_exec_dcst.html#a67f4f56e3574c491695f8cb8a1e983d8',1,'pocketfft::detail::ExecDcst::operator()()'],['../structpocketfft_1_1detail_1_1_exec_r2_r.html#acdba1650962714e6afff51e9ca456970',1,'pocketfft::detail::ExecR2R::operator()()'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a0d657bc9a381dca1b5860b9a1b5a5702',1,'mlx::core::detail::Abs::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a564232db7d32811e2ae126c86de104f0',1,'mlx::core::detail::Abs::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a5fac7e6c8277d8706535a52820503c9d',1,'mlx::core::detail::Abs::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#af2c3723e648bd5ed2fe558cc20b7f5eb',1,'mlx::core::detail::Abs::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a57312cd8530dd0ede3b8037f9c401883',1,'mlx::core::detail::Abs::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#ab3b5e3853ed56bfbfa577d965c21112e',1,'mlx::core::detail::Abs::operator()(bool 
x)'],['../structmlx_1_1core_1_1detail_1_1_arc_cos.html#a04b4c9d1fc0160973aa28b1f809b9d51',1,'mlx::core::detail::ArcCos::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_cosh.html#a767d354bec863942822ee0b9b6742a88',1,'mlx::core::detail::ArcCosh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_sin.html#ac69091929815e5317308b4088f5c2f46',1,'mlx::core::detail::ArcSin::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_sinh.html#ac7bf9bac66fef917f75494b2345e6aaf',1,'mlx::core::detail::ArcSinh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tan.html#aee87bf10c278a70ca788085d1b499afe',1,'mlx::core::detail::ArcTan::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tan2.html#a9040b7afcdb4969924aa782fa67f03ac',1,'mlx::core::detail::ArcTan2::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tanh.html#a601e8c52bb938eb3a616756a35419e8b',1,'mlx::core::detail::ArcTanh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a672f65e47d65e4e8d88be252bce0164b',1,'mlx::core::detail::Ceil::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a72a2cab2728fb5e1cc6329a539e5d573',1,'mlx::core::detail::Ceil::operator()(int8_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#aa34590f6a41331be92988558a90dc6fa',1,'mlx::core::detail::Ceil::operator()(int16_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af14120f3dd98f6198ea257d75be223f7',1,'mlx::core::detail::Ceil::operator()(int32_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af263ce7743cf7319387baba616c375b5',1,'mlx::core::detail::Ceil::operator()(int64_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a48f00affcd5c2ea1f81d821e019fec29',1,'mlx::core::detail::Ceil::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#ad4d24a44e8a328948393701dacb0ceac',1,'mlx::core::detail::Ceil::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a70da19b5c9c69f04b9f196bdf266f93c',1,'mlx::core::detail::Ceil::operator()(uint32_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af0e7e806b73c664ada837476f9d4d43b',1,'mlx::core::detail::Ceil::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#acc1bfc84a9b91f6e9764234cbe3b9687',1,'mlx::core::detail::Ceil::operator()(bool x)'],['../structmlx_1_1core_1_1detail_1_1_conjugate.html#a7e662d05c6998bd6ced8ad9c187324a5',1,'mlx::core::detail::Conjugate::operator()()'],['../structmlx_1_1core_1_1detail_1_1_cos.html#ad4caef573f9d9071f8945a8efed231ad',1,'mlx::core::detail::Cos::operator()()'],['../structmlx_1_1core_1_1detail_1_1_cosh.html#a63591f49776d9aadc02200036ae38317',1,'mlx::core::detail::Cosh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_erf.html#a168f8ccc6c8053b05dd1a48904ca8fd4',1,'mlx::core::detail::Erf::operator()()'],['../structmlx_1_1core_1_1detail_1_1_erf_inv.html#acc93c0511141404208b35f302f8c1fcb',1,'mlx::core::detail::ErfInv::operator()()'],['../structmlx_1_1core_1_1detail_1_1_exp.html#a0846300cee28315e5b42f74acafbd1a1',1,'mlx::core::detail::Exp::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_exp.html#af247c0d19d59f3310fd0a081eb92cf8b',1,'mlx::core::detail::Exp::operator()(complex64_t x)'],['../structmlx_1_1core_1_1detail_1_1_expm1.html#abf7e61b8387521e9d44334ce88d833a0',1,'mlx::core::detail::Expm1::operator()()'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a16c13cfe736098bffc81d655e172294a',1,'mlx::core::detail::Floor::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a9b6c4c34b6594b8c413abe31f34a73df',1,'mlx::core::detail::Floor::operator()(int8_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#aca4c71204b3ceeca6329f7ea2b041f4c',1,'mlx::core::detail::Floor::operator()(int16_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a3c3ab9e00d1fbd124802517e8c35fe02',1,'mlx::core::detail::Floor::operator()(int32_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a4b5954ffc59c741dd7b86bafda69d5cc',1,'mlx::core::detail::Floor::operator()(int64_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a2e33b10bd5b04551054a87c601440bc7',1,'mlx::core::detail::Floor::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a2500f971100919a694f78669a5e4f438',1,'mlx::core::detail::Floor::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a23df818301d68389e6e12f5a9ec1fbd7',1,'mlx::core::detail::Floor::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#ac988b4f265cf46c68609c9c8787c15fb',1,'mlx::core::detail::Floor::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a7f936e3fd53772bc189d845c73b53202',1,'mlx::core::detail::Floor::operator()(bool x)'],['../structmlx_1_1core_1_1detail_1_1_imag.html#a5bd82e2185f3779e398c179d42a3e782',1,'mlx::core::detail::Imag::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log.html#a0012a4e1744dbe9a28c3b5652be6e1c6',1,'mlx::core::detail::Log::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log2.html#a467bd4c995674721ff5fff6df33aead8',1,'mlx::core::detail::Log2::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log10.html#a2633c5b772bbc9f8b66cffd4a3e01a3f',1,'mlx::core::detail::Log10::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log1p.html#a3220de8c6090c44aa2070b1fbb2dc340',1,'mlx::core::detail::Log1p::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_not.html#a79799668ea5c364b0b4e2bc330e76253',1,'mlx::core::detail::LogicalNot::operator()()'],['../structmlx_1_1core_1_1detail_1_1_negative.html#afc4595c70ef7196df374cf4b2cc5e526',1,'mlx::core::detail::Negative::operator()()'],['../structmlx_1_1core_1_1detail_1_1_real.html#ae84a939fdb5916257a7731cda66d4d61',1,'mlx::core::detail::Real::operator()()'],['../structmlx_1_1core_1_1detail_1_1_round.html#a653f29c059bbfa6192378732a8a23351',1,'mlx::core::detail::Round::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_round.html#a82a984f13568051009e257fe85227da6',1,'mlx::core::detail::Round::operator()(complex64_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_sigmoid.html#a64b72561bfaf758632167f00648f4c89',1,'mlx::core::detail::Sigmoid::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a64ed5013cee7ff18c7fe70bc04737e7b',1,'mlx::core::detail::Sign::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a7106ed1f2f98a365fcb3e6ee39084748',1,'mlx::core::detail::Sign::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a7163e8c068dcc460600ed04014dc9945',1,'mlx::core::detail::Sign::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#ae8f56c7134721c846240830169424c22',1,'mlx::core::detail::Sign::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a10ae519e9a74a327fc72c410e9ab2936',1,'mlx::core::detail::Sign::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a91be4e273f6c7ea5d44cfab380b77603',1,'mlx::core::detail::Sign::operator()(complex64_t x)'],['../structmlx_1_1core_1_1detail_1_1_sin.html#ae95671816529cc2188389af37a2f1a13',1,'mlx::core::detail::Sin::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sinh.html#a9663ddf0fa4c0003576b48f3d5385f00',1,'mlx::core::detail::Sinh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_square.html#a54e9e3c0d0896e142289e8282eab1099',1,'mlx::core::detail::Square::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sqrt.html#aa5a4830b3ef7efab20ea88a110667efd',1,'mlx::core::detail::Sqrt::operator()()'],['../structmlx_1_1core_1_1detail_1_1_rsqrt.html#a9af247be16bab83243038aac54446b79',1,'mlx::core::detail::Rsqrt::operator()()'],['../structmlx_1_1core_1_1detail_1_1_tan.html#aba397cd7ac05bbe06dfa9e3a64bdb05f',1,'mlx::core::detail::Tan::operator()()'],['../structmlx_1_1core_1_1detail_1_1_tanh.html#a1749ba1edfd53095ed7d45c0e53bab61',1,'mlx::core::detail::Tanh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_add.html#a2d6011c35768b5fcd2bb75747b944353',1,'mlx::core::detail::Add::operator()()'],['../structmlx_1_1core_1_1detail_1_1_divide.html#a5e0d22e20
84c4ca81bec0d457a46c662',1,'mlx::core::detail::Divide::operator()()'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a3bdaf1095ad883ecc0fecc455f02cbf3',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a52c3a2ba86fccb24d37d218ae8328954',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a0b0dd6ef5b08585fdf8355770da8d747',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a68fe542084fb94d9a5abd740fe07832b',1,'mlx::core::detail::Remainder::operator()(complex64_t numerator, complex64_t denominator)'],['../structmlx_1_1core_1_1detail_1_1_equal.html#a2994cf1884e7126e76d0a20b215fe3ab',1,'mlx::core::detail::Equal::operator()()'],['../structmlx_1_1core_1_1detail_1_1_na_n_equal.html#a073b20b0d8d41ec8364b7c477421b9bf',1,'mlx::core::detail::NaNEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_greater.html#aa3844c2bae3c7a981739f642aa0dd094',1,'mlx::core::detail::Greater::operator()()'],['../structmlx_1_1core_1_1detail_1_1_greater_equal.html#a3b005f85522ad0e4b57044eed930ac30',1,'mlx::core::detail::GreaterEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_less.html#a0b4032dff1ad2b387745cb000aabdcbb',1,'mlx::core::detail::Less::operator()()'],['../structmlx_1_1core_1_1detail_1_1_less_equal.html#a31e70f8830a07557697541301555a7a7',1,'mlx::core::detail::LessEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_maximum.html#a3eb37abec8426ebc42b8c685075c523a',1,'mlx::core::detail::Maximum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_maximum.html#af99345c7c8bc95ccab1b22c0792ac6fd',1,'mlx::core::detail::Maximum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_minimum.html#afca0861556416a8547dd8574528feb69',1,'mlx::core::detail::Minimum::operator()(T x, T 
y)'],['../structmlx_1_1core_1_1detail_1_1_minimum.html#a64b2eecfbc56aaef7deb939423bac3f8',1,'mlx::core::detail::Minimum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_log_add_exp.html#ad1663fd809acaa4038f90666436599e5',1,'mlx::core::detail::LogAddExp::operator()()'],['../structmlx_1_1core_1_1detail_1_1_multiply.html#a898b090966b047723513224b8d3b22f1',1,'mlx::core::detail::Multiply::operator()()'],['../structmlx_1_1core_1_1detail_1_1_not_equal.html#a23d662b5fd968dc17d3bee2595b5f99d',1,'mlx::core::detail::NotEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_power.html#a2c047e1b488e6525447a224975a75db8',1,'mlx::core::detail::Power::operator()(T base, T exp)'],['../structmlx_1_1core_1_1detail_1_1_power.html#a9967db24b8f67d54b6aa3810e274f28c',1,'mlx::core::detail::Power::operator()(T base, T exp)'],['../structmlx_1_1core_1_1detail_1_1_subtract.html#a72ef05830615a2d5d9662926ed82672a',1,'mlx::core::detail::Subtract::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_and.html#a046536c1f2f9367983f052a213d7b7d8',1,'mlx::core::detail::LogicalAnd::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_or.html#afb134dbab79307d4ba597843c61d0b1a',1,'mlx::core::detail::LogicalOr::operator()()'],['../structmlx_1_1core_1_1detail_1_1_select.html#a930f9da2e6b3453e04f21382435a2cfb',1,'mlx::core::detail::Select::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_and.html#ae0bed77f95fe2b2f0b594addddd04700',1,'mlx::core::detail::BitwiseAnd::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_or.html#a5ab05734c5000b454975de6647a08d20',1,'mlx::core::detail::BitwiseOr::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_xor.html#a0989e3bcd064ae06c33f660696a869a0',1,'mlx::core::detail::BitwiseXor::operator()()'],['../structmlx_1_1core_1_1detail_1_1_left_shift.html#a9385f580830a6ad163dd9bb8c4905e7a',1,'mlx::core::detail::LeftShift::operator()()'],['../structmlx_1_1core_1_1detail_1_1_right_shift.html#a154528ba50e89a4c532a181f135b1620'
,1,'mlx::core::detail::RightShift::operator()()'],['../structmlx_1_1core_1_1_default_strided_reduce.html#a024682ab93b84e544a07e3a9c3c51fba',1,'mlx::core::DefaultStridedReduce::operator()()'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#a08144c7a3cdf10af5e47f4575da3694f',1,'mlx::core::DefaultContiguousReduce::operator()()'],['../struct_add.html#ac5c66b63d63a222d3ae0ab8cc7c90eb5',1,'Add::operator()()'],['../struct_floor_divide.html#a2b328e4d768e718fa439f955c524666a',1,'FloorDivide::operator()(T x, T y)'],['../struct_floor_divide.html#afc16a2b2a745225e0bc95640f3fc0219',1,'FloorDivide::operator()(float x, float y)'],['../struct_floor_divide.html#ae91719a15f7e643d552129f476089c6a',1,'FloorDivide::operator()(half x, half y)'],['../struct_floor_divide.html#a4aa9f858626583e02bd79f747229bbca',1,'FloorDivide::operator()(bfloat16_t x, bfloat16_t y)'],['../struct_divide.html#a0a16b9194abc2ab7c61129f81a9bbb3d',1,'Divide::operator()()'],['../struct_remainder.html#ab7875512ff4341c580c6dc372e64fc58',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#a18150b5f4425e30b95ffabc6bb25cede',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#ab3b75f54b56fd357c9755daadb2cafc2',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#ae918ce0e246937d4fe04e2ea36e4b2c1',1,'Remainder::operator()(complex64_t x, complex64_t y)'],['../struct_equal.html#aa498087080900d4428ba428a6496a769',1,'Equal::operator()()'],['../struct_na_n_equal.html#a00220898e02db656d21dde9e9354a8dc',1,'NaNEqual::operator()(T x, T y)'],['../struct_na_n_equal.html#a6185e4554dce5b4659d21673c576be51',1,'NaNEqual::operator()(complex64_t x, complex64_t 
y)'],['../struct_greater.html#a98d7d8ee360cd0f469c6eb9a017560f5',1,'Greater::operator()()'],['../struct_greater_equal.html#ae69a3bccc567a46506cf0d296294ce80',1,'GreaterEqual::operator()()'],['../struct_less.html#a5ee0b31b2d9123dc4504f2979a5854d3',1,'Less::operator()()'],['../struct_less_equal.html#ae9f9a1b2eae548977139704f0044acfe',1,'LessEqual::operator()()'],['../struct_log_add_exp.html#ab32417f18e8ff68c15f78aceeb624edf',1,'LogAddExp::operator()()'],['../struct_maximum.html#a3ea0f42bc4cd80b68a98f189f9fa859c',1,'Maximum::operator()(T x, T y)'],['../struct_maximum.html#a0bc8fadc87f2c49fc440d625bfc97ca6',1,'Maximum::operator()(T x, T y)'],['../struct_maximum.html#a907e8793900be5927625377dab199644',1,'Maximum::operator()(complex64_t x, complex64_t y)'],['../struct_minimum.html#aa6113dfac3986c0f571fa53f65c5330e',1,'Minimum::operator()(T x, T y)'],['../struct_minimum.html#a0c939921de87ab9c6959238aac81a059',1,'Minimum::operator()(T x, T y)'],['../struct_minimum.html#a800fba087280f79c2f7e9aff75bed093',1,'Minimum::operator()(complex64_t x, complex64_t y)'],['../struct_multiply.html#a1327fc5a0713931afe997b0d4d2988e0',1,'Multiply::operator()()'],['../struct_not_equal.html#af008d73a5d9cde0b8309b7e8ee7438b2',1,'NotEqual::operator()(T x, T y)'],['../struct_not_equal.html#a14de494cea4e4869351202cad1149f17',1,'NotEqual::operator()(complex64_t x, complex64_t y)'],['../struct_power.html#a2b6df2a9e48155ff9734caca8504a79f',1,'Power::operator()(T base, T exp)'],['../struct_power.html#a36829163d42973034a1f8a7ecc57a1de',1,'Power::operator()(T base, T exp)'],['../struct_power.html#a27cdfb313c4e82b63bdcdaee923cbbef',1,'Power::operator()(complex64_t x, complex64_t 
y)'],['../struct_subtract.html#ae0856cd8d449074ca287baa7e460f68a',1,'Subtract::operator()()'],['../struct_logical_and.html#a8bc6bdabc0ea0678a46e2cf6217cb3a6',1,'LogicalAnd::operator()()'],['../struct_logical_or.html#ade6a931324a604a3119d2220d6f5460d',1,'LogicalOr::operator()()'],['../struct_bitwise_and.html#afb48af090b01dd0200963bc12d842e36',1,'BitwiseAnd::operator()()'],['../struct_bitwise_or.html#a41f847463daafa99ee56f4035578390f',1,'BitwiseOr::operator()()'],['../struct_bitwise_xor.html#a3a3e8a56caab739d40262d9349c9c485',1,'BitwiseXor::operator()()'],['../struct_left_shift.html#aa729747784c38bfdbba34794fcf5175b',1,'LeftShift::operator()()'],['../struct_right_shift.html#a2cc59b400c68342b0e43050431323c17',1,'RightShift::operator()()'],['../struct_arc_tan2.html#ac9b7729753e13be293ab700231d061ac',1,'ArcTan2::operator()()'],['../struct_div_mod.html#a8b5758f2ea18d4c903b462331b25abfe',1,'DivMod::operator()()'],['../struct_cum_prod_3_01bool_01_4.html#ad634be0b139d10ce6d21332eef0d936b',1,'CumProd&lt; bool &gt;::operator()()'],['../struct_cum_max.html#a781b9b955c5412466da6af6c70d73c06',1,'CumMax::operator()()'],['../struct_cum_min.html#ae0b8c3761e04fa538d304ca842281a66',1,'CumMin::operator()()'],['../struct_less_than.html#a2798eb377b411c93a4ed30cf35caade2',1,'LessThan::operator()()'],['../struct_select.html#adb51692aae3038de07dd745891bf9848',1,'Select::operator()()'],['../struct_abs.html#a9e7481dfcc162509769852026ff4a344',1,'Abs::operator()(T x)'],['../struct_abs.html#a0ca113fd036151c443df3f83cc667f28',1,'Abs::operator()(uint8_t x)'],['../struct_abs.html#adaeab32a7e377dc990077ab15f3dc4c2',1,'Abs::operator()(uint16_t x)'],['../struct_abs.html#a99d2a2f37a6cddd3168b0224f2a9b963',1,'Abs::operator()(uint32_t x)'],['../struct_abs.html#ac9cbc02422d930479303f240a7ea6c71',1,'Abs::operator()(uint64_t x)'],['../struct_abs.html#ac30835b27784d451bd2e4524c8eb9e11',1,'Abs::operator()(bool x)'],['../struct_abs.html#ab82917d6b30a2c579e7eb879d305c5fc',1,'Abs::operator()(complex64_t 
x)'],['../struct_arc_cos.html#a5553cecf58511e24e76ac97f2d90b9ac',1,'ArcCos::operator()()'],['../struct_arc_cosh.html#a5c9e7712c14c97298b23ec48e19abc58',1,'ArcCosh::operator()()'],['../struct_arc_sin.html#a0343872f2da93bae2bb0baadf49da022',1,'ArcSin::operator()()'],['../struct_arc_sinh.html#a3066fb7dc7c3180100fb55ff94af6a7a',1,'ArcSinh::operator()()'],['../struct_arc_tan.html#af3a0aec6acec8ae8f5e4c4d5cf8c91ba',1,'ArcTan::operator()()'],['../struct_arc_tanh.html#a37dc3e01ec2830de7e82ed6c6363ac88',1,'ArcTanh::operator()()'],['../struct_ceil.html#a5e2a4ef1b012f5d352064489156e5e44',1,'Ceil::operator()(T x)'],['../struct_ceil.html#a455cd8083ba859993077f2e078ae165b',1,'Ceil::operator()(int8_t x)'],['../struct_ceil.html#a2acb61bc658c7a216795e7f76ebcf98a',1,'Ceil::operator()(int16_t x)'],['../struct_ceil.html#aef8c37f7a8ee3fc80700d605a09891fb',1,'Ceil::operator()(int32_t x)'],['../struct_ceil.html#a93d0110511ad5dd200e12d37a3d7d6e3',1,'Ceil::operator()(int64_t x)'],['../struct_ceil.html#aa335b745fa26e0f443cdb36298105484',1,'Ceil::operator()(uint8_t x)'],['../struct_ceil.html#ade17e13b7f30f5c590fae1581a2013ac',1,'Ceil::operator()(uint16_t x)'],['../struct_ceil.html#a411c75cc35cdc088402e176a1defd22d',1,'Ceil::operator()(uint32_t x)'],['../struct_ceil.html#a9ac660ca29eef7a7429fceb7b917a68a',1,'Ceil::operator()(uint64_t x)'],['../struct_ceil.html#a40de367e62f06ebd7e1330afa93a9ad9',1,'Ceil::operator()(bool x)'],['../struct_cos.html#ae222f8710f6b8254c471ebd475aa5bda',1,'Cos::operator()(T x)'],['../struct_cos.html#a5f26feb1dcc4bec5f59a9ff511c5b163',1,'Cos::operator()(complex64_t x)'],['../struct_cosh.html#a5847ebeebb236fdc926798ddc16475ba',1,'Cosh::operator()(T x)'],['../struct_cosh.html#aefdd91298dac16d528d29ee47e2f7252',1,'Cosh::operator()(complex64_t 
x)'],['../struct_conjugate.html#acb0a2694285f1f57c7654b371ce8cbd8',1,'Conjugate::operator()()'],['../struct_erf.html#a80719402ad7f7d418859a6677d7b604d',1,'Erf::operator()()'],['../struct_erf_inv.html#afbf3668d1a512e889f093a0bc7673309',1,'ErfInv::operator()()'],['../struct_exp.html#a5ef395868e055348c0802fd5fe45669c',1,'Exp::operator()(T x)'],['../struct_exp.html#a2b341ac400c4d145397950eb60734336',1,'Exp::operator()(complex64_t x)'],['../struct_expm1.html#a4b834d42cf0b84daf03fec62c222091a',1,'Expm1::operator()()'],['../struct_floor.html#ace3551f28429081e9f3a3dab0c84212b',1,'Floor::operator()(T x)'],['../struct_floor.html#a10d7fd05b4c224c9f135451246d13014',1,'Floor::operator()(int8_t x)'],['../struct_floor.html#a2865a04a492e3590302f4bd3215a10d7',1,'Floor::operator()(int16_t x)'],['../struct_floor.html#a41012343ff0463ec44b4d06196f41182',1,'Floor::operator()(int32_t x)'],['../struct_floor.html#aae3181d15856796aa0628cf30c92aa2e',1,'Floor::operator()(int64_t x)'],['../struct_floor.html#ac6cf38d82c8e270911afdca4c69ad51b',1,'Floor::operator()(uint8_t x)'],['../struct_floor.html#a78969b9e2b53ae248e72a67259eea5d8',1,'Floor::operator()(uint16_t x)'],['../struct_floor.html#a959009320ed622ed45b39becab1d5b98',1,'Floor::operator()(uint32_t x)'],['../struct_floor.html#a7d04b83c3345cd867315cae2d7ff68ab',1,'Floor::operator()(uint64_t x)'],['../struct_floor.html#abea845fe5e8e6b93bd4bca8717337e0b',1,'Floor::operator()(bool 
x)'],['../struct_imag.html#a3b29e9f8a46c194d683f6a9938314400',1,'Imag::operator()()'],['../struct_log.html#a32a383cb6be06e616a75f23bf49089c3',1,'Log::operator()()'],['../struct_log2.html#ac1e067ecdcbdbffb6106e789c2b98b64',1,'Log2::operator()()'],['../struct_log10.html#ac596a74c1642a00f3eced07ee3334122',1,'Log10::operator()()'],['../struct_log1p.html#a4464c6e7bdbe55ffd7d961c695cd13ce',1,'Log1p::operator()()'],['../struct_logical_not.html#a8a620bac957ab8c09ac85adfddd96708',1,'LogicalNot::operator()()'],['../struct_negative.html#af6879b374314a559faa321e8cce3d710',1,'Negative::operator()()'],['../struct_real.html#a85b9c5b9e65297994fa26ff68e19e809',1,'Real::operator()()'],['../struct_round.html#aa06a0195867e2ceb679c403b6909a1c4',1,'Round::operator()(T x)'],['../struct_round.html#ad3a08f2276ff1033900bc0a7da812655',1,'Round::operator()(complex64_t x)'],['../struct_sigmoid.html#a75a24cd75cb4d4c9a072811b2d70ad55',1,'Sigmoid::operator()()'],['../struct_sign.html#aa3304c6b43bcad53061614b741d8403c',1,'Sign::operator()(T x)'],['../struct_sign.html#ac48992b675b8b28be1e27e1f2ec5d2f7',1,'Sign::operator()(uint32_t x)'],['../struct_sign.html#ae07a4249e1b61419a3b9ca6c337b7bb5',1,'Sign::operator()(complex64_t x)'],['../struct_sin.html#a7caf98c777521fa5d5c6ddaaa3b779fd',1,'Sin::operator()(T x)'],['../struct_sin.html#aa510cf4595b6d49065ab6b602d8fcb14',1,'Sin::operator()(complex64_t x)'],['../struct_sinh.html#a02cf32bcf560657b9ee34fb1affed8e2',1,'Sinh::operator()(T x)'],['../struct_sinh.html#a1f8ba1858d352ee68861cd6ea861af43',1,'Sinh::operator()(complex64_t x)'],['../struct_square.html#afde739fc544e45dd30964c02dca94310',1,'Square::operator()()'],['../struct_sqrt.html#ab9b16d2b9b03a1c54190f4479a56a4ad',1,'Sqrt::operator()()'],['../struct_rsqrt.html#ae16699fd829e40416436247a39233fda',1,'Rsqrt::operator()()'],['../struct_tan.html#a1e6fb8c691621c69cb9bd393de4f6e78',1,'Tan::operator()(T x)'],['../struct_tan.html#a2ef120c9f92b0d2e9cec8389eda05724',1,'Tan::operator()(complex64_t 
x)'],['../struct_tanh.html#adce11a7ad33226c6ecff34f46f5c45d7',1,'Tanh::operator()(T x)'],['../struct_tanh.html#aa8423b43c725bb4b88965a11e8cf20f6',1,'Tanh::operator()(complex64_t x)']]],
+  ['operator_2a_23',['operator*',['../structpocketfft_1_1detail_1_1cmplx.html#a26bf3d709a58f06228e502af6db8e5ac',1,'pocketfft::detail::cmplx::operator*(const T2 &amp;other) const -&gt; cmplx&lt; decltype(r *other)&gt;'],['../structpocketfft_1_1detail_1_1cmplx.html#ad9c591ef8ae976293f207937d273e9a1',1,'pocketfft::detail::cmplx::operator*(const cmplx&lt; T2 &gt; &amp;other) const -&gt; cmplx&lt; decltype(r+other.r)&gt;'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a153756072fda6d3e53bcca11b46a1238',1,'mlx::core::array::ArrayIterator::operator*()'],['../backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385',1,'operator*(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8f06316063fc91747533105f256b55b5',1,'operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7b3bce3f6f17089d87e13e91f580a581',1,'operator*(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a54ae7216b82c5cea362f6b83e1df3a9b',1,'operator*(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a852689073c17596de4fb545bc046b380',1,'operator*(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a168300bbd04d8e97c5e4218cb14ae378',1,'operator*(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6278bd2e0e2805090b33ef666bf7f6bb',1,'operator*(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aecf703522d9ce32dfeefe1e6e903db06',1,'operator*(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7cd44d27fa9a4f13df39894c34fdb348',1,'operator*(_MLX_BFloat16 lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee64dc1890abb6d1035361cb8c751f96',1,'operator*(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1a559ab88dbbb4fd2c7509d2c94e55b',1,'operator*(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a495ae2d9be5d97c4c6448fc4e50a03e1',1,'operator*(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a87ab4b7a502430da664ccb8abd383058',1,'operator*(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5f997839cf49c24ab594a0dff486a7bc',1,'operator*(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa0c2d29950926ae579adf6337fbea64b',1,'mlx::steel::operator*()'],['../group__ops.html#ga26c33f5cdb6fc10d272acd6e208034e0',1,'mlx::core::operator*(const array &amp;a, const array &amp;b)'],['../group__ops.html#gac22a67f7de797b1ae59029843cbdcab6',1,'mlx::core::operator*(T a, const array &amp;b)'],['../group__ops.html#ga6f2369ed5fae8ff9b1528670a004dde2',1,'mlx::core::operator*(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0cc824d6318f97f7058918ab64ddfc25',1,'mlx::core::operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a81e1c727c3fc48910b030cb65a9e7afa',1,'mlx::core::operator*(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a861d948220d8f48d46c68d2ddb16a096',1,'mlx::core::operator*(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13d16561812679b36e68185dc4b2d04d',1,'mlx::core::operator*(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a5287610200ff573730c9c92413f48881',1,'mlx::core::operator*(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a377ccc6b4ef36767abca102dca56dc10',1,'mlx::core::operator*(_MLX_BFloat16 lhs, bool 
rhs)'],['../namespacemlx_1_1core.html#a5d696b63635ce6967526d6a410f7f6b1',1,'mlx::core::operator*(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abe90e9527bfa3e1c813d41df4a2372e7',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a5f14963c77f96bcb5a3bef5661a86ba4',1,'mlx::core::operator*(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#acfb06fe9f5fee01dbb5a2b23bccfd0d3',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#afc9a87f1fccbac05242b91bfbb35c24d',1,'mlx::core::operator*(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0b9678af9b487900cacf6639a4693de0',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad5950619081389e6ed7512f38358d33d',1,'mlx::core::operator*(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a65d25d082374761c05b056e1046d1d4e',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a759191fb984e7737f0ef529c2053ad73',1,'mlx::core::operator*(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3a52675c3d4552b319dd9707844abdec',1,'mlx::core::operator*(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45d67f5d80fba4d42e34c682a8d22beb',1,'mlx::core::operator*(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad25880c67bbcbfafbe54dc16418bf736',1,'mlx::core::operator*(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a63c836e1141e07ae72cee770bad01200',1,'mlx::core::operator*(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a265a37b8ee4a97390213e9ec49693e66',1,'mlx::core::operator*(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab5a457da04dcb157a0b5172c4b2244b6',1,'mlx::core::operator*(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#aa56a8bda08be9ef3711496e216a75c95',1,'mlx::core::operator*(bool lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af89612098dd355b1eefb841c753b36ab',1,'mlx::core::operator*(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4552687a0637f710b5d55bb6378fcabe',1,'mlx::core::operator*(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af69db7def588d7da430434a69456e29c',1,'mlx::core::operator*(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a00af6e5095888f00791ee0ab6d993ad6',1,'mlx::core::operator*(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab48feddc1aa304383e5493923506ad7a',1,'mlx::core::operator*(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a0367b582e85162b4180e086f725e49e9',1,'mlx::core::operator*(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45f0479526fbccdb00bc73ea7f3b7625',1,'mlx::core::operator*(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a394797646010ba9ef2a1f9b9a4b8ddd9',1,'mlx::core::operator*(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acaaa86b59c7ceb2e092ac07f2a75225c',1,'mlx::core::operator*(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a067d47823a322b88043cce7ce4a3ec78',1,'mlx::core::operator*(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2a_3d_24',['operator*=',['../structpocketfft_1_1detail_1_1cmplx.html#a683fd490182c9189fa2c05b1823edd93',1,'pocketfft::detail::cmplx::operator*=(T2 other)'],['../structpocketfft_1_1detail_1_1cmplx.html#a06f2c26c6fc4722e61b44da4c242ed87',1,'pocketfft::detail::cmplx::operator*=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419',1,'operator*=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ade65ebca11e38d56408c512df89b99f4',1,'operator*=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af4348ce3425dd99d069e8fdf06e25a3c',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2c3c5f793b3d957d7295d7f1faabebee',1,'operator*=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac66657077d55e94197b52b63acb50b7d',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a383165ea838cc3feeee4d9cf54aa77cc',1,'operator*=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab706af260b61f735b28464877d02137c',1,'operator*=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a979374b1dd4e0eaf602326fa901336d1',1,'operator*=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac815eec2c1b15a47b1c6ea6790e77d24',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8110fae7bcc34a0de5927546b24aa935',1,'operator*=(thread half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae4acef3e7ae7dfe359422503f894e885',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc268cdbc30500f3009f5de2b2f0f67a',1,'operator*=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08c1f916302eb9d48c93f8b7260538fe',1,'operator*=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc8e82b8f593b12c6d405e2250ab0f62',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4611728172afea51860a77fdb06cafa0',1,'operator*=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0b8736e2ae24758b6e24ea72668df5b4',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad920df9579603f0b0ee2689eba330617',1,'operator*=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3ff4ff59f411010ac8502cfabda4bd6f',1,'operator*=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abd3d82e2dec1847e97eb8fc3bab2985a',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a738078eb7d5ff94ff48156a555d763a5',1,'operator*=(thread int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a435f2f4256aadb1b57fd62bb7f733cf7',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0e4377b120d6305335d296e031ee5b30',1,'operator*=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a917354f77eac26189da8a2f610a00074',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af725f935bfa0405e5ff17ede3ac47283',1,'operator*=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7c56980c234a04260b8b19298085e526',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab840ff9de0cdd0e9afffb8baa2a850a3',1,'operator*=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a73416a7415f3fe31525e33419e5e8aab',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a16978f4b16d954ef4d4cf0f32f6c0b94',1,'operator*=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2179abbc91ce8763e96e39e1917bfa6e',1,'operator*=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab070ea4676d10a10ff3e9379a4068a57',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0197e039d4c65bf49649a6f250c2d436',1,'operator*=(thread uint16_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad3565cc6fd1e088d052b1108aa065851',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a711693988c437c2fb4d7da505982fe21',1,'operator*=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7dbf0c75df4817cb4ef8b60c417a89d0',1,'operator*=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a323a80492cd17a49e2c3dd18f8c8b5cc',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adb465776d3868bda0525d632ffc4d129',1,'operator*=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a12a98d71d670b409b8065e0d61672d55',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5d00eb2ec2b0e15b2753d100694c45ae',1,'operator*=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4126fb7ed5bbb27a2332c543cf56a337',1,'operator*=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab092d9790ef20fc0386707530aee89db',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abff1fd2439e31e6e64a3d2fdee3c7821',1,'operator*=(thread uint64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a625dcb133f1f953f263e6200399866c6',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08b6071245513e1726ec68e3b63edc53',1,'operator*=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91',1,'operator*=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3796dcf819adb1ef8152f57ba63ff6b1',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aaab79d0b4c9e9bdc059ace6ec58c5b00',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a0dd3893abc8986901872c8365ab1509d',1,'mlx::core::operator*=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a3cc5c154e4ad9a83ad43da8513146fdc',1,'mlx::core::operator*=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a600e77dbc72e78207b5f5dbf4b298781',1,'mlx::core::operator*=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a54833be1d44bc3adfc9ea218fc3685bd',1,'mlx::core::operator*=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_2b_25',['operator+',['../structpocketfft_1_1detail_1_1cmplx.html#a76447ef141c8732d57421749fc81b236',1,'pocketfft::detail::cmplx::operator+()'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae2adde594b5a4853f6bc78263a957d85',1,'mlx::core::array::ArrayIterator::operator+()'],['../backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189',1,'operator+(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a09c1a797eb7f43742578680899932f50',1,'operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a551b970f73bb4a3b287653021d000b60',1,'operator+(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a43a225e7e548bb041f3a5d844faaf0da',1,'operator+(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8b6c3fd9d068a2159084359df8b9b449',1,'operator+(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0a5bfe15d95ba540795f4c25ebfa4f07',1,'operator+(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa415ce182fe7582d885fe633fc3527ce',1,'operator+(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a62f891b7dbba0000749cf338f594bedb',1,'operator+(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab43932322f81bf322aa1b0deeee9a987',1,'operator+(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acd15d46ea5827a2a39898ccbb8352eb8',1,'operator+(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a006763fae6e0577fc168ec9446f0f747',1,'operator+(_MLX_BFloat16 lhs, int64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a12a47e8ac0be788edff57ae0a96d7830',1,'operator+(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af87dfa2122e9c76042dc41fb7f338a87',1,'operator+(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af2737d09c887ee8cd43fdeabceddbe82',1,'operator+(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a12ff4f38aa8474bf76770c7b8e3e18cb',1,'mlx::steel::operator+()'],['../group__ops.html#ga26e5a043eaaaf066d1400adac9c11d0c',1,'mlx::core::operator+(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7d0ec8d01e7cefa6a6b25f11876761b5',1,'mlx::core::operator+(T a, const array &amp;b)'],['../group__ops.html#ga7cc080a4f9d4a667f2099aa0dbfefadd',1,'mlx::core::operator+(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac14b984970cafd8fbe24d080949515cc',1,'mlx::core::operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab076069c6f0047c548a8dc29d35dd36a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aab9d96b0a168f4d05146000a6212b5d8',1,'mlx::core::operator+(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac4e6f03d7e4ae701b4eefa784f36185b',1,'mlx::core::operator+(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a4cabd600a5271b0d416c91e8d31dd9c1',1,'mlx::core::operator+(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af26df9dc279d71b7cc10892c72162b58',1,'mlx::core::operator+(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#ac3b97eecec9bd8efb313f8f201560343',1,'mlx::core::operator+(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2e3bb121cbde30c2e6d806df0d41ff59',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac87ecce4b44b0826e666a169ddc6f878',1,'mlx::core::operator+(int32_t lhs, 
_MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aed3d9cd32698ef0fe65b1280f103b3f5',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6fa13b9359cf3f575fbda5260e6e035d',1,'mlx::core::operator+(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af240a6471ff827819192808bffeb857a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ac25a05679f312b724c406d8b282803c9',1,'mlx::core::operator+(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a54863a54f258acf2b5c734950618e4e1',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9f81f5ea8909db9660197217612ee446',1,'mlx::core::operator+(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13e26c38da0a4e332e0ae4eb0aed9cb8',1,'mlx::core::operator+(const std::complex&lt; float &gt; &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a59bb13a0bb7f748c8de34415b248bc57',1,'mlx::core::operator+(const complex64_t &amp;x, const std::complex&lt; float &gt; &amp;y)'],['../namespacemlx_1_1core.html#a38a44c412c8be4c8b952d3082cc7db74',1,'mlx::core::operator+(const complex64_t &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a011dbdbd2413e59e744cf82b05431340',1,'mlx::core::operator+(bool x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a230e3b7c479add1b171fa0aaa3a8b13c',1,'mlx::core::operator+(const complex64_t &amp;x, bool y)'],['../namespacemlx_1_1core.html#a3a6f43c2485f0d42293184f1aecbeaee',1,'mlx::core::operator+(uint32_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a766157c5d5d00fdf3da95eb7cb2981b9',1,'mlx::core::operator+(const complex64_t &amp;x, uint32_t y)'],['../namespacemlx_1_1core.html#a64dceec2bb03eee963a2a1bc1ac69284',1,'mlx::core::operator+(uint64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#ae36badb78a17cd7d13663a69645fc328',1,'mlx::core::operator+(const complex64_t &amp;x, uint64_t 
y)'],['../namespacemlx_1_1core.html#ac1afa5d4c856e4b58109eff086e70ffd',1,'mlx::core::operator+(int32_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a8978def3c2cfe2a96314d564613b80db',1,'mlx::core::operator+(const complex64_t &amp;x, int32_t y)'],['../namespacemlx_1_1core.html#a5b8af5ca4c0e37aba0b7530542bd64c2',1,'mlx::core::operator+(int64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a3eaa72850205c18450c3af9a01cda219',1,'mlx::core::operator+(const complex64_t &amp;x, int64_t y)'],['../namespacemlx_1_1core.html#ad38b38a3faf050735d45eed4438ee27a',1,'mlx::core::operator+(float16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a358e66ff205bda3e8542427b6d2edadc',1,'mlx::core::operator+(const complex64_t &amp;x, float16_t y)'],['../namespacemlx_1_1core.html#af56d4b85e329e39a825c01a50e3a2522',1,'mlx::core::operator+(bfloat16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a806a495a129ebaab69cc57ca7db831d6',1,'mlx::core::operator+(const complex64_t &amp;x, bfloat16_t y)'],['../namespacemlx_1_1core.html#a09fc6ebda917969383783a112a8547e7',1,'mlx::core::operator+(float x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a7ed0e2cdb65612f54e67166762cb6408',1,'mlx::core::operator+(const complex64_t &amp;x, float y)'],['../namespacemlx_1_1core.html#af7577c91b8c43682f0ebc9eb9758aae4',1,'mlx::core::operator+(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#abe36af9951afd8dd3ffe90ceedeb7f2b',1,'mlx::core::operator+(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#afb9f780dd056a4f975518f71a3b021ee',1,'mlx::core::operator+(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6a8e093b24c4c789b7cd160f7e7f7de9',1,'mlx::core::operator+(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#af3a603690fd3de9e4f7f2035a4d25621',1,'mlx::core::operator+(double lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#afa2a4bccfeea9688ac922cb638341511',1,'mlx::core::operator+(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a6111e94d51de12391e5d68b765f28fc3',1,'mlx::core::operator+(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7c7dd6d346e0cdf398a896f2c6958258',1,'mlx::core::operator+(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a00872a443f462b0ae0a30c84fb001bc0',1,'mlx::core::operator+(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4f5d80d03bae6d8d90455d3c47a8c116',1,'mlx::core::operator+(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a78f1f388f9d81ed93f60311f4645d8d0',1,'mlx::core::operator+(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aa43e1d6958c5d5a6fa9a625a1660e741',1,'mlx::core::operator+(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ae877e1d5e3cf57734da8b49535fe3fb3',1,'mlx::core::operator+(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a9a5ae769f67f886d59c8e292a8218550',1,'mlx::core::operator+(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a058878237ce50baa4c909d8d15448d7e',1,'mlx::core::operator+(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a95fd207028f125eefbafe9e0522407fe',1,'mlx::core::operator+(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#abc6425a3fbb386f5ea5964b42507e989',1,'mlx::core::operator+(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2b_2b_26',['operator++',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a3efe69356a84d0d4438f033992fcbd9d',1,'mlx::core::array::ArrayIterator']]],
+  ['operator_2b_3d_27',['operator+=',['../structpocketfft_1_1detail_1_1cmplx.html#ad4e69dcd89bdb7764c9c5807168f911e',1,'pocketfft::detail::cmplx::operator+=(const cmplx &amp;other)'],['../structpocketfft_1_1detail_1_1cmplx.html#affa618d8850a7c232793b7c61db6d184',1,'pocketfft::detail::cmplx::operator+=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400',1,'operator+=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a251780ac4592cc2b1a543e417ff57770',1,'operator+=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24381d991c2d570aa953694f396a69b5',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7595740d4cc12924905d6bd1b99ee4da',1,'operator+=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac1498acb8c3623b5f412f70ab6a6528b',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abce5ab327110c164f054b43ed47f79a0',1,'operator+=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae0c70198e236ffe1a98f79987c686419',1,'operator+=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a13b3338935440ae51ecc4a356093efc5',1,'operator+=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a0cb8544b4ebd2906ba8e7f2868e8de',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7b134429ea0c8493800ff8b465410f9c',1,'operator+=(thread half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4154f90ab7857ca856f9e15fe1bf5acf',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab9ae6a51e2027b02cac9966e05f3ba68',1,'operator+=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab93ce536eb7998bee00de4af868e31a9',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad0ae9e2b4874f991a2c853e1c1fe735d',1,'operator+=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a194a6670cc25ade35a24b566f31af785',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3d0d689516c99003659c5d026847bd2e',1,'operator+=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a007f58508b98bb79e5c323ed0dec89b6',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa7198e580e2a83c1fd01a4b6fdf86a80',1,'operator+=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a15573fefd880adefbba079b1c1bd8082',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a104cf94cb9e359d1b6ef92ced2ce0c27',1,'operator+=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa04cfcb52191fd23205a1a3572b46ae0',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad684bc2ae1a2a627cd3e4a4c641e2d77',1,'operator+=(thread int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1e28448e35f4934075b397c34ba3d66',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8ad16afd7f1711de83c0cec5af868f76',1,'operator+=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac45e9ca0c7155caebe3d0f7261518077',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3c62ac679d6aa515144d40ebafe4a188',1,'operator+=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ff5ab3aef1057fa083b53a65c8aba03',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae74bb0a3c12cd1a23f3d29ce307d6fb1',1,'operator+=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac188bd19f236b098d603b0d8acd08921',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef9fa600d107b509f2e3df7d6b080e01',1,'operator+=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af5713afb3a62967a02c3c20661951ee4',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7f1b84352a3ed6171444a43da1fc7e92',1,'operator+=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af1983edd26245e6e51c6e47354095e32',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8cd55d1a579540eb450e12a8a8a950be',1,'operator+=(thread uint16_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a588ef0f7e03f306758524d378278976f',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a74751abec7086f85f4f26ced44f1ca1f',1,'operator+=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4dd3cf0e5aa116ff330352a50c18cde7',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afb9a0e18c0e40c77e6143fb7d84ebfba',1,'operator+=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adf0cfd9a608a6fb3d57933e32e7d81d2',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4bd92db6c8b9b5dc96332c7ae3eff8c7',1,'operator+=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5d628a5bc4fa755610392f47a523a1f1',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7c790442f77f2437b482c4a55e224fc3',1,'operator+=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a77bab4481b41be50297b257e95058706',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7816a97d16b1d2f8a90227bb1da2f6ac',1,'operator+=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac244d140c6149726ea44174d3e836ca3',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af802541c4c65ee4442acd495de4d27fe',1,'operator+=(thread uint64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac06eb2fea47a09a8a8abdaa1aa9b4603',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5263b2463fecdc97f9521d00bffea059',1,'operator+=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b',1,'operator+=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee1bdf0ab2e445293708b476e8cfde3b',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a190e27077f0fba642a86f5c8f488bcc2',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a9f2c9d2f21fbf9fbbacd940c6967c9d1',1,'mlx::core::operator+=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a0b1b3c48afc0a785282e43435bba8418',1,'mlx::core::operator+=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7b763db8194e6fcb1b87eab143dfa47a',1,'mlx::core::operator+=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a827167f6a1ae55428fd218ddd51ec3b6',1,'mlx::core::operator+=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_2d_28',['operator-',['../structpocketfft_1_1detail_1_1cmplx.html#a460da5db36d1c72fb1ed3496fd3abde4',1,'pocketfft::detail::cmplx::operator-()'],['../backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b',1,'operator-(complex64_t x):&#160;complex.h'],['../backend_2metal_2kernels_2complex_8h.html#af5608264cf920688607059b4e8cd3117',1,'operator-(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855',1,'operator-(_MLX_BFloat16 x):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85',1,'operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a891aa4bf46c20a26a55061736aba25f1',1,'operator-(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7ad7ff44a3200853711869f7a577d931',1,'operator-(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af69ef8f1d8ecae0e6f755bf1c46cf075',1,'operator-(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5bd875a54b79b2dcedf674807c3e53c5',1,'operator-(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab02f8646b47806e1d2038f248df03f06',1,'operator-(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab27b26182c7c6e08af37e6d511fd9253',1,'operator-(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5868c85c988ec3432cf86d7df40e464d',1,'operator-(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad03ef47e6cc7521bbfb45740dee20f88',1,'operator-(uint32_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab789f8a400512ff27e36b3373170f0c5',1,'operator-(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7f601b22ecc480132d82ad782e5363bf',1,'operator-(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a152366ab4e2ccc867e919af6c74ced91',1,'operator-(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a42bead8ef0beb9f3452128d64cd4df9d',1,'operator-(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aca8ef21c16984ccb329b3bd0c1e4be48',1,'mlx::steel::operator-()'],['../group__ops.html#gade2eea48989f4caaf36e89f7bd2a8816',1,'mlx::core::operator-(const array &amp;a)'],['../group__ops.html#ga0c7f3cb36d4ca516c7a33142f88b9181',1,'mlx::core::operator-(const array &amp;a, const array &amp;b)'],['../group__ops.html#gae68d3d0691ba951501218e98439f3465',1,'mlx::core::operator-(T a, const array &amp;b)'],['../group__ops.html#gaf5e5d882c51ad0a0ea315c274d5439b2',1,'mlx::core::operator-(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a622ce842fe44e4b6a95e03242341b459',1,'mlx::core::operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af32a99d930d49e9b178472d7a65531ab',1,'mlx::core::operator-(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3555a2b31fc0925850d3240e85e03ec5',1,'mlx::core::operator-(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a46080889fd9e5c3f9916508e97dff5ad',1,'mlx::core::operator-(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a9ca27fd1e512c8ed126342e565da12ae',1,'mlx::core::operator-(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3803f8d36558d32bb7dd6e580ea683b4',1,'mlx::core::operator-(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#af5d865528989ca66b3d357e5ce4e0300',1,'mlx::core::operator-(bool 
lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#afb784b960f55aeb4edd7f567fa74d443',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a29cbacf4b399c24728fb0808fad498f9',1,'mlx::core::operator-(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aececc0e451237aa6c0d1a2c3d828c86e',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a383a26cc2689c98fd6c4435ade8dc669',1,'mlx::core::operator-(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad6311ef8df59bdfb212b5cf8169246b2',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a23b7329bc1c93c8ac0a1f576565fefb0',1,'mlx::core::operator-(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad8d650bf63998abd716ee0ca28e1cbb9',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a7339b33201254e9119d99d3a728ded72',1,'mlx::core::operator-(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a064318b7a16e5cb6d0a6407501b5c7dc',1,'mlx::core::operator-(_MLX_BFloat16 lhs)'],['../namespacemlx_1_1core.html#a7bae3ff296d9a60ff3c7e448f7fbc6bd',1,'mlx::core::operator-(const complex64_t &amp;v)'],['../namespacemlx_1_1core.html#afb5069ecebdfd9d388c26f83df12c93c',1,'mlx::core::operator-(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d126e3f3fa9f8c1c1ae1b09f94df487',1,'mlx::core::operator-(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad04f1ccd2cd7c487a2f2aaa055939f64',1,'mlx::core::operator-(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a15eb2ea76508ff823fa0591e811d0b7d',1,'mlx::core::operator-(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a96d9577db38d6809d022893e32feeda1',1,'mlx::core::operator-(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5d9c02765c1672930757416411567bf2',1,'mlx::core::operator-(_MLX_Float16 lhs, bool 
rhs)'],['../namespacemlx_1_1core.html#a6105d3b5266666b7c6bb9469285a9ec3',1,'mlx::core::operator-(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a777aa772dfb205b25d26f3180d98a2f6',1,'mlx::core::operator-(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a085eb092f4ada47f8169de62886cff90',1,'mlx::core::operator-(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab25e5d211e2c8785b45c3a81a6282e2b',1,'mlx::core::operator-(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#abf5d09561a81b0f0b32d59d77e32e16f',1,'mlx::core::operator-(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4ce6867dbb4d1631d1870dac14022dbb',1,'mlx::core::operator-(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a8a049e646e0442064cfe9e202d7047c5',1,'mlx::core::operator-(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a78e2a1cfc65453185bcca13bd4f523cf',1,'mlx::core::operator-(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af143cf68673e06390d4bb2ec2892bd22',1,'mlx::core::operator-(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a46d502dfe0b027955950d4e716c2eb26',1,'mlx::core::operator-(_MLX_Float16 lhs)'],['../namespacemlx_1_1core.html#a2631e78c6f0a602f6754ac577ec75f83',1,'mlx::core::operator-(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a73d79cbd75d543d0837b8a51bf103f9e',1,'mlx::core::operator-(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2d_3d_29',['operator-=',['../structpocketfft_1_1detail_1_1cmplx.html#a12441ff423274bd1b54245933d69ad7e',1,'pocketfft::detail::cmplx::operator-=()'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca',1,'operator-=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac2f1e1f2365cfa531b1519aa9ff67695',1,'operator-=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a513501355a5912a1263fd8b10864142b',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab4f4ecd62c3d8b3363d02019573dc9f1',1,'operator-=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a92d1348f201d78fcd474f75d5b23ef68',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3eefe9a7f5fb226335ea687012f32d5c',1,'operator-=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef62c7e3e494b6a511a7833c0d942a60',1,'operator-=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad30726cc8b69fd300d33c2a46e123c28',1,'operator-=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8859b5b8dc241e4f58243c85d2630cc8',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7003e1e5881e3d106257f22b6a3e59fe',1,'operator-=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3165e37d393be50c2cfa9ddcba153684',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, half 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a76f5bd895b7214cbc3cea3440992718a',1,'operator-=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7167343d90eb70e5a0d5fa9ec5398e94',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9b31c363ebc93d592b6fa0e27b00335a',1,'operator-=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a967a1d7b5664f616e5b6f2d257367f0c',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aff19193e1b2cee29a8737318e95cc74a',1,'operator-=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aede0cc4179507b739849948f1a2fed4b',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e1a6056f9c96f3c89fe204dbf103be5',1,'operator-=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9d06cceea5c179bcc608452188bd7d6a',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0aa9ffe056f49fda181bbacbd60556ea',1,'operator-=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ada5685d99c2d6708d1c4ef826d68e879',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a726cecf778b8584b6f7c37db1b064576',1,'operator-=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3816a35f8468156d59c239256c12dcf3',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa332fae098e7c6dc23b98bc0026f1070',1,'operator-=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afb3cd302e0b78902c62111dce4494fe8',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abb884888f14086cc674657677cb4b8bc',1,'operator-=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a38bb89f925eca4f9c042f6ee7a2c0193',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac30c580713f354916088a7dc049ae4cd',1,'operator-=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a715c824ee8c87e0256114a85624d9949',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7bc91aaaf476a37063264d1d53d862cc',1,'operator-=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab155f418f15cabd86ff942c6f9472ddb',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aaa66dc6d7b2c5efbfaa97ca9c7872bd8',1,'operator-=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a696978d9401e09200045b2d8aad045c2',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae998d8f423a9fb73405cfbd4b836bc72',1,'operator-=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a279d09ab8542f1c1a8dc8173b65946b6',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a491dadfae957cd7cc0c36188d910f6f6',1,'operator-=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9a837c3b9c4e42f53d7cd1ed0d266e2f',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acf7af2284269544064b68e807064bba4',1,'operator-=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a28d297705e29009197418546ef435393',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a948579a4d9ba276523190b03b09578fb',1,'operator-=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a4b98a0a11db5b77cf9168df37c8bc7',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31a3d8f2ff8038f7e0d717845c039808',1,'operator-=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1dac193d9f1c8c0eb4473441895f8c58',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad2817d53fdd4b112babfb6f0b38c8f39',1,'operator-=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa705d87cf4b78e9d7c6b07dd0c66cac6',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a542affc376726840647a6e93acf2c1a7',1,'operator-=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#add18cfe4c0d38e95c6dff6bab3e7a932',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab1de7e7e7304ff3598925d2e69134764',1,'operator-=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c',1,'operator-=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adda64cae388baac1f138b06dc8595237',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af20874a61c6c3f4c3fd045a96e806644',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a8b8a55690df46d97fcfc2a60120783af',1,'mlx::core::operator-=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab03949b1f60fa035ce454a894cd73ae9',1,'mlx::core::operator-=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adaf70bbfb3667df0d08fd3c99896e20a',1,'mlx::core::operator-=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a321c98e5a78621d3c9a3895f707f2f1c',1,'mlx::core::operator-=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_2f_30',['operator/',['../backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35',1,'operator/(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c',1,'operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aacaedf12f862c76457133336dd6fc446',1,'operator/(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a584a513596de20663dad951a5b81695e',1,'operator/(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad8f7b11669736fbd6ed2e28211d877d4',1,'operator/(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a59515695ebc48844345fa5120511aed1',1,'operator/(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8c8ac6736440fdca366ebdefe2a12b9f',1,'operator/(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad6859b04680d0d26d75fd6c4dd74ee24',1,'operator/(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4720cc79ab2b8e39952ea9ef20e51250',1,'operator/(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a72d10ec0e62949247da129eb3a83fb9b',1,'operator/(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad6399ba2b8708899739b4cdbb44add8d',1,'operator/(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a998b1ba877a606aedf722ab46b290403',1,'operator/(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa3277ae33976c70f7bd937ddff027b72',1,'operator/(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa708a970a200822c99c0489f389469fa',1,'operator/(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a6bde717aca2051499f73a3eee199bfdd',1,'mlx::steel::operator/()'],['../group__ops.html#gaeedf77f722b394429f1a7f6c367883bf',1,'mlx::core::operator/(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7366ec7f453be2a4dc449f0faa1bf554',1,'mlx::core::operator/(double a, const array &amp;b)'],['../group__ops.html#gadfb324ae9b4feb2c7ea0ac6ade639f38',1,'mlx::core::operator/(const array &amp;a, double b)'],['../namespacemlx_1_1core.html#a7573ac3b93ddecd69e9c88a26fc84ba9',1,'mlx::core::operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a40e868dad70401d9aa9ee9c32235c315',1,'mlx::core::operator/(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a7587c28fbd2023b134e5fc12bb0dde23',1,'mlx::core::operator/(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a92cdd377c408becf4cf83c1ee9b7085d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef89566301cb133d98c8e7bdd2b7bec6',1,'mlx::core::operator/(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a94e7b51185590492b46916685641276f',1,'mlx::core::operator/(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a04584788c08180835219d0ea1e2b97b1',1,'mlx::core::operator/(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad5af96e2ff09d207eb1e1980fe3e7c2d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac2217bf760038cd011781158923149ed',1,'mlx::core::operator/(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aea414c04bddc4b9b609262e97398f1b4',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a27fe23230cd082c0363b9451b731ce6b',1,'mlx::core::operator/(uint32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#abdd9bb8fb4411e5924f3eb7ef1bb52f8',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a50bae338a7353f8b0ed3441071bb0cf6',1,'mlx::core::operator/(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aab26a3284dd3ac7d47c8b5b3a3290ce3',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a749f48db01de38f259a0c6750a97fa77',1,'mlx::core::operator/(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a32a6a08a2a4652975b0a1bd1fcf3eafd',1,'mlx::core::operator/(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4b66fb38ddc5cc0c2489583d5c499602',1,'mlx::core::operator/(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a45726f1905b709cf8253e6efa046027b',1,'mlx::core::operator/(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afd4170c1e364384f30e6bae341146fa6',1,'mlx::core::operator/(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef85739d150b9d5609973da8a3f1086a',1,'mlx::core::operator/(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af52a941f8ed9b25eec91402c7b9e281f',1,'mlx::core::operator/(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a477cade78296bc85894170f62db68870',1,'mlx::core::operator/(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a22f5a2257e11423fc2fe18e2dce91590',1,'mlx::core::operator/(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a640d3574dfe6ad934c720ae8bdd78bfa',1,'mlx::core::operator/(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6f65d8fd0cdddc96fc01f6af95804873',1,'mlx::core::operator/(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a517019d42d4e426b7b98e1c719bb47ce',1,'mlx::core::operator/(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0beb7a223c542015a4eff4aed814a9dd',1,'mlx::core::operator/(_MLX_Float16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#abc9b1bd5018d46514bc19d23db2e5063',1,'mlx::core::operator/(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af22937df654ddbd6e398ef12764d18c0',1,'mlx::core::operator/(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a775aed5f49b530c57e71cbac81404d45',1,'mlx::core::operator/(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a97efcd96d6be666e5608034ae77289ef',1,'mlx::core::operator/(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a899851f85dbddd96f9d36319b82542a0',1,'mlx::core::operator/(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2f_3d_31',['operator/=',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095',1,'operator/=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a90a1c5130db515db48624d8587edbb91',1,'operator/=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a65f30a2dc199134e35bc7c5d431b2263',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7172d84db640e6c49dff0d08dd64b53e',1,'operator/=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acf7cb9927bf09022088401923f2e1916',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a86b2a001cbec0d3a8d762a3c7ff47b0b',1,'operator/=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a744f72ba83522fe3cc2a49a007b42543',1,'operator/=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a77c678665b34df7652dcde053ca73185',1,'operator/=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae0614b6b199d8a65ae95d4621b118b82',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa846fde89c7d2d18b18ef180a8a9c8a3',1,'operator/=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08e778be18e4a291c108fcc528b981d3',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6b9e49ad9ea256d2d0220c0d81552602',1,'operator/=(threadgroup half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab933bc3cdf9adfea10ab9dba5292c812',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a25e7c5d2ecf3375756d59074f333858f',1,'operator/=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ae4a80fde67eea9a0a37b2803946544',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a912393b7208fa45bd1e87f30b218b68b',1,'operator/=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a18963246f2b640874bef6dca7049f64d',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0e2c2c2cb50b3a55ff213f18978aca35',1,'operator/=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a64f1136b17006f168ef837e17240814f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae46d75b8046d557452d74513f1106710',1,'operator/=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08d2460e259b9106d90d889481ad60d5',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f7fd418408806ef498745c6fdb2c062',1,'operator/=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac703495cb370b52526a5a2d36ae26038',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ca11d43174baf0a729f93b35eabcbea',1,'operator/=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f835a0a80c411580c97b65fdc5bdfd3',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a17f47ec9cff60f8e1b3477a2793b7ac0',1,'operator/=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5be23e296bbed3a885586a6424b1666e',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afba39221eb54e272aae79910b3cd7ef5',1,'operator/=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac057d95a2bf087575584aa6f9a2c6bf5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab986ae2cec780a1f494b7b4468b7ba11',1,'operator/=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a44522c2304c6396bbe6b9d32000f4b6f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef8e7e499ea9d432aa743d83c076f945',1,'operator/=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3a0a3edbf1ba2314551454059c3f422b',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acb9f0aef9fbdfde8a4f46e33b0d6c52f',1,'operator/=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a303dfcc81ffd355f866f863d7d9f0fa5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a359edd4bcb8776861ceb26a3005624c0',1,'operator/=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc9f32cc6f40768df4285fba2e4783c7',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae71f66d814a03f6377c9d86cf0a2b5d7',1,'operator/=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad0125b6baba3065a87a174ec27aa9a61',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5cc74ad3e522d7104e6e2117751151ad',1,'operator/=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab3b594321fb42b0c2da99954d1e0976c',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4a0023e2fd08875156cd6ef747fbb5cd',1,'operator/=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4358ee606e66ba2081fcf94f9c3b5915',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1e7ef6f065695d4b1d017547b60ef62',1,'operator/=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a284dfc702f0f67b9c233b87162eeabdd',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab8f211ea896fc5190004f3ad6ad8932f',1,'operator/=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e1bcf3bc06cbcbc304c0cdf729802bc',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abbe42648a46092137b303ccd08f7df86',1,'operator/=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c',1,'operator/=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a94686039356dfa9aa45608a8b0562fdc',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa251d6483d3b099d1b5311fbe6f0bce2',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a045ff27257cb6d8ab7a94771ba5a17e6',1,'mlx::core::operator/=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a58112951a56a0f9f8c90b60fe74f9508',1,'mlx::core::operator/=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae736defc89a04fbaf7627ad2695bb838',1,'mlx::core::operator/=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab1f260710251256ef737dd59be9e143c',1,'mlx::core::operator/=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_3c_32',['operator&lt;',['../backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058',1,'operator&lt;(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25',1,'operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aab02c65bc38ea66335b2192ead4095a8',1,'operator&lt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae91686513e284bcc9635833744bbdda1',1,'operator&lt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2486f3b5de85b0d57f458d8f21f82b42',1,'operator&lt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a435a2aec4c777b4b184ff5d24992e8a1',1,'operator&lt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abdd04257e6a73883b5f56f1186d0e906',1,'operator&lt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a69984aaa05ae1d4fccccf7f57e8ecb4a',1,'operator&lt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a501cc01d5bf15d9f03aa28545f9624ea',1,'operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1b029e4ca72125a5f9471f582c819705',1,'operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0736a76f56578d26ba1422dc8b744a18',1,'operator&lt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24b1fa8998c892f90f8dde7c34fb10a5',1,'operator&lt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af80ff2020ec2c4b406c5fdae3fe55e63',1,'operator&lt;(_MLX_BFloat16 lhs, 
uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac03f6eefb836373d37dc280b0d813d78',1,'operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#adb5f24b57d98214fc215a06475f21412',1,'mlx::steel::operator&lt;()'],['../group__ops.html#gaee41e2b8f61d563200ff03575ac1d6c3',1,'mlx::core::operator&lt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga1ef8ea11cf15ce628c54201fa42748ef',1,'mlx::core::operator&lt;(T a, const array &amp;b)'],['../group__ops.html#ga95e72226dc7a79c40b3d16f990922050',1,'mlx::core::operator&lt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a987d631e1508e8df55d98ddd57e4d086',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad3fb46370cd8f0992866fad9e2c64a3c',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3026691bf7ee5095243a8611bf3411aa',1,'mlx::core::operator&lt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0d42d6c1d5f77a96e2f296b8ebd79ee6',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab5ce08a7de0a0ca00d61f7a7f8ea3ab4',1,'mlx::core::operator&lt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abce8b7f24b61e5ec0f9a3afe20845caf',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#aff97612627ae1ed260c43c0a7af0d306',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a9119e518234df7923cae2b3802d59bf2',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#aefb9b05ce8864ada99a920ab32017b89',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abc55f3676c2d112a6e9ab276bd6b1796',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#afe6581a2c45f24d7fab1e4006c1e3c70',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aca1d50cdd9506481dcc4cd1ad4a4f734',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a310720f513b6a2490e9df80c65f1bfb3',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a29e457a170b6cefb6ba1e394c96c6f7b',1,'mlx::core::operator&lt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#afd4519985b6b207ec41ad8530d1036df',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae1e41ca94022e43a00cdfc5845102daa',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac80f4022bffd95b57526685ce8e1cbc1',1,'mlx::core::operator&lt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3a8f6f0af477788c4f0aa98abfc5f1ab',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a3728ed9b6cbd152bf675251a0501b466',1,'mlx::core::operator&lt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5b9ad811a5e1358100c5423dd70ea387',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a5c77e1db83995d3e06a8a26265bce5d6',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab8a0a3f70664049b35ce1887bd8ff5c2',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6652d93bfb2d426e261a1712a181a4d2',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03758b8d13da2de07cc4f4fc45d2854b',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a325161b81a9ff179fd37d949780a17ba',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#a92eca79fce8233e4299343eee3996511',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#adb016662b8f7eb680abfe1a421eabe72',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_Float16 rhs)']]],
   ['operator_3c_3c_33',['operator&lt;&lt;',['../group__ops.html#gad656c30f9fd7d9467e405657b325aa7e',1,'mlx::core::operator&lt;&lt;(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#a1e5c30e316afa30c14bc48b92afdb794',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Device &amp;d)'],['../namespacemlx_1_1core.html#a4ddd07021b36c848d6fb1dd9ac276822',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a0023c267cf81345fad65e7a797954cd3',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Dtype &amp;d)'],['../namespacemlx_1_1core.html#a1fd58658474fb842d648dcf8f7d9f078',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Dtype::Kind &amp;k)'],['../namespacemlx_1_1core.html#a123331f01188bd76e37623b63b6b4340',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, array a)'],['../namespacemlx_1_1core.html#a4e733bba89760abed32393e085812b22',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; int &gt; &amp;v)'],['../namespacemlx_1_1core.html#a6276bb9bad43ed4a27a1e2c3f5bfd990',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; size_t &gt; &amp;v)'],['../namespacemlx_1_1core.html#a5e5bd5c57b1cf19776bdb41e732861d9',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; int64_t &gt; &amp;v)'],['../namespacemlx_1_1core.html#a42a19c8442b173606e714364227e7d45',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const complex64_t &amp;v)'],['../namespacemlx_1_1core.html#a57eb97a5eba99a846ac429795e407574',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const float16_t &amp;v)'],['../namespacemlx_1_1core.html#a7db909d54cf07375e89424c32c07a29c',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const bfloat16_t &amp;v)']]],
-  ['operator_3c_3d_34',['operator&lt;=',['../backend_2metal_2kernels_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05',1,'operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5a81eae168dfafd299c2b94e3e8558cf',1,'operator&lt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0f486bf02c6ad5b9b6a96d3450f03e47',1,'operator&lt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acba9efe192d22b7781b4622103c7a944',1,'operator&lt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aff100489cc40ad276c2d5d67a9df67db',1,'operator&lt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7eac96f64ca42991caf819c8e8c8d2bc',1,'operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a88c11cd37600de5480570da3d2ae5732',1,'operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08c7d12a0d16565fbf052dba2db8b22d',1,'operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2b9de9624c0a507b4ead85f898ad9daf',1,'operator&lt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a28f8d21c5eef047c701cf690ce9c2ef0',1,'operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a14b56c687053ee2432398a25663c068f',1,'operator&lt;=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0f360806708b95a3be400af0b8871b57',1,'operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a80d288f22cadfdf5e904410349e616a1',1,'operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0',1,'operator&lt;=(complex64_t a, 
complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a6cc3bab5e7f6e7c719c82afa90ad2827',1,'mlx::steel::operator&lt;=()'],['../group__ops.html#ga4c8b8a1632944acaae50f0de6c23ece6',1,'mlx::core::operator&lt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga150a9be467c9f91482a6d6fc13504bc4',1,'mlx::core::operator&lt;=(T a, const array &amp;b)'],['../group__ops.html#ga624eeccef0cc4b130e1325abfea057cb',1,'mlx::core::operator&lt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0066a47cb21223ddebc77992ee874fb9',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2593dbace3ce50e7146d9514726a543f',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a88654bcf6c9728517a2933ca2e29a7c1',1,'mlx::core::operator&lt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a5d4f449e9c1699b99fcf894dd15e8af3',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a6b678bea8fdcda1f11c6691b56a15211',1,'mlx::core::operator&lt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae8aacc606ea16f018a90eae758830a35',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a25668dea4ffb51c7c00eeecb9530d1d8',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a084558b6a5487549799c49c37c9e9652',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ade2e2a0daa79d5c52f278f85f03dde2e',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a750a2d2b4976ad94b08994d081f83445',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ade5a175ff45347689ac4c798d04c8ffc',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#ae25e0c01b46612f039313a4825ba6428',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a5c90f16d8f6edf4b75c96b945b9fa591',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8cd6583fa0fc9957f993e00b2ec01d91',1,'mlx::core::operator&lt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a012130a0458cbc30b88365e0e0eab232',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae8c890bdcffadee8c5dab85c907f57eb',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a43cb070553c1f2fffb32ef6670e30980',1,'mlx::core::operator&lt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ac759b7798d668a99535e59e26d6ba192',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a70e528a789b5660d98e783b045aaa379',1,'mlx::core::operator&lt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a40bd8abb8a4d989ddabbb298518bd7f5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4155d4b0c76f37ab5e0b54f9cd683f35',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad8bb648d0603a206e0392990c911ca0b',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ace72a5853f2afd6510dcb97d54fa650d',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab38f7a0d3c0809071ff5d3af859018d6',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a7904b886d7b535a6af0a885d00597323',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a57952168bd0b54c2677204d4ab1cb6e5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#a6235dc5f4db517618bb3449b08c96e8b',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3c_3d_34',['operator&lt;=',['../backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0',1,'operator&lt;=(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05',1,'operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a81eae168dfafd299c2b94e3e8558cf',1,'operator&lt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f486bf02c6ad5b9b6a96d3450f03e47',1,'operator&lt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acba9efe192d22b7781b4622103c7a944',1,'operator&lt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aff100489cc40ad276c2d5d67a9df67db',1,'operator&lt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7eac96f64ca42991caf819c8e8c8d2bc',1,'operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a88c11cd37600de5480570da3d2ae5732',1,'operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08c7d12a0d16565fbf052dba2db8b22d',1,'operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2b9de9624c0a507b4ead85f898ad9daf',1,'operator&lt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a28f8d21c5eef047c701cf690ce9c2ef0',1,'operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a14b56c687053ee2432398a25663c068f',1,'operator&lt;=(int64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f360806708b95a3be400af0b8871b57',1,'operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a80d288f22cadfdf5e904410349e616a1',1,'operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a6cc3bab5e7f6e7c719c82afa90ad2827',1,'mlx::steel::operator&lt;=()'],['../group__ops.html#ga4c8b8a1632944acaae50f0de6c23ece6',1,'mlx::core::operator&lt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga150a9be467c9f91482a6d6fc13504bc4',1,'mlx::core::operator&lt;=(T a, const array &amp;b)'],['../group__ops.html#ga624eeccef0cc4b130e1325abfea057cb',1,'mlx::core::operator&lt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0066a47cb21223ddebc77992ee874fb9',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2593dbace3ce50e7146d9514726a543f',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a88654bcf6c9728517a2933ca2e29a7c1',1,'mlx::core::operator&lt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a5d4f449e9c1699b99fcf894dd15e8af3',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a6b678bea8fdcda1f11c6691b56a15211',1,'mlx::core::operator&lt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae8aacc606ea16f018a90eae758830a35',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a25668dea4ffb51c7c00eeecb9530d1d8',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a084558b6a5487549799c49c37c9e9652',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ade2e2a0daa79d5c52f278f85f03dde2e',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#a750a2d2b4976ad94b08994d081f83445',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ade5a175ff45347689ac4c798d04c8ffc',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae25e0c01b46612f039313a4825ba6428',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a5c90f16d8f6edf4b75c96b945b9fa591',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8cd6583fa0fc9957f993e00b2ec01d91',1,'mlx::core::operator&lt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a012130a0458cbc30b88365e0e0eab232',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae8c890bdcffadee8c5dab85c907f57eb',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a43cb070553c1f2fffb32ef6670e30980',1,'mlx::core::operator&lt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ac759b7798d668a99535e59e26d6ba192',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a70e528a789b5660d98e783b045aaa379',1,'mlx::core::operator&lt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a40bd8abb8a4d989ddabbb298518bd7f5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4155d4b0c76f37ab5e0b54f9cd683f35',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad8bb648d0603a206e0392990c911ca0b',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ace72a5853f2afd6510dcb97d54fa650d',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab38f7a0d3c0809071ff5d3af859018d6',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a7904b886d7b535a6af0a885d00597323',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a57952168bd0b54c2677204d4ab1cb6e5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a6235dc5f4db517618bb3449b08c96e8b',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
   ['operator_3d_35',['operator=',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a027b84cddc8d476f736ac1f1a9991fe4',1,'mlx::core::allocator::Allocator::operator=(const Allocator &amp;other)=delete'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a2e971b47339b1d0849a334a902a9df3c',1,'mlx::core::allocator::Allocator::operator=(Allocator &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1array.html#a8acf2b4c75f9b7f79da6675dbc36cf36',1,'mlx::core::array::operator=(const array &amp;other) &amp;&amp;=delete'],['../classmlx_1_1core_1_1array.html#a5c89c2406a610b32943955f9a5060fbd',1,'mlx::core::array::operator=(array &amp;&amp;other) &amp;&amp;=delete'],['../classmlx_1_1core_1_1array.html#ad3277ff68f1336aa217f9cbe40181479',1,'mlx::core::array::operator=(array &amp;&amp;other) &amp;=default'],['../classmlx_1_1core_1_1array.html#a5da41aabecf4c8055b7515341bf57147',1,'mlx::core::array::operator=(const array &amp;other) &amp;'],['../structmlx_1_1core_1_1array_1_1_data.html#a68e9417954fe811b5e41e6317a526748',1,'mlx::core::array::Data::operator=()'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e',1,'mlx::core::metal::CommandEncoder::operator=()'],['../classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73',1,'mlx::core::metal::Device::operator=()'],['../classmlx_1_1core_1_1metal_1_1_residency_set.html#aef97dbbc755940789f99a26164591c45',1,'mlx::core::metal::ResidencySet::operator=()'],['../classmlx_1_1core_1_1_primitive.html#a6b1be7ea92f3a7bb19875c70259dad6b',1,'mlx::core::Primitive::operator=(const Primitive &amp;other)=delete'],['../classmlx_1_1core_1_1_primitive.html#a50bbddd43e1ba0cf5f127cd7aa756a9e',1,'mlx::core::Primitive::operator=(Primitive &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#a0a859309a4f192f2679e07f2e4ff4d22',1,'mlx::core::UnaryPrimitive::operator=(const UnaryPrimitive 
&amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#ab90b2ea80f1d914be03cf44def5db5a5',1,'mlx::core::UnaryPrimitive::operator=(UnaryPrimitive &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ab170dbd2ce34c51e2eeebf5d08e7e2db',1,'mlx::core::scheduler::Scheduler::operator=(const Scheduler &amp;)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a035ea35f4dd8ee985973080f14029379',1,'mlx::core::scheduler::Scheduler::operator=(Scheduler &amp;&amp;)=delete'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#a0f65b0523b8ddd989f338da6cb2860e3',1,'mlx::core::_MLX_BFloat16::operator=(std::vector&lt; bool &gt;::reference x)'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#abb8cd44ee22b17c55333ff2eb4e13a14',1,'mlx::core::_MLX_BFloat16::operator=(const float &amp;x)'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a608a099bf7116ee608dcfd31ea3ade2c',1,'mlx::core::_MLX_Float16::operator=(std::vector&lt; bool &gt;::reference x)'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a35543c3653d477c46350697fb808373d',1,'mlx::core::_MLX_Float16::operator=(const float &amp;x)']]],
-  ['operator_3d_3d_36',['operator==',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a1afd6d2a19a2b0d712063f221ab4eba7',1,'mlx::core::array::ArrayIterator::operator=='],['../backend_2metal_2kernels_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065',1,'operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0aa3bfcfab53700488e5f386e6de60d5',1,'operator==(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3936148781ab1c4f33f58d12c116f370',1,'operator==(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae753526b669fba27771089dc809abd66',1,'operator==(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a05a4f197a71d0f16879032f44492bb79',1,'operator==(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae86f5917847b1ec9f313996250f2e0be',1,'operator==(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aab74ec4d33a64b92b908717d500f1ecf',1,'operator==(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac30a2c1fa6f172af903fdeb6a8632606',1,'operator==(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab4e9ad547aa23daa351075e0ecc58fa2',1,'operator==(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa5fa1a8f2b39c3508fe38205469756d1',1,'operator==(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aeadc1f36c6bdc219294ce9341d80afa5',1,'operator==(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3ae2091ada1e39e857fbc53c97bdb79f',1,'operator==(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac7b4d295f3c7b1e09964f24f306422da',1,'operator==(uint64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190',1,'operator==(complex64_t a, complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#abcc797f27e87e857b41c1a8d33ee2c78',1,'mlx::steel::operator==()'],['../namespacemlx_1_1core.html#a937503d72b66c661bf3f5fdcd98ef97c',1,'mlx::core::operator==(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#gaa30cf69f3d22f65615f5e1696dd5703f',1,'mlx::core::operator==(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf115782d009ac2a547fcca395c9ec797',1,'mlx::core::operator==(T a, const array &amp;b)'],['../group__ops.html#ga3ad3ed7aece2650943a35082dbe3a0a5',1,'mlx::core::operator==(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac470f937a379d6356c8f567c97cd7481',1,'mlx::core::operator==(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#aec63a0472cb943fe39f31e7678555572',1,'mlx::core::operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad05311ca8e2f19ffe5849e963837cec7',1,'mlx::core::operator==(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aaaf591cb2188381e6cbd857132d04eb7',1,'mlx::core::operator==(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7ef33c33509ccccf1ab217500e8b3c1a',1,'mlx::core::operator==(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abec4200a718b7c5ed80b7abcc4447260',1,'mlx::core::operator==(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad853981b1c5ba69b07d54c7b77055d22',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a752d6cb4172a9cb91e5da19582329c6d',1,'mlx::core::operator==(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0175beb3de139faa08479a88215b35ea',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a61da2851cb3beeef28049228346c28b5',1,'mlx::core::operator==(uint32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#aa24713cb9e39bacb516c992eb03d2b2b',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a6d565dd93c46259f9486d9fdf0969589',1,'mlx::core::operator==(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a24e79a82557861de64dad66d36e6ff30',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af27d515ac390d62bd852b73ea759a947',1,'mlx::core::operator==(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae3e1e8b7a5410e0edf35f31f74295e2f',1,'mlx::core::operator==(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aaa22230a66b15c3e774d8ce45783a746',1,'mlx::core::operator==(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ae2a0bcdc171d7e9745d33e1d9aac4f8a',1,'mlx::core::operator==(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a331ec62442a8d3eb8ccba7b4de5168d1',1,'mlx::core::operator==(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#acfcaefe0990eb3533e2b11a6f2657492',1,'mlx::core::operator==(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d48dbd49cccff07777affb2a412058c',1,'mlx::core::operator==(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a88eae27edd22fa4418776672023cb276',1,'mlx::core::operator==(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a188b363f633ea360407b3f9cf4e1f1a6',1,'mlx::core::operator==(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ae065fe5c42c1a333d7858d19f6434fa9',1,'mlx::core::operator==(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a2f98db199deb6d7a82551fa4afec655a',1,'mlx::core::operator==(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a85f83add412cb320b5cd1c3da6aadbd5',1,'mlx::core::operator==(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7e2cee66c3ca1b56f4f3d7fd1d6e0be1',1,'mlx::core::operator==(_MLX_Float16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#ad436557da5c7fea71fc58182a876cfe5',1,'mlx::core::operator==(uint64_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_3e_37',['operator&gt;',['../backend_2metal_2kernels_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57',1,'operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab9cd098786d2f4c855c42e4a6f30ab3e',1,'operator&gt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a55600f3b9859e2891e0e0b5690867b72',1,'operator&gt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afd7cdb8ed2a9820efe9cf322c06f188c',1,'operator&gt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a31bbdbe0b62b90a4d6ea4bb0a7db586b',1,'operator&gt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a68125e66f74eaffe5ea9267638ce870d',1,'operator&gt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac89eb6b29edad8cca63727ab97171c29',1,'operator&gt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a74e477567c9477c2cf0684f81ef4498f',1,'operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2d37130b6fd79b425f5ba92b65e36bed',1,'operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a41d55d167e9dc63bf29d15e0ff004869',1,'operator&gt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa95f9ebfdab3c5f524775651362ce914',1,'operator&gt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2826bd301bb5393473ccd363f2052c0d',1,'operator&gt;(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a62a512d0edd894759c69f724b970fbdb',1,'operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995',1,'operator&gt;(complex64_t a, complex64_t 
b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a7512eadda6160e4c9d9e6aa4049fac20',1,'mlx::steel::operator&gt;()'],['../group__ops.html#ga74fd2777adef10e6fe628a9cdadb01cb',1,'mlx::core::operator&gt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga32e106e794e2c32e4e7decee2df2477f',1,'mlx::core::operator&gt;(T a, const array &amp;b)'],['../group__ops.html#ga96552b90e89923c5d2064cc427775ec5',1,'mlx::core::operator&gt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#aedc4e9df4bf71c0ac34fcfae60cdf550',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a14c188303d09b97867bcfd34519aa4a6',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac97736fadafa7efa201624d0e1128ee8',1,'mlx::core::operator&gt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3c41a304126bc225bdc68062d1eb6e7e',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab594f3ae1ee13227fae940fef0d00cb9',1,'mlx::core::operator&gt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a01dabc077a872c115a9a9ccd95f1acec',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#adabbd8768d216873617768249473a5c7',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adae1b14669d27ce1fe0c214771c07b77',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ab03a22961d99fa12d3e74b3116e94e8f',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a42011a27a3d23a60be5be44ee7cac87c',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a50f6a94bb36d89cf28817aff88ab89c8',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac173de50ee57b1b066d49363ba978c53',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#ab09f1b4879aa3190c2f66c9bd1224021',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a91eb6ca854217424129a55ae95a123b5',1,'mlx::core::operator&gt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a58d5795d8312599d101ae16f194e4a2a',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aafa3bbeda78610c4285f3e57042268f3',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a8a928d76a6fbf3d336296401e14617a4',1,'mlx::core::operator&gt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ade2f9222fd433cd4d673c6182f256235',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ae24c337810c841ff23e327efde7045e1',1,'mlx::core::operator&gt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acf401ede354fcc998b13ea6442994d7e',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a2bb28a9a0894a73ae1b27e7f4da0841a',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a09d631e8a85fd7ae72e1a868b8f9b9cb',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a49421ea65b5a98df080d75b1636b2157',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a692ce931b660415e17f92d18a8e0d446',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a579bb87b3ede5663d7cd68c7c0f6fb9e',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af810587a17e692f4eec256d3c3cd27de',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a50f4177d3ca03a95fc2614e100c7391d',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_3e_3d_38',['operator&gt;=',['../backend_2metal_2kernels_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f',1,'operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a64f6787a96386246f83a8981d274150e',1,'operator&gt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1a788f82212afad30e4c2ee40f1c313c',1,'operator&gt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae88617c4a012c5dc12781a349a28c886',1,'operator&gt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a467a88531150a4d9d30fce07c49c126e',1,'operator&gt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9e21c5ea9dd724dc2ca8c54ad908f09c',1,'operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2f6286d222e2176bcbdc824c5d598100',1,'operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abec53064aa96265385ecc57de5fbc74c',1,'operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac766839f8f9e4863e8e18418c342c875',1,'operator&gt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2807fa6862b0f9689c81199b1e695ed8',1,'operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aee3ae0d0d1f941463b06eca0bf041b2b',1,'operator&gt;=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a523eda93c809733368e2b45382d2add6',1,'operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1f4e90909ac1c7280f4c7d1977c55fb7',1,'operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46',1,'operator&gt;=(complex64_t a, 
complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#aa3c95c60cf69603705bb4636de547bcb',1,'mlx::steel::operator&gt;=()'],['../group__ops.html#ga3a41895f25ed083a36994d95fa102546',1,'mlx::core::operator&gt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf509f2cb3b18963232f20d6c3bd229b2',1,'mlx::core::operator&gt;=(T a, const array &amp;b)'],['../group__ops.html#gafa0eb25d5978674bfc9e59d4145ec590',1,'mlx::core::operator&gt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a8494764f5c686743ede66dc76d85d955',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a019df48807b506d9995856684bf7797a',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a96ab6405430efb887cdb5c828cb67d6e',1,'mlx::core::operator&gt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac18be72269b1bcfb0249cc00a0600681',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aeb879815228efbd2c8f80986e1c8d41f',1,'mlx::core::operator&gt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0051156f6a568f58cd54850f746fb507',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae93556906e115625ed1b62d36cf21b70',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab81ad16e3be591dfc9e42ac3c19b055f',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6cfe9b03e7c5f1eb9374208a552c3cc9',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2f5add83812fb137dd9226c6c01e45d5',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad1014a836e7ce9301de8588eef1e89ee',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#a17791561434dc995de9f268d145c0ed1',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a3755925b24a903045937464be117de2f',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a6262aeb513d27fc8313293b261e72abb',1,'mlx::core::operator&gt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a6feb4b3ea511b0eda4d1ec9725f3fb4c',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03b3f7fcb755ec075985ab26336926f0',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aecfbf5ef4872ae447eb4a374e4db28e4',1,'mlx::core::operator&gt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae4690f349b2483f5d1a4b75aba67399f',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a667e95146dd5199e67bcb121b984b1f0',1,'mlx::core::operator&gt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3375f1562f148bdc07451f2b6e54e6df',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae83df12368cb07ccb1c10c1117ff3922',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad41251938cf852b5560c1180944ebb49',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a4ddb5ef0b88929086f9b09729fda0dde',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0908a61ab261aff726922b33fa6ed159',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a0fdadf87edd8a0a57c63953fb0ebe053',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a47c82778e43032c0bbf5d59407e81dc9',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#a14e6c43b924eacca1b2dac1d5d00ca2b',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3d_3d_36',['operator==',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a1afd6d2a19a2b0d712063f221ab4eba7',1,'mlx::core::array::ArrayIterator::operator=='],['../backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190',1,'operator==(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065',1,'operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0aa3bfcfab53700488e5f386e6de60d5',1,'operator==(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3936148781ab1c4f33f58d12c116f370',1,'operator==(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae753526b669fba27771089dc809abd66',1,'operator==(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a05a4f197a71d0f16879032f44492bb79',1,'operator==(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae86f5917847b1ec9f313996250f2e0be',1,'operator==(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aab74ec4d33a64b92b908717d500f1ecf',1,'operator==(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac30a2c1fa6f172af903fdeb6a8632606',1,'operator==(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab4e9ad547aa23daa351075e0ecc58fa2',1,'operator==(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa5fa1a8f2b39c3508fe38205469756d1',1,'operator==(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeadc1f36c6bdc219294ce9341d80afa5',1,'operator==(int64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3ae2091ada1e39e857fbc53c97bdb79f',1,'operator==(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7b4d295f3c7b1e09964f24f306422da',1,'operator==(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#abcc797f27e87e857b41c1a8d33ee2c78',1,'mlx::steel::operator==()'],['../namespacemlx_1_1core.html#a937503d72b66c661bf3f5fdcd98ef97c',1,'mlx::core::operator==(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#gaa30cf69f3d22f65615f5e1696dd5703f',1,'mlx::core::operator==(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf115782d009ac2a547fcca395c9ec797',1,'mlx::core::operator==(T a, const array &amp;b)'],['../group__ops.html#ga3ad3ed7aece2650943a35082dbe3a0a5',1,'mlx::core::operator==(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac470f937a379d6356c8f567c97cd7481',1,'mlx::core::operator==(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#aec63a0472cb943fe39f31e7678555572',1,'mlx::core::operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad05311ca8e2f19ffe5849e963837cec7',1,'mlx::core::operator==(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aaaf591cb2188381e6cbd857132d04eb7',1,'mlx::core::operator==(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7ef33c33509ccccf1ab217500e8b3c1a',1,'mlx::core::operator==(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abec4200a718b7c5ed80b7abcc4447260',1,'mlx::core::operator==(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad853981b1c5ba69b07d54c7b77055d22',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a752d6cb4172a9cb91e5da19582329c6d',1,'mlx::core::operator==(int32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#a0175beb3de139faa08479a88215b35ea',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a61da2851cb3beeef28049228346c28b5',1,'mlx::core::operator==(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aa24713cb9e39bacb516c992eb03d2b2b',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a6d565dd93c46259f9486d9fdf0969589',1,'mlx::core::operator==(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a24e79a82557861de64dad66d36e6ff30',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af27d515ac390d62bd852b73ea759a947',1,'mlx::core::operator==(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae3e1e8b7a5410e0edf35f31f74295e2f',1,'mlx::core::operator==(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aaa22230a66b15c3e774d8ce45783a746',1,'mlx::core::operator==(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ae2a0bcdc171d7e9745d33e1d9aac4f8a',1,'mlx::core::operator==(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a331ec62442a8d3eb8ccba7b4de5168d1',1,'mlx::core::operator==(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#acfcaefe0990eb3533e2b11a6f2657492',1,'mlx::core::operator==(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d48dbd49cccff07777affb2a412058c',1,'mlx::core::operator==(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a88eae27edd22fa4418776672023cb276',1,'mlx::core::operator==(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a188b363f633ea360407b3f9cf4e1f1a6',1,'mlx::core::operator==(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ae065fe5c42c1a333d7858d19f6434fa9',1,'mlx::core::operator==(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a2f98db199deb6d7a82551fa4afec655a',1,'mlx::core::operator==(_MLX_Float16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a85f83add412cb320b5cd1c3da6aadbd5',1,'mlx::core::operator==(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7e2cee66c3ca1b56f4f3d7fd1d6e0be1',1,'mlx::core::operator==(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#ad436557da5c7fea71fc58182a876cfe5',1,'mlx::core::operator==(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3e_37',['operator&gt;',['../backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995',1,'operator&gt;(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57',1,'operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab9cd098786d2f4c855c42e4a6f30ab3e',1,'operator&gt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a55600f3b9859e2891e0e0b5690867b72',1,'operator&gt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afd7cdb8ed2a9820efe9cf322c06f188c',1,'operator&gt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31bbdbe0b62b90a4d6ea4bb0a7db586b',1,'operator&gt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a68125e66f74eaffe5ea9267638ce870d',1,'operator&gt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac89eb6b29edad8cca63727ab97171c29',1,'operator&gt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a74e477567c9477c2cf0684f81ef4498f',1,'operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2d37130b6fd79b425f5ba92b65e36bed',1,'operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a41d55d167e9dc63bf29d15e0ff004869',1,'operator&gt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa95f9ebfdab3c5f524775651362ce914',1,'operator&gt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2826bd301bb5393473ccd363f2052c0d',1,'operator&gt;(_MLX_BFloat16 lhs, 
uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a62a512d0edd894759c69f724b970fbdb',1,'operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a7512eadda6160e4c9d9e6aa4049fac20',1,'mlx::steel::operator&gt;()'],['../group__ops.html#ga74fd2777adef10e6fe628a9cdadb01cb',1,'mlx::core::operator&gt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga32e106e794e2c32e4e7decee2df2477f',1,'mlx::core::operator&gt;(T a, const array &amp;b)'],['../group__ops.html#ga96552b90e89923c5d2064cc427775ec5',1,'mlx::core::operator&gt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#aedc4e9df4bf71c0ac34fcfae60cdf550',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a14c188303d09b97867bcfd34519aa4a6',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac97736fadafa7efa201624d0e1128ee8',1,'mlx::core::operator&gt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3c41a304126bc225bdc68062d1eb6e7e',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab594f3ae1ee13227fae940fef0d00cb9',1,'mlx::core::operator&gt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a01dabc077a872c115a9a9ccd95f1acec',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#adabbd8768d216873617768249473a5c7',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adae1b14669d27ce1fe0c214771c07b77',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ab03a22961d99fa12d3e74b3116e94e8f',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a42011a27a3d23a60be5be44ee7cac87c',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a50f6a94bb36d89cf28817aff88ab89c8',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac173de50ee57b1b066d49363ba978c53',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#ab09f1b4879aa3190c2f66c9bd1224021',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a91eb6ca854217424129a55ae95a123b5',1,'mlx::core::operator&gt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a58d5795d8312599d101ae16f194e4a2a',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aafa3bbeda78610c4285f3e57042268f3',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a8a928d76a6fbf3d336296401e14617a4',1,'mlx::core::operator&gt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ade2f9222fd433cd4d673c6182f256235',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ae24c337810c841ff23e327efde7045e1',1,'mlx::core::operator&gt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acf401ede354fcc998b13ea6442994d7e',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a2bb28a9a0894a73ae1b27e7f4da0841a',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a09d631e8a85fd7ae72e1a868b8f9b9cb',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a49421ea65b5a98df080d75b1636b2157',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a692ce931b660415e17f92d18a8e0d446',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a579bb87b3ede5663d7cd68c7c0f6fb9e',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af810587a17e692f4eec256d3c3cd27de',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a50f4177d3ca03a95fc2614e100c7391d',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3e_3d_38',['operator&gt;=',['../backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46',1,'operator&gt;=(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f',1,'operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a64f6787a96386246f83a8981d274150e',1,'operator&gt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1a788f82212afad30e4c2ee40f1c313c',1,'operator&gt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae88617c4a012c5dc12781a349a28c886',1,'operator&gt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a467a88531150a4d9d30fce07c49c126e',1,'operator&gt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9e21c5ea9dd724dc2ca8c54ad908f09c',1,'operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2f6286d222e2176bcbdc824c5d598100',1,'operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abec53064aa96265385ecc57de5fbc74c',1,'operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac766839f8f9e4863e8e18418c342c875',1,'operator&gt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2807fa6862b0f9689c81199b1e695ed8',1,'operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee3ae0d0d1f941463b06eca0bf041b2b',1,'operator&gt;=(int64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a523eda93c809733368e2b45382d2add6',1,'operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1f4e90909ac1c7280f4c7d1977c55fb7',1,'operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa3c95c60cf69603705bb4636de547bcb',1,'mlx::steel::operator&gt;=()'],['../group__ops.html#ga3a41895f25ed083a36994d95fa102546',1,'mlx::core::operator&gt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf509f2cb3b18963232f20d6c3bd229b2',1,'mlx::core::operator&gt;=(T a, const array &amp;b)'],['../group__ops.html#gafa0eb25d5978674bfc9e59d4145ec590',1,'mlx::core::operator&gt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a8494764f5c686743ede66dc76d85d955',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a019df48807b506d9995856684bf7797a',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a96ab6405430efb887cdb5c828cb67d6e',1,'mlx::core::operator&gt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac18be72269b1bcfb0249cc00a0600681',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aeb879815228efbd2c8f80986e1c8d41f',1,'mlx::core::operator&gt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0051156f6a568f58cd54850f746fb507',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae93556906e115625ed1b62d36cf21b70',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab81ad16e3be591dfc9e42ac3c19b055f',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6cfe9b03e7c5f1eb9374208a552c3cc9',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#a2f5add83812fb137dd9226c6c01e45d5',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad1014a836e7ce9301de8588eef1e89ee',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a17791561434dc995de9f268d145c0ed1',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a3755925b24a903045937464be117de2f',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a6262aeb513d27fc8313293b261e72abb',1,'mlx::core::operator&gt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a6feb4b3ea511b0eda4d1ec9725f3fb4c',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03b3f7fcb755ec075985ab26336926f0',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aecfbf5ef4872ae447eb4a374e4db28e4',1,'mlx::core::operator&gt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae4690f349b2483f5d1a4b75aba67399f',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a667e95146dd5199e67bcb121b984b1f0',1,'mlx::core::operator&gt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3375f1562f148bdc07451f2b6e54e6df',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae83df12368cb07ccb1c10c1117ff3922',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad41251938cf852b5560c1180944ebb49',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a4ddb5ef0b88929086f9b09729fda0dde',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0908a61ab261aff726922b33fa6ed159',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a0fdadf87edd8a0a57c63953fb0ebe053',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a47c82778e43032c0bbf5d59407e81dc9',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a14e6c43b924eacca1b2dac1d5d00ca2b',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
   ['operator_3e_3e_39',['operator&gt;&gt;',['../group__ops.html#ga498b61f7e8f056ae00297fa0dc17303a',1,'mlx::core']]],
   ['operator_5b_5d_40',['operator[]',['../classpocketfft_1_1detail_1_1arr.html#aea0bd899b19e03f54dfd6c188727061a',1,'pocketfft::detail::arr::operator[](size_t idx)'],['../classpocketfft_1_1detail_1_1arr.html#a99c54f96bc79c7cdd8925c1663462842',1,'pocketfft::detail::arr::operator[](size_t idx) const'],['../classpocketfft_1_1detail_1_1sincos__2pibyn.html#a71b02f67c47b24adb296eafd2c7a3598',1,'pocketfft::detail::sincos_2pibyn::operator[]()'],['../classpocketfft_1_1detail_1_1cndarr.html#ae4852d1fe936a5d61832b507816c7054',1,'pocketfft::detail::cndarr::operator[]()'],['../classpocketfft_1_1detail_1_1ndarr.html#a2b2c4e205e8b5c32c9fe55dfd7b8c8d8',1,'pocketfft::detail::ndarr::operator[]()']]],
   ['operator_5e_41',['operator^',['../group__ops.html#gac3a6fe18694e84b3d63458e9553ac181',1,'mlx::core::operator^(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#ae36ea40b8477bfa12d41aae8245225c9',1,'mlx::core::operator^(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a03fc96696f5c6d9411841889d05f4670',1,'mlx::core::operator^(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a55130edf926366db0d6207989e609b7c',1,'mlx::core::operator^(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0b75198f364d742a1c25dd13e398f2c2',1,'mlx::core::operator^(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7f205f1b10b23180a23bf2be4bb726b1',1,'mlx::core::operator^(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a9edfe65f3c6da583c7b109290ec94b22',1,'mlx::core::operator^(uint16_t lhs, _MLX_Float16 rhs)']]],
@@ -54,7 +54,7 @@ var searchData=
   ['out_5fof_5fbounds_51',['out_of_bounds',['../struct_read_writer.html#a08e10626fbc789b6dff9172fd6c36f7c',1,'ReadWriter::out_of_bounds() const'],['../struct_read_writer.html#a6f946aea5452109dca7fc70ed39c6efe',1,'ReadWriter::out_of_bounds() const'],['../struct_read_writer.html#a8f40d7f343d32134fe27a694abfde6bf',1,'ReadWriter::out_of_bounds() const']]],
   ['out_5fstrides_52',['out_strides',['../struct_m_l_x_conv_params.html#a0c8b2cfc26859a2af9d39a2cfcc3aea6',1,'MLXConvParams']]],
   ['outer_53',['outer',['../group__ops.html#ga866af24e10db2797e1c5a5986dbf6c0d',1,'mlx::core']]],
-  ['output_5fshapes_54',['output_shapes',['../classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853',1,'mlx::core::Primitive::output_shapes()'],['../classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32',1,'mlx::core::Abs::output_shapes()'],['../classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594',1,'mlx::core::Add::output_shapes()'],['../classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974',1,'mlx::core::ArcCos::output_shapes()'],['../classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b',1,'mlx::core::ArcCosh::output_shapes()'],['../classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5',1,'mlx::core::ArcSin::output_shapes()'],['../classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed',1,'mlx::core::ArcSinh::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a',1,'mlx::core::ArcTan::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03',1,'mlx::core::ArcTan2::output_shapes()'],['../classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8',1,'mlx::core::ArcTanh::output_shapes()'],['../classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64',1,'mlx::core::ArgPartition::output_shapes()'],['../classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179',1,'mlx::core::ArgReduce::output_shapes()'],['../classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859',1,'mlx::core::ArgSort::output_shapes()'],['../classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4',1,'mlx::core::AsType::output_shapes()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599',1,'mlx::core::BitwiseBinary::output_shapes()'],['../classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea',1,'mlx::core::Ceil::output_shapes()'],['../classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c',1,'mlx::core::Compiled::output_
shapes()'],['../classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f',1,'mlx::core::Conjugate::output_shapes()'],['../classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3',1,'mlx::core::Copy::output_shapes()'],['../classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b',1,'mlx::core::Cos::output_shapes()'],['../classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962',1,'mlx::core::Cosh::output_shapes()'],['../classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994',1,'mlx::core::Divide::output_shapes()'],['../classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b',1,'mlx::core::DivMod::output_shapes()'],['../classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867',1,'mlx::core::Select::output_shapes()'],['../classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666',1,'mlx::core::Remainder::output_shapes()'],['../classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9',1,'mlx::core::Equal::output_shapes()'],['../classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187',1,'mlx::core::Erf::output_shapes()'],['../classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639',1,'mlx::core::ErfInv::output_shapes()'],['../classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670',1,'mlx::core::Exp::output_shapes()'],['../classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08',1,'mlx::core::Expm1::output_shapes()'],['../classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015',1,'mlx::core::Floor::output_shapes()'],['../classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46',1,'mlx::core::Greater::output_shapes()'],['../classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f',1,'mlx::core::GreaterEqual::output_shapes()'],['../classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde',1,'mlx::core::Hadamard::output_shapes()'],['../classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d
48',1,'mlx::core::Imag::output_shapes()'],['../classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278',1,'mlx::core::Less::output_shapes()'],['../classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f',1,'mlx::core::LessEqual::output_shapes()'],['../classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d',1,'mlx::core::Log::output_shapes()'],['../classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df',1,'mlx::core::Log1p::output_shapes()'],['../classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c',1,'mlx::core::LogicalNot::output_shapes()'],['../classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617',1,'mlx::core::LogicalAnd::output_shapes()'],['../classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4',1,'mlx::core::LogicalOr::output_shapes()'],['../classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635',1,'mlx::core::LogAddExp::output_shapes()'],['../classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b',1,'mlx::core::Maximum::output_shapes()'],['../classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70',1,'mlx::core::Minimum::output_shapes()'],['../classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061',1,'mlx::core::Multiply::output_shapes()'],['../classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014',1,'mlx::core::Negative::output_shapes()'],['../classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a',1,'mlx::core::NotEqual::output_shapes()'],['../classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8',1,'mlx::core::NumberOfElements::output_shapes()'],['../classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf',1,'mlx::core::Partition::output_shapes()'],['../classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1',1,'mlx::core::Power::output_shapes()'],['../classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5',1,'mlx::
core::Real::output_shapes()'],['../classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65',1,'mlx::core::Reduce::output_shapes()'],['../classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047',1,'mlx::core::Round::output_shapes()'],['../classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43',1,'mlx::core::Sigmoid::output_shapes()'],['../classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67',1,'mlx::core::Sign::output_shapes()'],['../classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a',1,'mlx::core::Sin::output_shapes()'],['../classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28',1,'mlx::core::Sinh::output_shapes()'],['../classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35',1,'mlx::core::Softmax::output_shapes()'],['../classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d',1,'mlx::core::Sort::output_shapes()'],['../classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02',1,'mlx::core::Square::output_shapes()'],['../classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5',1,'mlx::core::Sqrt::output_shapes()'],['../classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e',1,'mlx::core::StopGradient::output_shapes()'],['../classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc',1,'mlx::core::Subtract::output_shapes()'],['../classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37',1,'mlx::core::Tan::output_shapes()'],['../classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325',1,'mlx::core::Tanh::output_shapes()'],['../classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5',1,'mlx::core::Eigh::output_shapes()']]],
+  ['output_5fshapes_54',['output_shapes',['../classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853',1,'mlx::core::Primitive::output_shapes()'],['../classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32',1,'mlx::core::Abs::output_shapes()'],['../classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594',1,'mlx::core::Add::output_shapes()'],['../classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974',1,'mlx::core::ArcCos::output_shapes()'],['../classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b',1,'mlx::core::ArcCosh::output_shapes()'],['../classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5',1,'mlx::core::ArcSin::output_shapes()'],['../classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed',1,'mlx::core::ArcSinh::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a',1,'mlx::core::ArcTan::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03',1,'mlx::core::ArcTan2::output_shapes()'],['../classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8',1,'mlx::core::ArcTanh::output_shapes()'],['../classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64',1,'mlx::core::ArgPartition::output_shapes()'],['../classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179',1,'mlx::core::ArgReduce::output_shapes()'],['../classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859',1,'mlx::core::ArgSort::output_shapes()'],['../classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4',1,'mlx::core::AsType::output_shapes()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599',1,'mlx::core::BitwiseBinary::output_shapes()'],['../classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea',1,'mlx::core::Ceil::output_shapes()'],['../classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c',1,'mlx::core::Compiled::output_
shapes()'],['../classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f',1,'mlx::core::Conjugate::output_shapes()'],['../classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c',1,'mlx::core::Contiguous::output_shapes()'],['../classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3',1,'mlx::core::Copy::output_shapes()'],['../classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b',1,'mlx::core::Cos::output_shapes()'],['../classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962',1,'mlx::core::Cosh::output_shapes()'],['../classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994',1,'mlx::core::Divide::output_shapes()'],['../classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b',1,'mlx::core::DivMod::output_shapes()'],['../classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867',1,'mlx::core::Select::output_shapes()'],['../classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666',1,'mlx::core::Remainder::output_shapes()'],['../classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9',1,'mlx::core::Equal::output_shapes()'],['../classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187',1,'mlx::core::Erf::output_shapes()'],['../classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639',1,'mlx::core::ErfInv::output_shapes()'],['../classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670',1,'mlx::core::Exp::output_shapes()'],['../classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08',1,'mlx::core::Expm1::output_shapes()'],['../classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015',1,'mlx::core::Floor::output_shapes()'],['../classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46',1,'mlx::core::Greater::output_shapes()'],['../classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f',1,'mlx::core::GreaterEqual::output_shapes()'],['../classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb5
6d813b2cde',1,'mlx::core::Hadamard::output_shapes()'],['../classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48',1,'mlx::core::Imag::output_shapes()'],['../classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278',1,'mlx::core::Less::output_shapes()'],['../classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f',1,'mlx::core::LessEqual::output_shapes()'],['../classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d',1,'mlx::core::Log::output_shapes()'],['../classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df',1,'mlx::core::Log1p::output_shapes()'],['../classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c',1,'mlx::core::LogicalNot::output_shapes()'],['../classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617',1,'mlx::core::LogicalAnd::output_shapes()'],['../classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4',1,'mlx::core::LogicalOr::output_shapes()'],['../classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635',1,'mlx::core::LogAddExp::output_shapes()'],['../classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b',1,'mlx::core::Maximum::output_shapes()'],['../classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70',1,'mlx::core::Minimum::output_shapes()'],['../classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061',1,'mlx::core::Multiply::output_shapes()'],['../classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014',1,'mlx::core::Negative::output_shapes()'],['../classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a',1,'mlx::core::NotEqual::output_shapes()'],['../classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8',1,'mlx::core::NumberOfElements::output_shapes()'],['../classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf',1,'mlx::core::Partition::output_shapes()'],['../classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c
1',1,'mlx::core::Power::output_shapes()'],['../classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5',1,'mlx::core::Real::output_shapes()'],['../classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65',1,'mlx::core::Reduce::output_shapes()'],['../classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047',1,'mlx::core::Round::output_shapes()'],['../classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43',1,'mlx::core::Sigmoid::output_shapes()'],['../classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67',1,'mlx::core::Sign::output_shapes()'],['../classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a',1,'mlx::core::Sin::output_shapes()'],['../classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28',1,'mlx::core::Sinh::output_shapes()'],['../classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35',1,'mlx::core::Softmax::output_shapes()'],['../classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d',1,'mlx::core::Sort::output_shapes()'],['../classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02',1,'mlx::core::Square::output_shapes()'],['../classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5',1,'mlx::core::Sqrt::output_shapes()'],['../classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e',1,'mlx::core::StopGradient::output_shapes()'],['../classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc',1,'mlx::core::Subtract::output_shapes()'],['../classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37',1,'mlx::core::Tan::output_shapes()'],['../classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325',1,'mlx::core::Tanh::output_shapes()'],['../classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5',1,'mlx::core::Eigh::output_shapes()']]],
   ['outputs_55',['outputs',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9',1,'mlx::core::metal::DeviceStream::outputs'],['../classmlx_1_1core_1_1array.html#a2c186fd527f984f0589d4183b4976289',1,'mlx::core::array::outputs()'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f',1,'mlx::core::metal::CommandEncoder::outputs()']]],
   ['overwrite_5fdescriptor_56',['overwrite_descriptor',['../classmlx_1_1core_1_1array.html#a95e6b156c8e05439f076b85c05079387',1,'mlx::core::array']]]
 ];
diff --git a/docs/build/html/search/classes_1.js b/docs/build/html/search/classes_1.js
index f2ad7365e..1a29858f0 100644
--- a/docs/build/html/search/classes_1.js
+++ b/docs/build/html/search/classes_1.js
@@ -31,5 +31,6 @@ var searchData=
   ['array_28',['array',['../classmlx_1_1core_1_1array.html',1,'mlx::core']]],
   ['arrayiterator_29',['ArrayIterator',['../structmlx_1_1core_1_1array_1_1_array_iterator.html',1,'mlx::core::array']]],
   ['asstrided_30',['AsStrided',['../classmlx_1_1core_1_1_as_strided.html',1,'mlx::core']]],
-  ['astype_31',['AsType',['../classmlx_1_1core_1_1_as_type.html',1,'mlx::core']]]
+  ['astype_31',['AsType',['../classmlx_1_1core_1_1_as_type.html',1,'mlx::core']]],
+  ['attnparams_32',['AttnParams',['../structmlx_1_1steel_1_1_attn_params.html',1,'mlx::steel']]]
 ];
diff --git a/docs/build/html/search/classes_12.js b/docs/build/html/search/classes_12.js
index d3f29cb20..e06ca618b 100644
--- a/docs/build/html/search/classes_12.js
+++ b/docs/build/html/search/classes_12.js
@@ -7,24 +7,27 @@ var searchData=
   ['scheduler_4',['Scheduler',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html',1,'mlx::core::scheduler']]],
   ['select_5',['Select',['../structmlx_1_1core_1_1detail_1_1_select.html',1,'mlx::core::detail::Select'],['../classmlx_1_1core_1_1_select.html',1,'mlx::core::Select'],['../struct_select.html',1,'Select']]],
   ['send_6',['Send',['../classmlx_1_1core_1_1distributed_1_1_send.html',1,'mlx::core::distributed']]],
-  ['sigmoid_7',['Sigmoid',['../structmlx_1_1core_1_1detail_1_1_sigmoid.html',1,'mlx::core::detail::Sigmoid'],['../classmlx_1_1core_1_1_sigmoid.html',1,'mlx::core::Sigmoid'],['../struct_sigmoid.html',1,'Sigmoid']]],
-  ['sign_8',['Sign',['../structmlx_1_1core_1_1detail_1_1_sign.html',1,'mlx::core::detail::Sign'],['../classmlx_1_1core_1_1_sign.html',1,'mlx::core::Sign'],['../struct_sign.html',1,'Sign']]],
-  ['simple_5fiter_9',['simple_iter',['../classpocketfft_1_1detail_1_1simple__iter.html',1,'pocketfft::detail']]],
-  ['sin_10',['Sin',['../structmlx_1_1core_1_1detail_1_1_sin.html',1,'mlx::core::detail::Sin'],['../classmlx_1_1core_1_1_sin.html',1,'mlx::core::Sin'],['../struct_sin.html',1,'Sin']]],
-  ['sincos_5f2pibyn_11',['sincos_2pibyn',['../classpocketfft_1_1detail_1_1sincos__2pibyn.html',1,'pocketfft::detail']]],
-  ['sinh_12',['Sinh',['../structmlx_1_1core_1_1detail_1_1_sinh.html',1,'mlx::core::detail::Sinh'],['../classmlx_1_1core_1_1_sinh.html',1,'mlx::core::Sinh'],['../struct_sinh.html',1,'Sinh']]],
-  ['slice_13',['Slice',['../classmlx_1_1core_1_1_slice.html',1,'mlx::core']]],
-  ['sliceupdate_14',['SliceUpdate',['../classmlx_1_1core_1_1_slice_update.html',1,'mlx::core']]],
-  ['softmax_15',['Softmax',['../classmlx_1_1core_1_1_softmax.html',1,'mlx::core']]],
-  ['sort_16',['Sort',['../classmlx_1_1core_1_1_sort.html',1,'mlx::core']]],
-  ['split_17',['Split',['../classmlx_1_1core_1_1_split.html',1,'mlx::core']]],
-  ['sqrt_18',['Sqrt',['../structmlx_1_1core_1_1detail_1_1_sqrt.html',1,'mlx::core::detail::Sqrt'],['../classmlx_1_1core_1_1_sqrt.html',1,'mlx::core::Sqrt'],['../struct_sqrt.html',1,'Sqrt']]],
-  ['square_19',['Square',['../structmlx_1_1core_1_1detail_1_1_square.html',1,'mlx::core::detail::Square'],['../classmlx_1_1core_1_1_square.html',1,'mlx::core::Square'],['../struct_square.html',1,'Square']]],
-  ['stopgradient_20',['StopGradient',['../classmlx_1_1core_1_1_stop_gradient.html',1,'mlx::core']]],
-  ['stream_21',['Stream',['../structmlx_1_1core_1_1_stream.html',1,'mlx::core']]],
-  ['streamcontext_22',['StreamContext',['../structmlx_1_1core_1_1_stream_context.html',1,'mlx::core']]],
-  ['streamthread_23',['StreamThread',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html',1,'mlx::core::scheduler']]],
-  ['subtract_24',['Subtract',['../structmlx_1_1core_1_1detail_1_1_subtract.html',1,'mlx::core::detail::Subtract'],['../classmlx_1_1core_1_1_subtract.html',1,'mlx::core::Subtract'],['../struct_subtract.html',1,'Subtract']]],
-  ['sum_25',['Sum',['../struct_sum.html',1,'']]],
-  ['svd_26',['SVD',['../classmlx_1_1core_1_1_s_v_d.html',1,'mlx::core']]]
+  ['shape2d_7',['Shape2D',['../structmlx_1_1steel_1_1_shape2_d.html',1,'mlx::steel']]],
+  ['sigmoid_8',['Sigmoid',['../structmlx_1_1core_1_1detail_1_1_sigmoid.html',1,'mlx::core::detail::Sigmoid'],['../classmlx_1_1core_1_1_sigmoid.html',1,'mlx::core::Sigmoid'],['../struct_sigmoid.html',1,'Sigmoid']]],
+  ['sign_9',['Sign',['../structmlx_1_1core_1_1detail_1_1_sign.html',1,'mlx::core::detail::Sign'],['../classmlx_1_1core_1_1_sign.html',1,'mlx::core::Sign'],['../struct_sign.html',1,'Sign']]],
+  ['simple_5fiter_10',['simple_iter',['../classpocketfft_1_1detail_1_1simple__iter.html',1,'pocketfft::detail']]],
+  ['sin_11',['Sin',['../structmlx_1_1core_1_1detail_1_1_sin.html',1,'mlx::core::detail::Sin'],['../classmlx_1_1core_1_1_sin.html',1,'mlx::core::Sin'],['../struct_sin.html',1,'Sin']]],
+  ['sincos_5f2pibyn_12',['sincos_2pibyn',['../classpocketfft_1_1detail_1_1sincos__2pibyn.html',1,'pocketfft::detail']]],
+  ['sinh_13',['Sinh',['../structmlx_1_1core_1_1detail_1_1_sinh.html',1,'mlx::core::detail::Sinh'],['../classmlx_1_1core_1_1_sinh.html',1,'mlx::core::Sinh'],['../struct_sinh.html',1,'Sinh']]],
+  ['slice_14',['Slice',['../classmlx_1_1core_1_1_slice.html',1,'mlx::core']]],
+  ['sliceupdate_15',['SliceUpdate',['../classmlx_1_1core_1_1_slice_update.html',1,'mlx::core']]],
+  ['softmax_16',['Softmax',['../classmlx_1_1core_1_1_softmax.html',1,'mlx::core']]],
+  ['sort_17',['Sort',['../classmlx_1_1core_1_1_sort.html',1,'mlx::core']]],
+  ['split_18',['Split',['../classmlx_1_1core_1_1_split.html',1,'mlx::core']]],
+  ['sqrt_19',['Sqrt',['../structmlx_1_1core_1_1detail_1_1_sqrt.html',1,'mlx::core::detail::Sqrt'],['../classmlx_1_1core_1_1_sqrt.html',1,'mlx::core::Sqrt'],['../struct_sqrt.html',1,'Sqrt']]],
+  ['square_20',['Square',['../structmlx_1_1core_1_1detail_1_1_square.html',1,'mlx::core::detail::Square'],['../classmlx_1_1core_1_1_square.html',1,'mlx::core::Square'],['../struct_square.html',1,'Square']]],
+  ['stopgradient_21',['StopGradient',['../classmlx_1_1core_1_1_stop_gradient.html',1,'mlx::core']]],
+  ['stream_22',['Stream',['../structmlx_1_1core_1_1_stream.html',1,'mlx::core']]],
+  ['streamcontext_23',['StreamContext',['../structmlx_1_1core_1_1_stream_context.html',1,'mlx::core']]],
+  ['streamthread_24',['StreamThread',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html',1,'mlx::core::scheduler']]],
+  ['subop_25',['SubOp',['../struct_sub_op.html',1,'']]],
+  ['subtract_26',['Subtract',['../structmlx_1_1core_1_1detail_1_1_subtract.html',1,'mlx::core::detail::Subtract'],['../classmlx_1_1core_1_1_subtract.html',1,'mlx::core::Subtract'],['../struct_subtract.html',1,'Subtract']]],
+  ['sum_27',['Sum',['../struct_sum.html',1,'']]],
+  ['sumop_28',['SumOp',['../struct_sum_op.html',1,'']]],
+  ['svd_29',['SVD',['../classmlx_1_1core_1_1_s_v_d.html',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/classes_13.js b/docs/build/html/search/classes_13.js
index a256f73b1..5de3436bf 100644
--- a/docs/build/html/search/classes_13.js
+++ b/docs/build/html/search/classes_13.js
@@ -12,6 +12,7 @@ var searchData=
   ['transformadd_9',['TransformAdd',['../structmlx_1_1steel_1_1_transform_add.html',1,'mlx::steel']]],
   ['transformaxpby_10',['TransformAxpby',['../structmlx_1_1steel_1_1_transform_axpby.html',1,'mlx::steel']]],
   ['transformnone_11',['TransformNone',['../structmlx_1_1steel_1_1_transform_none.html',1,'mlx::steel']]],
-  ['transpose_12',['Transpose',['../classmlx_1_1core_1_1_transpose.html',1,'mlx::core']]],
-  ['typetodtype_13',['TypeToDtype',['../structmlx_1_1core_1_1_type_to_dtype.html',1,'mlx::core']]]
+  ['transformscale_12',['TransformScale',['../struct_transform_scale.html',1,'']]],
+  ['transpose_13',['Transpose',['../classmlx_1_1core_1_1_transpose.html',1,'mlx::core']]],
+  ['typetodtype_14',['TypeToDtype',['../structmlx_1_1core_1_1_type_to_dtype.html',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/classes_2.js b/docs/build/html/search/classes_2.js
index 61d4233f0..d6904d06d 100644
--- a/docs/build/html/search/classes_2.js
+++ b/docs/build/html/search/classes_2.js
@@ -8,11 +8,12 @@ var searchData=
   ['bitwiseor_5',['BitwiseOr',['../struct_bitwise_or.html',1,'BitwiseOr'],['../structmlx_1_1core_1_1detail_1_1_bitwise_or.html',1,'mlx::core::detail::BitwiseOr']]],
   ['bitwisexor_6',['BitwiseXor',['../struct_bitwise_xor.html',1,'BitwiseXor'],['../structmlx_1_1core_1_1detail_1_1_bitwise_xor.html',1,'mlx::core::detail::BitwiseXor']]],
   ['blockloader_7',['BlockLoader',['../structmlx_1_1steel_1_1_block_loader.html',1,'mlx::steel']]],
-  ['blockmaskedmm_8',['BlockMaskedMM',['../classmlx_1_1core_1_1_block_masked_m_m.html',1,'mlx::core']]],
-  ['blockmergesort_9',['BlockMergeSort',['../struct_block_merge_sort.html',1,'']]],
-  ['blockmma_10',['BlockMMA',['../structmlx_1_1steel_1_1_block_m_m_a.html',1,'mlx::steel']]],
-  ['blockswizzle_11',['BlockSwizzle',['../structmlx_1_1steel_1_1_block_swizzle.html',1,'mlx::steel']]],
-  ['bool4_5for_5fuint_12',['bool4_or_uint',['../unionbool4__or__uint.html',1,'']]],
-  ['broadcast_13',['Broadcast',['../classmlx_1_1core_1_1_broadcast.html',1,'mlx::core']]],
-  ['buffer_14',['Buffer',['../classmlx_1_1core_1_1allocator_1_1_buffer.html',1,'mlx::core::allocator']]]
+  ['blockloadert_8',['BlockLoaderT',['../structmlx_1_1steel_1_1_block_loader_t.html',1,'mlx::steel']]],
+  ['blockmaskedmm_9',['BlockMaskedMM',['../classmlx_1_1core_1_1_block_masked_m_m.html',1,'mlx::core']]],
+  ['blockmergesort_10',['BlockMergeSort',['../struct_block_merge_sort.html',1,'']]],
+  ['blockmma_11',['BlockMMA',['../structmlx_1_1steel_1_1_block_m_m_a.html',1,'mlx::steel']]],
+  ['blockswizzle_12',['BlockSwizzle',['../structmlx_1_1steel_1_1_block_swizzle.html',1,'mlx::steel']]],
+  ['bool4_5for_5fuint_13',['bool4_or_uint',['../unionbool4__or__uint.html',1,'']]],
+  ['broadcast_14',['Broadcast',['../classmlx_1_1core_1_1_broadcast.html',1,'mlx::core']]],
+  ['buffer_15',['Buffer',['../classmlx_1_1core_1_1allocator_1_1_buffer.html',1,'mlx::core::allocator']]]
 ];
diff --git a/docs/build/html/search/classes_3.js b/docs/build/html/search/classes_3.js
index 102a52448..1dbd6b015 100644
--- a/docs/build/html/search/classes_3.js
+++ b/docs/build/html/search/classes_3.js
@@ -22,27 +22,29 @@ var searchData=
   ['concurrent_5fqueue_3c_20std_3a_3afunction_3c_20void_28_29_3e_20_3e_19',['concurrent_queue&lt; std::function&lt; void()&gt; &gt;',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html',1,'pocketfft::detail::threading']]],
   ['concurrentcontext_20',['ConcurrentContext',['../structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html',1,'mlx::core::metal::CommandEncoder']]],
   ['conjugate_21',['Conjugate',['../struct_conjugate.html',1,'Conjugate'],['../classmlx_1_1core_1_1_conjugate.html',1,'mlx::core::Conjugate'],['../structmlx_1_1core_1_1detail_1_1_conjugate.html',1,'mlx::core::detail::Conjugate']]],
-  ['contiguousiterator_22',['ContiguousIterator',['../structmlx_1_1core_1_1_contiguous_iterator.html',1,'mlx::core']]],
-  ['conv2dgeneralbaseinfo_23',['Conv2DGeneralBaseInfo',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html',1,'mlx::steel']]],
-  ['conv2dgeneraljumpparams_24',['Conv2DGeneralJumpParams',['../structmlx_1_1steel_1_1_conv2_d_general_jump_params.html',1,'mlx::steel']]],
-  ['conv2dinputblockloadergeneral_25',['Conv2DInputBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html',1,'mlx::steel']]],
-  ['conv2dinputblockloaderlargefilter_26',['Conv2DInputBlockLoaderLargeFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html',1,'mlx::steel']]],
-  ['conv2dinputblockloadersmallchannels_27',['Conv2DInputBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html',1,'mlx::steel']]],
-  ['conv2dinputblockloadersmallfilter_28',['Conv2DInputBlockLoaderSmallFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html',1,'mlx::steel']]],
-  ['conv2dweightblockloader_29',['Conv2DWeightBlockLoader',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html',1,'mlx::steel']]],
-  ['conv2dweightblockloadergeneral_30',['Conv2DWeightBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html',1,'mlx::steel']]],
-  ['conv2dweightblockloadersmallchannels_31',['Conv2DWeightBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html',1,'mlx::steel']]],
-  ['convolution_32',['Convolution',['../classmlx_1_1core_1_1_convolution.html',1,'mlx::core']]],
-  ['copy_33',['Copy',['../classmlx_1_1core_1_1_copy.html',1,'mlx::core']]],
-  ['cos_34',['Cos',['../struct_cos.html',1,'Cos'],['../classmlx_1_1core_1_1_cos.html',1,'mlx::core::Cos'],['../structmlx_1_1core_1_1detail_1_1_cos.html',1,'mlx::core::detail::Cos']]],
-  ['cosh_35',['Cosh',['../struct_cosh.html',1,'Cosh'],['../classmlx_1_1core_1_1_cosh.html',1,'mlx::core::Cosh'],['../structmlx_1_1core_1_1detail_1_1_cosh.html',1,'mlx::core::detail::Cosh']]],
-  ['cummax_36',['CumMax',['../struct_cum_max.html',1,'']]],
-  ['cummin_37',['CumMin',['../struct_cum_min.html',1,'']]],
-  ['cumprod_38',['CumProd',['../struct_cum_prod.html',1,'']]],
-  ['cumprod_3c_20bool_20_3e_39',['CumProd&lt; bool &gt;',['../struct_cum_prod_3_01bool_01_4.html',1,'']]],
-  ['cumsum_40',['CumSum',['../struct_cum_sum.html',1,'']]],
-  ['custom_41',['Custom',['../classmlx_1_1core_1_1fast_1_1_custom.html',1,'mlx::core::fast']]],
-  ['customkernel_42',['CustomKernel',['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html',1,'mlx::core::fast']]],
-  ['customkernelshapeinfo_43',['CustomKernelShapeInfo',['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html',1,'mlx::core::fast']]],
-  ['customtransforms_44',['CustomTransforms',['../classmlx_1_1core_1_1_custom_transforms.html',1,'mlx::core']]]
+  ['contiguous_22',['Contiguous',['../classmlx_1_1core_1_1_contiguous.html',1,'mlx::core']]],
+  ['contiguousiterator_23',['ContiguousIterator',['../structmlx_1_1core_1_1_contiguous_iterator.html',1,'mlx::core']]],
+  ['conv2dgeneralbaseinfo_24',['Conv2DGeneralBaseInfo',['../structmlx_1_1steel_1_1_conv2_d_general_base_info.html',1,'mlx::steel']]],
+  ['conv2dgeneraljumpparams_25',['Conv2DGeneralJumpParams',['../structmlx_1_1steel_1_1_conv2_d_general_jump_params.html',1,'mlx::steel']]],
+  ['conv2dinputblockloadergeneral_26',['Conv2DInputBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html',1,'mlx::steel']]],
+  ['conv2dinputblockloaderlargefilter_27',['Conv2DInputBlockLoaderLargeFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html',1,'mlx::steel']]],
+  ['conv2dinputblockloadersmallchannels_28',['Conv2DInputBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html',1,'mlx::steel']]],
+  ['conv2dinputblockloadersmallfilter_29',['Conv2DInputBlockLoaderSmallFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html',1,'mlx::steel']]],
+  ['conv2dweightblockloader_30',['Conv2DWeightBlockLoader',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html',1,'mlx::steel']]],
+  ['conv2dweightblockloadergeneral_31',['Conv2DWeightBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html',1,'mlx::steel']]],
+  ['conv2dweightblockloadersmallchannels_32',['Conv2DWeightBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html',1,'mlx::steel']]],
+  ['convolution_33',['Convolution',['../classmlx_1_1core_1_1_convolution.html',1,'mlx::core']]],
+  ['copy_34',['Copy',['../classmlx_1_1core_1_1_copy.html',1,'mlx::core']]],
+  ['cos_35',['Cos',['../struct_cos.html',1,'Cos'],['../classmlx_1_1core_1_1_cos.html',1,'mlx::core::Cos'],['../structmlx_1_1core_1_1detail_1_1_cos.html',1,'mlx::core::detail::Cos']]],
+  ['cosh_36',['Cosh',['../struct_cosh.html',1,'Cosh'],['../classmlx_1_1core_1_1_cosh.html',1,'mlx::core::Cosh'],['../structmlx_1_1core_1_1detail_1_1_cosh.html',1,'mlx::core::detail::Cosh']]],
+  ['cshape_37',['CShape',['../structmlx_1_1steel_1_1_c_shape.html',1,'mlx::steel']]],
+  ['cummax_38',['CumMax',['../struct_cum_max.html',1,'']]],
+  ['cummin_39',['CumMin',['../struct_cum_min.html',1,'']]],
+  ['cumprod_40',['CumProd',['../struct_cum_prod.html',1,'']]],
+  ['cumprod_3c_20bool_20_3e_41',['CumProd&lt; bool &gt;',['../struct_cum_prod_3_01bool_01_4.html',1,'']]],
+  ['cumsum_42',['CumSum',['../struct_cum_sum.html',1,'']]],
+  ['custom_43',['Custom',['../classmlx_1_1core_1_1fast_1_1_custom.html',1,'mlx::core::fast']]],
+  ['customkernel_44',['CustomKernel',['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html',1,'mlx::core::fast']]],
+  ['customkernelshapeinfo_45',['CustomKernelShapeInfo',['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html',1,'mlx::core::fast']]],
+  ['customtransforms_46',['CustomTransforms',['../classmlx_1_1core_1_1_custom_transforms.html',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/classes_4.js b/docs/build/html/search/classes_4.js
index 27b04fa53..5f3a1df7f 100644
--- a/docs/build/html/search/classes_4.js
+++ b/docs/build/html/search/classes_4.js
@@ -9,5 +9,6 @@ var searchData=
   ['distprimitive_6',['DistPrimitive',['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html',1,'mlx::core::distributed']]],
   ['divide_7',['Divide',['../struct_divide.html',1,'Divide'],['../structmlx_1_1core_1_1detail_1_1_divide.html',1,'mlx::core::detail::Divide'],['../classmlx_1_1core_1_1_divide.html',1,'mlx::core::Divide']]],
   ['divmod_8',['DivMod',['../struct_div_mod.html',1,'DivMod'],['../classmlx_1_1core_1_1_div_mod.html',1,'mlx::core::DivMod']]],
-  ['dtype_9',['Dtype',['../structmlx_1_1core_1_1_dtype.html',1,'mlx::core']]]
+  ['divop_9',['DivOp',['../struct_div_op.html',1,'']]],
+  ['dtype_10',['Dtype',['../structmlx_1_1core_1_1_dtype.html',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/classes_5.js b/docs/build/html/search/classes_5.js
index bc95defd4..e98deee58 100644
--- a/docs/build/html/search/classes_5.js
+++ b/docs/build/html/search/classes_5.js
@@ -10,5 +10,6 @@ var searchData=
   ['exechartley_7',['ExecHartley',['../structpocketfft_1_1detail_1_1_exec_hartley.html',1,'pocketfft::detail']]],
   ['execr2r_8',['ExecR2R',['../structpocketfft_1_1detail_1_1_exec_r2_r.html',1,'pocketfft::detail']]],
   ['exp_9',['Exp',['../struct_exp.html',1,'Exp'],['../structmlx_1_1core_1_1detail_1_1_exp.html',1,'mlx::core::detail::Exp'],['../classmlx_1_1core_1_1_exp.html',1,'mlx::core::Exp']]],
-  ['expm1_10',['Expm1',['../struct_expm1.html',1,'Expm1'],['../structmlx_1_1core_1_1detail_1_1_expm1.html',1,'mlx::core::detail::Expm1'],['../classmlx_1_1core_1_1_expm1.html',1,'mlx::core::Expm1']]]
+  ['expm1_10',['Expm1',['../struct_expm1.html',1,'Expm1'],['../structmlx_1_1core_1_1detail_1_1_expm1.html',1,'mlx::core::detail::Expm1'],['../classmlx_1_1core_1_1_expm1.html',1,'mlx::core::Expm1']]],
+  ['expsubop_11',['ExpSubOp',['../struct_exp_sub_op.html',1,'']]]
 ];
diff --git a/docs/build/html/search/classes_b.js b/docs/build/html/search/classes_b.js
index 07d520fdd..0a104128e 100644
--- a/docs/build/html/search/classes_b.js
+++ b/docs/build/html/search/classes_b.js
@@ -3,35 +3,36 @@ var searchData=
   ['latch_0',['latch',['../classpocketfft_1_1detail_1_1threading_1_1latch.html',1,'pocketfft::detail::threading']]],
   ['layernorm_1',['LayerNorm',['../classmlx_1_1core_1_1fast_1_1_layer_norm.html',1,'mlx::core::fast']]],
   ['layernormvjp_2',['LayerNormVJP',['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html',1,'mlx::core::fast']]],
-  ['leftshift_3',['LeftShift',['../struct_left_shift.html',1,'LeftShift'],['../structmlx_1_1core_1_1detail_1_1_left_shift.html',1,'mlx::core::detail::LeftShift']]],
-  ['less_4',['Less',['../struct_less.html',1,'Less'],['../structmlx_1_1core_1_1detail_1_1_less.html',1,'mlx::core::detail::Less'],['../classmlx_1_1core_1_1_less.html',1,'mlx::core::Less']]],
-  ['lessequal_5',['LessEqual',['../struct_less_equal.html',1,'LessEqual'],['../structmlx_1_1core_1_1detail_1_1_less_equal.html',1,'mlx::core::detail::LessEqual'],['../classmlx_1_1core_1_1_less_equal.html',1,'mlx::core::LessEqual']]],
-  ['lessthan_6',['LessThan',['../struct_less_than.html',1,'']]],
-  ['limits_7',['Limits',['../struct_limits.html',1,'']]],
-  ['limits_3c_20bfloat16_5ft_20_3e_8',['Limits&lt; bfloat16_t &gt;',['../struct_limits_3_01bfloat16__t_01_4.html',1,'']]],
-  ['limits_3c_20bool_20_3e_9',['Limits&lt; bool &gt;',['../struct_limits_3_01bool_01_4.html',1,'']]],
-  ['limits_3c_20complex64_5ft_20_3e_10',['Limits&lt; complex64_t &gt;',['../struct_limits_3_01complex64__t_01_4.html',1,'']]],
-  ['limits_3c_20float_20_3e_11',['Limits&lt; float &gt;',['../struct_limits_3_01float_01_4.html',1,'']]],
-  ['limits_3c_20half_20_3e_12',['Limits&lt; half &gt;',['../struct_limits_3_01half_01_4.html',1,'']]],
-  ['limits_3c_20int16_5ft_20_3e_13',['Limits&lt; int16_t &gt;',['../struct_limits_3_01int16__t_01_4.html',1,'']]],
-  ['limits_3c_20int32_5ft_20_3e_14',['Limits&lt; int32_t &gt;',['../struct_limits_3_01int32__t_01_4.html',1,'']]],
-  ['limits_3c_20int64_5ft_20_3e_15',['Limits&lt; int64_t &gt;',['../struct_limits_3_01int64__t_01_4.html',1,'']]],
-  ['limits_3c_20int8_5ft_20_3e_16',['Limits&lt; int8_t &gt;',['../struct_limits_3_01int8__t_01_4.html',1,'']]],
-  ['limits_3c_20uint16_5ft_20_3e_17',['Limits&lt; uint16_t &gt;',['../struct_limits_3_01uint16__t_01_4.html',1,'']]],
-  ['limits_3c_20uint32_5ft_20_3e_18',['Limits&lt; uint32_t &gt;',['../struct_limits_3_01uint32__t_01_4.html',1,'']]],
-  ['limits_3c_20uint64_5ft_20_3e_19',['Limits&lt; uint64_t &gt;',['../struct_limits_3_01uint64__t_01_4.html',1,'']]],
-  ['limits_3c_20uint8_5ft_20_3e_20',['Limits&lt; uint8_t &gt;',['../struct_limits_3_01uint8__t_01_4.html',1,'']]],
-  ['load_21',['Load',['../classmlx_1_1core_1_1_load.html',1,'mlx::core']]],
-  ['log_22',['Log',['../struct_log.html',1,'Log'],['../structmlx_1_1core_1_1detail_1_1_log.html',1,'mlx::core::detail::Log'],['../classmlx_1_1core_1_1_log.html',1,'mlx::core::Log']]],
-  ['log10_23',['Log10',['../struct_log10.html',1,'Log10'],['../structmlx_1_1core_1_1detail_1_1_log10.html',1,'mlx::core::detail::Log10']]],
-  ['log1p_24',['Log1p',['../struct_log1p.html',1,'Log1p'],['../structmlx_1_1core_1_1detail_1_1_log1p.html',1,'mlx::core::detail::Log1p'],['../classmlx_1_1core_1_1_log1p.html',1,'mlx::core::Log1p']]],
-  ['log2_25',['Log2',['../struct_log2.html',1,'Log2'],['../structmlx_1_1core_1_1detail_1_1_log2.html',1,'mlx::core::detail::Log2']]],
-  ['logaddexp_26',['LogAddExp',['../struct_log_add_exp.html',1,'LogAddExp'],['../structmlx_1_1core_1_1detail_1_1_log_add_exp.html',1,'mlx::core::detail::LogAddExp'],['../classmlx_1_1core_1_1_log_add_exp.html',1,'mlx::core::LogAddExp']]],
-  ['logicaland_27',['LogicalAnd',['../struct_logical_and.html',1,'LogicalAnd'],['../structmlx_1_1core_1_1detail_1_1_logical_and.html',1,'mlx::core::detail::LogicalAnd'],['../classmlx_1_1core_1_1_logical_and.html',1,'mlx::core::LogicalAnd']]],
-  ['logicalnot_28',['LogicalNot',['../struct_logical_not.html',1,'LogicalNot'],['../structmlx_1_1core_1_1detail_1_1_logical_not.html',1,'mlx::core::detail::LogicalNot'],['../classmlx_1_1core_1_1_logical_not.html',1,'mlx::core::LogicalNot']]],
-  ['logicalor_29',['LogicalOr',['../struct_logical_or.html',1,'LogicalOr'],['../structmlx_1_1core_1_1detail_1_1_logical_or.html',1,'mlx::core::detail::LogicalOr'],['../classmlx_1_1core_1_1_logical_or.html',1,'mlx::core::LogicalOr']]],
-  ['loopalignment_30',['LoopAlignment',['../structmlx_1_1steel_1_1_loop_alignment.html',1,'mlx::steel']]],
-  ['looped_5felem_5fto_5floc_31',['looped_elem_to_loc',['../structlooped__elem__to__loc.html',1,'']]],
-  ['looped_5felem_5fto_5floc_3c_200_2c_20offset_5ft_20_3e_32',['looped_elem_to_loc&lt; 0, offset_t &gt;',['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html',1,'']]],
-  ['looped_5felem_5fto_5floc_3c_201_2c_20offset_5ft_20_3e_33',['looped_elem_to_loc&lt; 1, offset_t &gt;',['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html',1,'']]]
+  ['layout2d_3',['Layout2D',['../structmlx_1_1steel_1_1_layout2_d.html',1,'mlx::steel']]],
+  ['leftshift_4',['LeftShift',['../struct_left_shift.html',1,'LeftShift'],['../structmlx_1_1core_1_1detail_1_1_left_shift.html',1,'mlx::core::detail::LeftShift']]],
+  ['less_5',['Less',['../struct_less.html',1,'Less'],['../structmlx_1_1core_1_1detail_1_1_less.html',1,'mlx::core::detail::Less'],['../classmlx_1_1core_1_1_less.html',1,'mlx::core::Less']]],
+  ['lessequal_6',['LessEqual',['../struct_less_equal.html',1,'LessEqual'],['../structmlx_1_1core_1_1detail_1_1_less_equal.html',1,'mlx::core::detail::LessEqual'],['../classmlx_1_1core_1_1_less_equal.html',1,'mlx::core::LessEqual']]],
+  ['lessthan_7',['LessThan',['../struct_less_than.html',1,'']]],
+  ['limits_8',['Limits',['../struct_limits.html',1,'']]],
+  ['limits_3c_20bfloat16_5ft_20_3e_9',['Limits&lt; bfloat16_t &gt;',['../struct_limits_3_01bfloat16__t_01_4.html',1,'']]],
+  ['limits_3c_20bool_20_3e_10',['Limits&lt; bool &gt;',['../struct_limits_3_01bool_01_4.html',1,'']]],
+  ['limits_3c_20complex64_5ft_20_3e_11',['Limits&lt; complex64_t &gt;',['../struct_limits_3_01complex64__t_01_4.html',1,'']]],
+  ['limits_3c_20float_20_3e_12',['Limits&lt; float &gt;',['../struct_limits_3_01float_01_4.html',1,'']]],
+  ['limits_3c_20half_20_3e_13',['Limits&lt; half &gt;',['../struct_limits_3_01half_01_4.html',1,'']]],
+  ['limits_3c_20int16_5ft_20_3e_14',['Limits&lt; int16_t &gt;',['../struct_limits_3_01int16__t_01_4.html',1,'']]],
+  ['limits_3c_20int32_5ft_20_3e_15',['Limits&lt; int32_t &gt;',['../struct_limits_3_01int32__t_01_4.html',1,'']]],
+  ['limits_3c_20int64_5ft_20_3e_16',['Limits&lt; int64_t &gt;',['../struct_limits_3_01int64__t_01_4.html',1,'']]],
+  ['limits_3c_20int8_5ft_20_3e_17',['Limits&lt; int8_t &gt;',['../struct_limits_3_01int8__t_01_4.html',1,'']]],
+  ['limits_3c_20uint16_5ft_20_3e_18',['Limits&lt; uint16_t &gt;',['../struct_limits_3_01uint16__t_01_4.html',1,'']]],
+  ['limits_3c_20uint32_5ft_20_3e_19',['Limits&lt; uint32_t &gt;',['../struct_limits_3_01uint32__t_01_4.html',1,'']]],
+  ['limits_3c_20uint64_5ft_20_3e_20',['Limits&lt; uint64_t &gt;',['../struct_limits_3_01uint64__t_01_4.html',1,'']]],
+  ['limits_3c_20uint8_5ft_20_3e_21',['Limits&lt; uint8_t &gt;',['../struct_limits_3_01uint8__t_01_4.html',1,'']]],
+  ['load_22',['Load',['../classmlx_1_1core_1_1_load.html',1,'mlx::core']]],
+  ['log_23',['Log',['../struct_log.html',1,'Log'],['../structmlx_1_1core_1_1detail_1_1_log.html',1,'mlx::core::detail::Log'],['../classmlx_1_1core_1_1_log.html',1,'mlx::core::Log']]],
+  ['log10_24',['Log10',['../struct_log10.html',1,'Log10'],['../structmlx_1_1core_1_1detail_1_1_log10.html',1,'mlx::core::detail::Log10']]],
+  ['log1p_25',['Log1p',['../struct_log1p.html',1,'Log1p'],['../structmlx_1_1core_1_1detail_1_1_log1p.html',1,'mlx::core::detail::Log1p'],['../classmlx_1_1core_1_1_log1p.html',1,'mlx::core::Log1p']]],
+  ['log2_26',['Log2',['../struct_log2.html',1,'Log2'],['../structmlx_1_1core_1_1detail_1_1_log2.html',1,'mlx::core::detail::Log2']]],
+  ['logaddexp_27',['LogAddExp',['../struct_log_add_exp.html',1,'LogAddExp'],['../structmlx_1_1core_1_1detail_1_1_log_add_exp.html',1,'mlx::core::detail::LogAddExp'],['../classmlx_1_1core_1_1_log_add_exp.html',1,'mlx::core::LogAddExp']]],
+  ['logicaland_28',['LogicalAnd',['../struct_logical_and.html',1,'LogicalAnd'],['../structmlx_1_1core_1_1detail_1_1_logical_and.html',1,'mlx::core::detail::LogicalAnd'],['../classmlx_1_1core_1_1_logical_and.html',1,'mlx::core::LogicalAnd']]],
+  ['logicalnot_29',['LogicalNot',['../struct_logical_not.html',1,'LogicalNot'],['../structmlx_1_1core_1_1detail_1_1_logical_not.html',1,'mlx::core::detail::LogicalNot'],['../classmlx_1_1core_1_1_logical_not.html',1,'mlx::core::LogicalNot']]],
+  ['logicalor_30',['LogicalOr',['../struct_logical_or.html',1,'LogicalOr'],['../structmlx_1_1core_1_1detail_1_1_logical_or.html',1,'mlx::core::detail::LogicalOr'],['../classmlx_1_1core_1_1_logical_or.html',1,'mlx::core::LogicalOr']]],
+  ['loopalignment_31',['LoopAlignment',['../structmlx_1_1steel_1_1_loop_alignment.html',1,'mlx::steel']]],
+  ['loopedelemtoloc_32',['LoopedElemToLoc',['../struct_looped_elem_to_loc.html',1,'']]],
+  ['loopedelemtoloc_3c_201_2c_20offsett_2c_20false_20_3e_33',['LoopedElemToLoc&lt; 1, OffsetT, false &gt;',['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html',1,'']]],
+  ['loopedelemtoloc_3c_201_2c_20offsett_2c_20true_20_3e_34',['LoopedElemToLoc&lt; 1, OffsetT, true &gt;',['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html',1,'']]]
 ];
diff --git a/docs/build/html/search/classes_c.js b/docs/build/html/search/classes_c.js
index 4fa74563a..e52c9fe34 100644
--- a/docs/build/html/search/classes_c.js
+++ b/docs/build/html/search/classes_c.js
@@ -4,19 +4,19 @@ var searchData=
   ['matmul_1',['Matmul',['../classmlx_1_1core_1_1_matmul.html',1,'mlx::core']]],
   ['max_2',['Max',['../struct_max.html',1,'']]],
   ['maximum_3',['Maximum',['../struct_maximum.html',1,'Maximum'],['../structmlx_1_1core_1_1detail_1_1_maximum.html',1,'mlx::core::detail::Maximum'],['../classmlx_1_1core_1_1_maximum.html',1,'mlx::core::Maximum']]],
-  ['metalallocator_4',['MetalAllocator',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html',1,'mlx::core::metal']]],
-  ['min_5',['Min',['../struct_min.html',1,'']]],
-  ['minimum_6',['Minimum',['../struct_minimum.html',1,'Minimum'],['../structmlx_1_1core_1_1detail_1_1_minimum.html',1,'mlx::core::detail::Minimum'],['../classmlx_1_1core_1_1_minimum.html',1,'mlx::core::Minimum']]],
-  ['mlx_5fatomic_7',['mlx_atomic',['../structmlx__atomic.html',1,'']]],
-  ['mlx_5fatomic_3c_20t_2c_20enable_5fif_5ft_3c_20is_5fmetal_5fatomic_3c_20t_20_3e_20_3e_20_3e_8',['mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;',['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html',1,'']]],
-  ['mlxconvparams_9',['MLXConvParams',['../struct_m_l_x_conv_params.html',1,'']]],
-  ['mlxconvparams_3c_202_20_3e_10',['MLXConvParams&lt; 2 &gt;',['../struct_m_l_x_conv_params.html',1,'']]],
-  ['mlxfastattentionparams_11',['MLXFastAttentionParams',['../struct_m_l_x_fast_attention_params.html',1,'']]],
-  ['mlxscaleddotproductattentionparams_12',['MLXScaledDotProductAttentionParams',['../struct_m_l_x_scaled_dot_product_attention_params.html',1,'']]],
-  ['mmatile_13',['MMATile',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
-  ['mmatile_3c_20float_2c_201_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_14',['MMATile&lt; float, 1, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
-  ['mmatile_3c_20float_2c_20tm_2c_201_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_15',['MMATile&lt; float, TM, 1, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
-  ['mmatile_3c_20float_2c_20tm_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_16',['MMATile&lt; float, TM, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['maxop_4',['MaxOp',['../struct_max_op.html',1,'']]],
+  ['metalallocator_5',['MetalAllocator',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html',1,'mlx::core::metal']]],
+  ['min_6',['Min',['../struct_min.html',1,'']]],
+  ['minimum_7',['Minimum',['../struct_minimum.html',1,'Minimum'],['../structmlx_1_1core_1_1detail_1_1_minimum.html',1,'mlx::core::detail::Minimum'],['../classmlx_1_1core_1_1_minimum.html',1,'mlx::core::Minimum']]],
+  ['mlx_5fatomic_8',['mlx_atomic',['../structmlx__atomic.html',1,'']]],
+  ['mlx_5fatomic_3c_20t_2c_20enable_5fif_5ft_3c_20is_5fmetal_5fatomic_3c_20t_20_3e_20_3e_20_3e_9',['mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;',['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html',1,'']]],
+  ['mlxconvparams_10',['MLXConvParams',['../struct_m_l_x_conv_params.html',1,'']]],
+  ['mlxconvparams_3c_202_20_3e_11',['MLXConvParams&lt; 2 &gt;',['../struct_m_l_x_conv_params.html',1,'']]],
+  ['mmatile_12',['MMATile',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['mmatile_3c_20float_2c_201_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_13',['MMATile&lt; float, 1, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['mmatile_3c_20float_2c_20tm_2c_201_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_14',['MMATile&lt; float, TM, 1, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['mmatile_3c_20float_2c_20tm_2c_20tn_2c_20mlx_3a_3asteel_3a_3abasemmafrag_20_3e_15',['MMATile&lt; float, TM, TN, mlx::steel::BaseMMAFrag &gt;',['../structmlx_1_1steel_1_1_m_m_a_tile.html',1,'mlx::steel']]],
+  ['mulop_16',['MulOp',['../struct_mul_op.html',1,'']]],
   ['multi_5fiter_17',['multi_iter',['../classpocketfft_1_1detail_1_1multi__iter.html',1,'pocketfft::detail']]],
   ['multiply_18',['Multiply',['../structmlx_1_1core_1_1detail_1_1_multiply.html',1,'mlx::core::detail::Multiply'],['../classmlx_1_1core_1_1_multiply.html',1,'mlx::core::Multiply'],['../struct_multiply.html',1,'Multiply']]]
 ];
diff --git a/docs/build/html/search/defines_2.js b/docs/build/html/search/defines_2.js
index 51c520bfc..03e79577b 100644
--- a/docs/build/html/search/defines_2.js
+++ b/docs/build/html/search/defines_2.js
@@ -1,13 +1,12 @@
 var searchData=
 [
-  ['bfloat16_5fto_5fuint16_0',['bfloat16_to_uint16',['../bf16__math_8h.html#a51cfdd4502e755310f6f3456f039bea7',1,'bf16_math.h']]],
-  ['bfloat_5fbinop_1',['bfloat_binop',['../backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h'],['../types_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h']]],
-  ['bfloat_5fbinop_5fbase_2',['bfloat_binop_base',['../backend_2metal_2kernels_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h'],['../types_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h']]],
-  ['bfloat_5fbinop_5fhelper_3',['bfloat_binop_helper',['../backend_2metal_2kernels_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h'],['../types_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h']]],
-  ['bfloat_5fbitop_4',['bfloat_bitop',['../types_2bf16_8h.html#aac9ba86d4bf05bcda1936494f9b9b4d3',1,'bf16.h']]],
-  ['bfloat_5fcompop_5',['bfloat_compop',['../backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h'],['../types_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h']]],
-  ['bfloat_5finplace_5fbitop_6',['bfloat_inplace_bitop',['../types_2bf16_8h.html#af13b46bc58e6e6f675ae47aabec37711',1,'bf16.h']]],
-  ['bfloat_5finplace_5fop_7',['bfloat_inplace_op',['../backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c',1,'bfloat_inplace_op:&#160;bf16.h'],['../types_2bf16_8h.html#aee905053f51f76e0c1af94199714d514',1,'bfloat_inplace_op:&#160;bf16.h']]],
-  ['bfloat_5finplace_5fop_5faddr_5fspace_5fhelper_8',['bfloat_inplace_op_addr_space_helper',['../backend_2metal_2kernels_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1457da931c28fa4e2500daa4e6441e8b',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h']]],
-  ['bfloat_5finplace_5fop_5fhelper_9',['bfloat_inplace_op_helper',['../backend_2metal_2kernels_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d',1,'bfloat_inplace_op_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afe5988aa8147be2bafda6a5b7792fe15',1,'bfloat_inplace_op_helper:&#160;bf16.h']]]
+  ['bfloat_5fbinop_0',['bfloat_binop',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h'],['../types_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707',1,'bfloat_binop:&#160;bf16.h']]],
+  ['bfloat_5fbinop_5fbase_1',['bfloat_binop_base',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h'],['../types_2bf16_8h.html#a78c92beda4436da9a2e520fa98c59f70',1,'bfloat_binop_base:&#160;bf16.h']]],
+  ['bfloat_5fbinop_5fhelper_2',['bfloat_binop_helper',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h'],['../types_2bf16_8h.html#ac7ff36230dab17e8f17b7a7c80888594',1,'bfloat_binop_helper:&#160;bf16.h']]],
+  ['bfloat_5fbitop_3',['bfloat_bitop',['../types_2bf16_8h.html#aac9ba86d4bf05bcda1936494f9b9b4d3',1,'bf16.h']]],
+  ['bfloat_5fcompop_4',['bfloat_compop',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h'],['../types_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239',1,'bfloat_compop:&#160;bf16.h']]],
+  ['bfloat_5finplace_5fbitop_5',['bfloat_inplace_bitop',['../types_2bf16_8h.html#af13b46bc58e6e6f675ae47aabec37711',1,'bf16.h']]],
+  ['bfloat_5finplace_5fop_6',['bfloat_inplace_op',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c',1,'bfloat_inplace_op:&#160;bf16.h'],['../types_2bf16_8h.html#aee905053f51f76e0c1af94199714d514',1,'bfloat_inplace_op:&#160;bf16.h']]],
+  ['bfloat_5finplace_5fop_5faddr_5fspace_5fhelper_7',['bfloat_inplace_op_addr_space_helper',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af30a2cbd2c3415516203b83bd21872f8',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1457da931c28fa4e2500daa4e6441e8b',1,'bfloat_inplace_op_addr_space_helper:&#160;bf16.h']]],
+  ['bfloat_5finplace_5fop_5fhelper_8',['bfloat_inplace_op_helper',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2846fd11b5e19b435e9f7ef0998c9b1d',1,'bfloat_inplace_op_helper:&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afe5988aa8147be2bafda6a5b7792fe15',1,'bfloat_inplace_op_helper:&#160;bf16.h']]]
 ];
diff --git a/docs/build/html/search/defines_8.js b/docs/build/html/search/defines_8.js
index 09218ba2b..9e2f63e4c 100644
--- a/docs/build/html/search/defines_8.js
+++ b/docs/build/html/search/defines_8.js
@@ -1,10 +1,6 @@
 var searchData=
 [
-  ['max_5foutput_5fsize_0',['MAX_OUTPUT_SIZE',['../backend_2metal_2kernels_2fft_8h.html#a28d683cf067736d76f867f30c066317e',1,'fft.h']]],
-  ['max_5fradix_1',['MAX_RADIX',['../backend_2metal_2kernels_2fft_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;fft.h'],['../readwrite_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;readwrite.h']]],
-  ['mlx_5flapack_5ffunc_2',['MLX_LAPACK_FUNC',['../lapack_8h.html#ae22db9704827bf013a0a61f21a47464b',1,'lapack.h']]],
-  ['mlx_5fmtl_5fconst_3',['MLX_MTL_CONST',['../kernels_2gemv__masked_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;gemv_masked.h'],['../quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;quantized.h'],['../sort_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;sort.h']]],
-  ['mlx_5fmtl_5floop_5funroll_4',['MLX_MTL_LOOP_UNROLL',['../sort_8h.html#ad34b622323cebef136669fedd7229515',1,'sort.h']]],
-  ['mlx_5fmtl_5fpragma_5funroll_5',['MLX_MTL_PRAGMA_UNROLL',['../kernels_2gemv__masked_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;gemv_masked.h'],['../backend_2metal_2kernels_2utils_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;utils.h']]],
-  ['mtl_5fconst_6',['MTL_CONST',['../defines_8h.html#a767ed9f2604de22b259cee02c4ce1d22',1,'defines.h']]]
+  ['jit_5felse_0',['jit_else',['../backend_2metal_2kernels_2jit_2bf16_8h.html#a4b2f08732045407adc7ee181e39e5ae3',1,'bf16.h']]],
+  ['jit_5fendif_1',['jit_endif',['../backend_2metal_2kernels_2jit_2bf16_8h.html#a5049b44a1fffcb837e0c470ae4cafc56',1,'bf16.h']]],
+  ['jit_5fif_2',['jit_if',['../backend_2metal_2kernels_2jit_2bf16_8h.html#aaf5bb88c2349054a6c4c2aefee63d3d2',1,'bf16.h']]]
 ];
diff --git a/docs/build/html/search/defines_9.js b/docs/build/html/search/defines_9.js
index 5aca25aba..09218ba2b 100644
--- a/docs/build/html/search/defines_9.js
+++ b/docs/build/html/search/defines_9.js
@@ -1,22 +1,10 @@
 var searchData=
 [
-  ['pocketfft_5fcache_5fsize_0',['POCKETFFT_CACHE_SIZE',['../pocketfft_8h.html#a9e604bcf20603d70b62b233d3f306714',1,'pocketfft.h']]],
-  ['pocketfft_5fno_5fvectors_1',['POCKETFFT_NO_VECTORS',['../pocketfft_8h.html#aa9cdaed0819c48f97fcd19f05c289160',1,'pocketfft.h']]],
-  ['pocketfft_5fnoinline_2',['POCKETFFT_NOINLINE',['../pocketfft_8h.html#a7020984e0ca1d6e565629ca6e7c1a7e0',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep11_3',['POCKETFFT_PARTSTEP11',['../pocketfft_8h.html#a1793d0d00f2e13101eb5ad0719c40817',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep11a_4',['POCKETFFT_PARTSTEP11a',['../pocketfft_8h.html#ac35e1aa5ae84d655256b7a0afd9051c2',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep11a0_5',['POCKETFFT_PARTSTEP11a0',['../pocketfft_8h.html#ab2df44457945ab625fb38a777a46af1b',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep3a_6',['POCKETFFT_PARTSTEP3a',['../pocketfft_8h.html#ac112b26e5130636ac1d91c2f0af45e0b',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep3b_7',['POCKETFFT_PARTSTEP3b',['../pocketfft_8h.html#a41e646e5535a3a1c6e0d0e67122382f5',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep5a_8',['POCKETFFT_PARTSTEP5a',['../pocketfft_8h.html#a078bc2bd38ab0ffb15b981878c9de03c',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep5b_9',['POCKETFFT_PARTSTEP5b',['../pocketfft_8h.html#ab8a5da142555e059c5e9c618f75b46fa',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep7_10',['POCKETFFT_PARTSTEP7',['../pocketfft_8h.html#af7de1f82911a973d8446cf3f40ff3044',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep7a_11',['POCKETFFT_PARTSTEP7a',['../pocketfft_8h.html#a2b27f6e1f0ee131765186870517255c6',1,'pocketfft.h']]],
-  ['pocketfft_5fpartstep7a0_12',['POCKETFFT_PARTSTEP7a0',['../pocketfft_8h.html#a9c2fc2de74a031c38e9d8a21249ae1cd',1,'pocketfft.h']]],
-  ['pocketfft_5fprep11_13',['POCKETFFT_PREP11',['../pocketfft_8h.html#a536d2ea61479d4b074bf52ce09fdbc3a',1,'pocketfft.h']]],
-  ['pocketfft_5fprep3_14',['POCKETFFT_PREP3',['../pocketfft_8h.html#ae2fd9d433c417f0768fe1b58145b2e59',1,'pocketfft.h']]],
-  ['pocketfft_5fprep5_15',['POCKETFFT_PREP5',['../pocketfft_8h.html#a73077c26d2a82754db2a9c48bc0e11a6',1,'pocketfft.h']]],
-  ['pocketfft_5fprep7_16',['POCKETFFT_PREP7',['../pocketfft_8h.html#ae7c4d0cda5b3824f84eac54addabd6ec',1,'pocketfft.h']]],
-  ['pocketfft_5frearrange_17',['POCKETFFT_REARRANGE',['../pocketfft_8h.html#acffdf2e1ab84f36a7a097e1b8b87a9f9',1,'pocketfft.h']]],
-  ['pocketfft_5frestrict_18',['POCKETFFT_RESTRICT',['../pocketfft_8h.html#abbe177c4872821b32d76d5ce08d6ce82',1,'pocketfft.h']]]
+  ['max_5foutput_5fsize_0',['MAX_OUTPUT_SIZE',['../backend_2metal_2kernels_2fft_8h.html#a28d683cf067736d76f867f30c066317e',1,'fft.h']]],
+  ['max_5fradix_1',['MAX_RADIX',['../backend_2metal_2kernels_2fft_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;fft.h'],['../readwrite_8h.html#a7b6e56afa21f022c5e754b000955735a',1,'MAX_RADIX:&#160;readwrite.h']]],
+  ['mlx_5flapack_5ffunc_2',['MLX_LAPACK_FUNC',['../lapack_8h.html#ae22db9704827bf013a0a61f21a47464b',1,'lapack.h']]],
+  ['mlx_5fmtl_5fconst_3',['MLX_MTL_CONST',['../kernels_2gemv__masked_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;gemv_masked.h'],['../quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;quantized.h'],['../sort_8h.html#a0386011c52d03e60885a31e6fbd903dd',1,'MLX_MTL_CONST:&#160;sort.h']]],
+  ['mlx_5fmtl_5floop_5funroll_4',['MLX_MTL_LOOP_UNROLL',['../sort_8h.html#ad34b622323cebef136669fedd7229515',1,'sort.h']]],
+  ['mlx_5fmtl_5fpragma_5funroll_5',['MLX_MTL_PRAGMA_UNROLL',['../kernels_2gemv__masked_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;gemv_masked.h'],['../backend_2metal_2kernels_2utils_8h.html#a069b682d7d21827461544817d722bfd3',1,'MLX_MTL_PRAGMA_UNROLL:&#160;utils.h']]],
+  ['mtl_5fconst_6',['MTL_CONST',['../defines_8h.html#a767ed9f2604de22b259cee02c4ce1d22',1,'defines.h']]]
 ];
diff --git a/docs/build/html/search/defines_a.js b/docs/build/html/search/defines_a.js
index b1a537eca..5aca25aba 100644
--- a/docs/build/html/search/defines_a.js
+++ b/docs/build/html/search/defines_a.js
@@ -1,4 +1,22 @@
 var searchData=
 [
-  ['radix_5fstep_0',['RADIX_STEP',['../backend_2metal_2kernels_2fft_8h.html#a794032d3a9acff0e31c77c69d0007f10',1,'fft.h']]]
+  ['pocketfft_5fcache_5fsize_0',['POCKETFFT_CACHE_SIZE',['../pocketfft_8h.html#a9e604bcf20603d70b62b233d3f306714',1,'pocketfft.h']]],
+  ['pocketfft_5fno_5fvectors_1',['POCKETFFT_NO_VECTORS',['../pocketfft_8h.html#aa9cdaed0819c48f97fcd19f05c289160',1,'pocketfft.h']]],
+  ['pocketfft_5fnoinline_2',['POCKETFFT_NOINLINE',['../pocketfft_8h.html#a7020984e0ca1d6e565629ca6e7c1a7e0',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep11_3',['POCKETFFT_PARTSTEP11',['../pocketfft_8h.html#a1793d0d00f2e13101eb5ad0719c40817',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep11a_4',['POCKETFFT_PARTSTEP11a',['../pocketfft_8h.html#ac35e1aa5ae84d655256b7a0afd9051c2',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep11a0_5',['POCKETFFT_PARTSTEP11a0',['../pocketfft_8h.html#ab2df44457945ab625fb38a777a46af1b',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep3a_6',['POCKETFFT_PARTSTEP3a',['../pocketfft_8h.html#ac112b26e5130636ac1d91c2f0af45e0b',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep3b_7',['POCKETFFT_PARTSTEP3b',['../pocketfft_8h.html#a41e646e5535a3a1c6e0d0e67122382f5',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep5a_8',['POCKETFFT_PARTSTEP5a',['../pocketfft_8h.html#a078bc2bd38ab0ffb15b981878c9de03c',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep5b_9',['POCKETFFT_PARTSTEP5b',['../pocketfft_8h.html#ab8a5da142555e059c5e9c618f75b46fa',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep7_10',['POCKETFFT_PARTSTEP7',['../pocketfft_8h.html#af7de1f82911a973d8446cf3f40ff3044',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep7a_11',['POCKETFFT_PARTSTEP7a',['../pocketfft_8h.html#a2b27f6e1f0ee131765186870517255c6',1,'pocketfft.h']]],
+  ['pocketfft_5fpartstep7a0_12',['POCKETFFT_PARTSTEP7a0',['../pocketfft_8h.html#a9c2fc2de74a031c38e9d8a21249ae1cd',1,'pocketfft.h']]],
+  ['pocketfft_5fprep11_13',['POCKETFFT_PREP11',['../pocketfft_8h.html#a536d2ea61479d4b074bf52ce09fdbc3a',1,'pocketfft.h']]],
+  ['pocketfft_5fprep3_14',['POCKETFFT_PREP3',['../pocketfft_8h.html#ae2fd9d433c417f0768fe1b58145b2e59',1,'pocketfft.h']]],
+  ['pocketfft_5fprep5_15',['POCKETFFT_PREP5',['../pocketfft_8h.html#a73077c26d2a82754db2a9c48bc0e11a6',1,'pocketfft.h']]],
+  ['pocketfft_5fprep7_16',['POCKETFFT_PREP7',['../pocketfft_8h.html#ae7c4d0cda5b3824f84eac54addabd6ec',1,'pocketfft.h']]],
+  ['pocketfft_5frearrange_17',['POCKETFFT_REARRANGE',['../pocketfft_8h.html#acffdf2e1ab84f36a7a097e1b8b87a9f9',1,'pocketfft.h']]],
+  ['pocketfft_5frestrict_18',['POCKETFFT_RESTRICT',['../pocketfft_8h.html#abbe177c4872821b32d76d5ce08d6ce82',1,'pocketfft.h']]]
 ];
diff --git a/docs/build/html/search/defines_b.js b/docs/build/html/search/defines_b.js
index 3a705c846..b1a537eca 100644
--- a/docs/build/html/search/defines_b.js
+++ b/docs/build/html/search/defines_b.js
@@ -1,5 +1,4 @@
 var searchData=
 [
-  ['steel_5fconst_0',['STEEL_CONST',['../steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b',1,'defines.h']]],
-  ['steel_5fpragma_5funroll_1',['STEEL_PRAGMA_UNROLL',['../steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6',1,'defines.h']]]
+  ['radix_5fstep_0',['RADIX_STEP',['../backend_2metal_2kernels_2fft_8h.html#a794032d3a9acff0e31c77c69d0007f10',1,'fft.h']]]
 ];
diff --git a/docs/build/html/search/defines_c.js b/docs/build/html/search/defines_c.js
index 822e6ecf7..3a705c846 100644
--- a/docs/build/html/search/defines_c.js
+++ b/docs/build/html/search/defines_c.js
@@ -1,4 +1,5 @@
 var searchData=
 [
-  ['uint16_5fto_5fbfloat16_0',['uint16_to_bfloat16',['../bf16__math_8h.html#a030d871474c0e7d907fccffcc8c047e0',1,'bf16_math.h']]]
+  ['steel_5fconst_0',['STEEL_CONST',['../steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b',1,'defines.h']]],
+  ['steel_5fpragma_5funroll_1',['STEEL_PRAGMA_UNROLL',['../steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6',1,'defines.h']]]
 ];
diff --git a/docs/build/html/search/files_0.js b/docs/build/html/search/files_0.js
index 257073d85..8570df320 100644
--- a/docs/build/html/search/files_0.js
+++ b/docs/build/html/search/files_0.js
@@ -3,5 +3,6 @@ var searchData=
   ['allocator_2eh_0',['allocator.h',['../allocator_8h.html',1,'(Global Namespace)'],['../backend_2metal_2allocator_8h.html',1,'(Global Namespace)']]],
   ['arange_2eh_1',['arange.h',['../common_2arange_8h.html',1,'(Global Namespace)'],['../metal_2jit_2arange_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2arange_8h.html',1,'(Global Namespace)']]],
   ['array_2eh_2',['array.h',['../array_8h.html',1,'']]],
-  ['atomic_2eh_3',['atomic.h',['../atomic_8h.html',1,'']]]
+  ['atomic_2eh_3',['atomic.h',['../atomic_8h.html',1,'']]],
+  ['attn_2eh_4',['attn.h',['../attn_8h.html',1,'']]]
 ];
diff --git a/docs/build/html/search/files_1.js b/docs/build/html/search/files_1.js
index 7a4072998..af8aecb92 100644
--- a/docs/build/html/search/files_1.js
+++ b/docs/build/html/search/files_1.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['bf16_2eh_0',['bf16.h',['../backend_2metal_2kernels_2bf16_8h.html',1,'(Global Namespace)'],['../types_2bf16_8h.html',1,'(Global Namespace)']]],
+  ['bf16_2eh_0',['bf16.h',['../backend_2metal_2kernels_2jit_2bf16_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html',1,'(Global Namespace)'],['../types_2bf16_8h.html',1,'(Global Namespace)']]],
   ['bf16_5fmath_2eh_1',['bf16_math.h',['../bf16__math_8h.html',1,'']]],
   ['binary_2eh_2',['binary.h',['../common_2binary_8h.html',1,'(Global Namespace)'],['../metal_2binary_8h.html',1,'(Global Namespace)'],['../metal_2kernels_2binary_8h.html',1,'(Global Namespace)']]],
   ['binary_5fops_2eh_3',['binary_ops.h',['../binary__ops_8h.html',1,'']]],
diff --git a/docs/build/html/search/files_10.js b/docs/build/html/search/files_10.js
index 103b9ebd4..01964bcf5 100644
--- a/docs/build/html/search/files_10.js
+++ b/docs/build/html/search/files_10.js
@@ -1,13 +1,13 @@
 var searchData=
 [
-  ['scaled_5fdot_5fproduct_5fattention_5fparams_2eh_0',['scaled_dot_product_attention_params.h',['../scaled__dot__product__attention__params_8h.html',1,'']]],
-  ['scan_2eh_1',['scan.h',['../scan_8h.html',1,'']]],
-  ['scatter_2eh_2',['scatter.h',['../scatter_8h.html',1,'']]],
-  ['scheduler_2eh_3',['scheduler.h',['../scheduler_8h.html',1,'']]],
-  ['sdpa_5fvector_2eh_4',['sdpa_vector.h',['../sdpa__vector_8h.html',1,'']]],
-  ['slicing_2eh_5',['slicing.h',['../common_2slicing_8h.html',1,'(Global Namespace)'],['../metal_2slicing_8h.html',1,'(Global Namespace)']]],
-  ['softmax_2eh_6',['softmax.h',['../jit_2softmax_8h.html',1,'(Global Namespace)'],['../kernels_2softmax_8h.html',1,'(Global Namespace)']]],
-  ['sort_2eh_7',['sort.h',['../sort_8h.html',1,'']]],
+  ['scan_2eh_0',['scan.h',['../scan_8h.html',1,'']]],
+  ['scatter_2eh_1',['scatter.h',['../scatter_8h.html',1,'']]],
+  ['scheduler_2eh_2',['scheduler.h',['../scheduler_8h.html',1,'']]],
+  ['sdpa_5fvector_2eh_3',['sdpa_vector.h',['../sdpa__vector_8h.html',1,'']]],
+  ['slicing_2eh_4',['slicing.h',['../common_2slicing_8h.html',1,'(Global Namespace)'],['../metal_2slicing_8h.html',1,'(Global Namespace)']]],
+  ['softmax_2eh_5',['softmax.h',['../jit_2softmax_8h.html',1,'(Global Namespace)'],['../kernels_2softmax_8h.html',1,'(Global Namespace)']]],
+  ['sort_2eh_6',['sort.h',['../sort_8h.html',1,'']]],
+  ['steel_5fattention_2eh_7',['steel_attention.h',['../steel__attention_8h.html',1,'']]],
   ['steel_5fconv_2eh_8',['steel_conv.h',['../jit_2steel__conv_8h.html',1,'(Global Namespace)'],['../kernels_2steel_2conv_2kernels_2steel__conv_8h.html',1,'(Global Namespace)']]],
   ['steel_5fconv_5fgeneral_2eh_9',['steel_conv_general.h',['../steel__conv__general_8h.html',1,'']]],
   ['steel_5fgemm_2eh_10',['steel_gemm.h',['../steel__gemm_8h.html',1,'']]],
diff --git a/docs/build/html/search/files_11.js b/docs/build/html/search/files_11.js
index 6f77b5ab2..b7521b551 100644
--- a/docs/build/html/search/files_11.js
+++ b/docs/build/html/search/files_11.js
@@ -4,7 +4,7 @@ var searchData=
   ['ternary_5fops_2eh_1',['ternary_ops.h',['../ternary__ops_8h.html',1,'']]],
   ['threadpool_2eh_2',['threadpool.h',['../threadpool_8h.html',1,'']]],
   ['threefry_2eh_3',['threefry.h',['../threefry_8h.html',1,'']]],
-  ['transforms_2eh_4',['transforms.h',['../backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html',1,'(Global Namespace)'],['../transforms_8h.html',1,'(Global Namespace)']]],
+  ['transforms_2eh_4',['transforms.h',['../backend_2metal_2kernels_2steel_2attn_2transforms_8h.html',1,'(Global Namespace)'],['../backend_2metal_2kernels_2steel_2gemm_2transforms_8h.html',1,'(Global Namespace)'],['../transforms_8h.html',1,'(Global Namespace)']]],
   ['transforms_5fimpl_2eh_5',['transforms_impl.h',['../transforms__impl_8h.html',1,'']]],
   ['type_5ftraits_2eh_6',['type_traits.h',['../type__traits_8h.html',1,'']]]
 ];
diff --git a/docs/build/html/search/files_a.js b/docs/build/html/search/files_a.js
index 599d21dfb..a0282c4c3 100644
--- a/docs/build/html/search/files_a.js
+++ b/docs/build/html/search/files_a.js
@@ -3,7 +3,7 @@ var searchData=
   ['lapack_2eh_0',['lapack.h',['../lapack_8h.html',1,'']]],
   ['linalg_2eh_1',['linalg.h',['../linalg_8h.html',1,'']]],
   ['load_2eh_2',['load.h',['../backend_2common_2load_8h.html',1,'(Global Namespace)'],['../io_2load_8h.html',1,'(Global Namespace)']]],
-  ['loader_2eh_3',['loader.h',['../conv_2loader_8h.html',1,'(Global Namespace)'],['../gemm_2loader_8h.html',1,'(Global Namespace)']]],
+  ['loader_2eh_3',['loader.h',['../attn_2loader_8h.html',1,'(Global Namespace)'],['../conv_2loader_8h.html',1,'(Global Namespace)'],['../gemm_2loader_8h.html',1,'(Global Namespace)']]],
   ['loader_5fchannel_5fl_2eh_4',['loader_channel_l.h',['../loader__channel__l_8h.html',1,'']]],
   ['loader_5fchannel_5fn_2eh_5',['loader_channel_n.h',['../loader__channel__n_8h.html',1,'']]],
   ['loader_5fgeneral_2eh_6',['loader_general.h',['../loader__general_8h.html',1,'']]]
diff --git a/docs/build/html/search/files_b.js b/docs/build/html/search/files_b.js
index e87b82274..d836b920a 100644
--- a/docs/build/html/search/files_b.js
+++ b/docs/build/html/search/files_b.js
@@ -4,5 +4,5 @@ var searchData=
   ['metal_2eh_1',['metal.h',['../metal_8h.html',1,'']]],
   ['metal_5fimpl_2eh_2',['metal_impl.h',['../metal__impl_8h.html',1,'']]],
   ['mlx_2eh_3',['mlx.h',['../mlx_8h.html',1,'']]],
-  ['mma_2eh_4',['mma.h',['../mma_8h.html',1,'']]]
+  ['mma_2eh_4',['mma.h',['../attn_2mma_8h.html',1,'(Global Namespace)'],['../gemm_2mma_8h.html',1,'(Global Namespace)']]]
 ];
diff --git a/docs/build/html/search/files_d.js b/docs/build/html/search/files_d.js
index 506b3e167..0ea2bf8fb 100644
--- a/docs/build/html/search/files_d.js
+++ b/docs/build/html/search/files_d.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['params_2eh_0',['params.h',['../conv_2params_8h.html',1,'(Global Namespace)'],['../gemm_2params_8h.html',1,'(Global Namespace)']]],
+  ['params_2eh_0',['params.h',['../attn_2params_8h.html',1,'(Global Namespace)'],['../conv_2params_8h.html',1,'(Global Namespace)'],['../gemm_2params_8h.html',1,'(Global Namespace)']]],
   ['pocketfft_2eh_1',['pocketfft.h',['../pocketfft_8h.html',1,'']]],
   ['primitives_2eh_2',['primitives.h',['../distributed_2primitives_8h.html',1,'(Global Namespace)'],['../primitives_8h.html',1,'(Global Namespace)']]]
 ];
diff --git a/docs/build/html/search/functions_1.js b/docs/build/html/search/functions_1.js
index b4193ba2a..51915fc81 100644
--- a/docs/build/html/search/functions_1.js
+++ b/docs/build/html/search/functions_1.js
@@ -13,71 +13,71 @@ var searchData=
   ['adjust_5fmatrix_5foffsets_10',['adjust_matrix_offsets',['../quantized_8h.html#accab1f9e17a65242347c051f98e4c0be',1,'adjust_matrix_offsets(const device T *&amp;x, const device uint32_t *&amp;w, const device T *&amp;scales, const device T *&amp;biases, device T *&amp;y, int output_stride, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid):&#160;quantized.h'],['../quantized_8h.html#a3ab400746ad77be89c30d25638e01698',1,'adjust_matrix_offsets(const device T *&amp;x, const device uint32_t *&amp;w, const device T *&amp;scales, const device T *&amp;biases, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, device T *&amp;y, int output_stride, const constant int &amp;batch_ndims, const constant int *batch_shape, const constant size_t *lhs_strides, const constant size_t *rhs_strides, const constant int &amp;x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &amp;w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid):&#160;quantized.h']]],
   ['advance_11',['advance',['../classpocketfft_1_1detail_1_1multi__iter.html#a5ddcc0666125b3cb6c0d62b30befdd2c',1,'pocketfft::detail::multi_iter::advance()'],['../classpocketfft_1_1detail_1_1simple__iter.html#a73a9ecd3008d2bd35aaa00bf9fac074f',1,'pocketfft::detail::simple_iter::advance()'],['../classpocketfft_1_1detail_1_1rev__iter.html#ad1918c84ae963188afc7599629b29686',1,'pocketfft::detail::rev_iter::advance()']]],
   ['affine_5fdequantize_12',['affine_dequantize',['../quantized_8h.html#a6076203615038eb06816158f7b3869c6',1,'affine_dequantize():&#160;quantized.h'],['../namespacemlx_1_1core_1_1fast.html#a12c7ef41409d6fb378008e67b6fab328',1,'mlx::core::fast::affine_dequantize()']]],
-  ['affine_5fquantize_13',['affine_quantize',['../quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59',1,'affine_quantize():&#160;quantized.h'],['../namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080',1,'mlx::core::fast::affine_quantize(const array &amp;w, int group_size=64, int bits=4, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fast.html#a638c7e9b9ea8677f01786d8f9738baf8',1,'mlx::core::fast::affine_quantize(const array &amp;w, const array &amp;scales, const array &amp;biases, int group_size=64, int bits=4, StreamOrDevice s={})']]],
-  ['affine_5fquantize_5fscales_5fbiases_14',['affine_quantize_scales_biases',['../quantized_8h.html#a7561acefd7b55e7e2b25393be08bb99c',1,'quantized.h']]],
-  ['affinequantize_15',['AffineQuantize',['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a84d5fa9e8c3de407fbcc5f38d2ed1473',1,'mlx::core::fast::AffineQuantize']]],
-  ['aligned_5falloc_16',['aligned_alloc',['../namespacepocketfft_1_1detail.html#ae397445c61400f47a8fe3f8e1b6d0b76',1,'pocketfft::detail']]],
-  ['aligned_5fallocator_17',['aligned_allocator',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a57c07047ac09c6cf48a269429de2b0fb',1,'pocketfft::detail::threading::aligned_allocator::aligned_allocator(const aligned_allocator&lt; U &gt; &amp;)'],['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a0c390851ec37c5cdc5c1e7c6232a0b94',1,'pocketfft::detail::threading::aligned_allocator::aligned_allocator()=default']]],
-  ['aligned_5fdealloc_18',['aligned_dealloc',['../namespacepocketfft_1_1detail.html#aec7820e36a33e0a8bb83aa03b04b81e8',1,'pocketfft::detail']]],
-  ['all_19',['all',['../group__ops.html#ga3b1b90ef1275ca17655b6d7f25d3ee68',1,'mlx::core::all(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga3689e12e8f42dadb4cbe2b07dc4099f4',1,'mlx::core::all(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gac0919c6ba53aea35a7683dea7e9a9a59',1,'mlx::core::all(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gae2d5fcc5b62d673cca76c08b7b4afbbc',1,'mlx::core::all(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['all_5fgather_20',['all_gather',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#aeb5a1726358213bc75756506f7b54d04',1,'mlx::core::distributed::detail::all_gather()'],['../namespacemlx_1_1core_1_1distributed.html#a82ef5e8cc7ac62cd228e51b1c1b77cb7',1,'mlx::core::distributed::all_gather()']]],
-  ['all_5freduce_21',['all_reduce',['../reduce__all_8h.html#a99ef48ae72b3e715c5f4d7ea07cd213d',1,'reduce_all.h']]],
-  ['all_5freduce_5fdispatch_22',['all_reduce_dispatch',['../namespacemlx_1_1core.html#a3ab0fd997d9a35782106ff083a72e098',1,'mlx::core']]],
-  ['all_5fsum_23',['all_sum',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#aa1d225b25f7b6426c48c5e35860ee960',1,'mlx::core::distributed::detail::all_sum()'],['../namespacemlx_1_1core_1_1distributed.html#a67ccb1a5445fc6f5db49dd36a15e5980',1,'mlx::core::distributed::all_sum()']]],
-  ['allclose_24',['allclose',['../group__ops.html#gaf0cd4257de7542daf9faf5e605e31020',1,'mlx::core']]],
-  ['allgather_25',['AllGather',['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#af4b10a5b61f160fb64353057c185b661',1,'mlx::core::distributed::AllGather']]],
-  ['alloc_5ftmp_26',['alloc_tmp',['../namespacepocketfft_1_1detail.html#a4db03cbcd9d43d9e0b0b9067713c80e9',1,'pocketfft::detail::alloc_tmp(const shape_t &amp;shape, size_t axsize, size_t elemsize)'],['../namespacepocketfft_1_1detail.html#a13832735696303b9559c4663631d5475',1,'pocketfft::detail::alloc_tmp(const shape_t &amp;shape, const shape_t &amp;axes, size_t elemsize)']]],
-  ['allocate_27',['allocate',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a4f785747b898980756af9e5515363826',1,'pocketfft::detail::threading::aligned_allocator']]],
-  ['allocator_28',['Allocator',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a5803678a418fef687fc65fa9d5c37b65',1,'mlx::core::allocator::Allocator::Allocator()=default'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#aa05c081ce80dc036f9d3dd8c195259d2',1,'mlx::core::allocator::Allocator::Allocator(const Allocator &amp;other)=delete'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a8e8ce346a16cf0c62847bed9289f9959',1,'mlx::core::allocator::Allocator::Allocator(Allocator &amp;&amp;other)=delete']]],
-  ['allocator_29',['allocator',['../namespacemlx_1_1core_1_1allocator.html#aa23e2f20a336d0b159c097087194634e',1,'mlx::core::allocator::allocator()'],['../namespacemlx_1_1core_1_1metal.html#a74b3558bd518aecde6b14b0ba5e1a0d5',1,'mlx::core::metal::allocator()']]],
-  ['allreduce_30',['AllReduce',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a2d1ea56cbf72a316680ea90aa6da1c2d',1,'mlx::core::distributed::AllReduce']]],
-  ['any_31',['any',['../group__ops.html#ga8598dd718fb05cb28535e250372d4e6f',1,'mlx::core::any(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#gad37df97f253a963bece124198dbaf9ba',1,'mlx::core::any(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaf240618fc8b06debf5f56e97e84f18ef',1,'mlx::core::any(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gab1d56277d468a55227f4dad6bc2fc1ce',1,'mlx::core::any(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['apply_32',['apply',['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply(InT x) const'],['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply(InT x) const'],['../structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75',1,'mlx::steel::TransformNone::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90',1,'mlx::steel::TransformNone::apply(InT x, OutT)'],['../structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf',1,'mlx::steel::TransformAdd::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19',1,'mlx::steel::TransformAdd::apply(InT x, OutT c)'],['../structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87',1,'mlx::steel::TransformAxpby::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba',1,'mlx::steel::TransformAxpby::apply(InT x, OutT c) const']]],
-  ['apply_5fepilogue_33',['apply_epilogue',['../structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff',1,'mlx::steel::BlockMMA::apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae',1,'mlx::steel::BlockMMA::apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)']]],
-  ['apply_5fepilogue_5fsafe_34',['apply_epilogue_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a',1,'mlx::steel::BlockMMA']]],
-  ['apply_5finplace_5fop_35',['apply_inplace_op',['../structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf',1,'mlx::steel::BlockLoader']]],
-  ['arange_36',['Arange',['../classmlx_1_1core_1_1_arange.html#a1a70c3b0b9c67d5a9446c141c5b7c574',1,'mlx::core::Arange']]],
-  ['arange_37',['arange',['../namespacemlx_1_1core.html#a369aa886219b83cf219e7a7862ce260b',1,'mlx::core::arange()'],['../namespacemlx_1_1core_1_1metal.html#a272c36f0faf2570cbb2f36030e9a3f26',1,'mlx::core::metal::arange()'],['../metal_2kernels_2arange_8h.html#a1e5126ee6ae0164c2343230c4d87c03e',1,'arange():&#160;arange.h'],['../group__ops.html#ga7ca088b8090b9f84f2e08345cf3f835a',1,'mlx::core::arange(double start, double stop, double step, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga4c36b841dc5cba391dad029be5a0ad98',1,'mlx::core::arange(double start, double stop, double step, StreamOrDevice s={})'],['../group__ops.html#ga8d7cf9eb15e2daf1469058907e8abc85',1,'mlx::core::arange(double start, double stop, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga74566a14e69ba6a25f5a35e7ade5c282',1,'mlx::core::arange(double start, double stop, StreamOrDevice s={})'],['../group__ops.html#ga345aa27af3dae3646b8b4b1068e89a3e',1,'mlx::core::arange(double stop, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#gaae179075d0fe23f4bd53fdf8c41f4c70',1,'mlx::core::arange(double stop, StreamOrDevice s={})'],['../group__ops.html#ga6b945f513077c2978afc1a952c884860',1,'mlx::core::arange(int start, int stop, int step, StreamOrDevice s={})'],['../group__ops.html#ga1c39fcc6eaa1c1867735c7f849d708d6',1,'mlx::core::arange(int start, int stop, StreamOrDevice s={})'],['../group__ops.html#gafe6e4580452c873cac294f16129e633f',1,'mlx::core::arange(int stop, StreamOrDevice s={})']]],
-  ['arccos_38',['ArcCos',['../classmlx_1_1core_1_1_arc_cos.html#a66f4ee841d17923d93241b71ea5103e9',1,'mlx::core::ArcCos']]],
-  ['arccos_39',['arccos',['../group__ops.html#ga08bec7cb10c84466487b507fc5bf9776',1,'mlx::core']]],
-  ['arccosh_40',['ArcCosh',['../classmlx_1_1core_1_1_arc_cosh.html#a34597054db467941a2a883c653ba4d71',1,'mlx::core::ArcCosh']]],
-  ['arccosh_41',['arccosh',['../group__ops.html#gaafafcfcebdf7248679c8543d0c0497e5',1,'mlx::core']]],
-  ['arcsin_42',['ArcSin',['../classmlx_1_1core_1_1_arc_sin.html#a97cb8c3d4d9d6abc627dec49a404f013',1,'mlx::core::ArcSin']]],
-  ['arcsin_43',['arcsin',['../group__ops.html#ga8770e8c8f23f13343911f4c9d6e1c619',1,'mlx::core']]],
-  ['arcsinh_44',['ArcSinh',['../classmlx_1_1core_1_1_arc_sinh.html#a30076b222788deeaaf9ad92d3c535f20',1,'mlx::core::ArcSinh']]],
-  ['arcsinh_45',['arcsinh',['../group__ops.html#gac62e2cedc49ef2c90dd8584000317450',1,'mlx::core']]],
-  ['arctan_46',['ArcTan',['../classmlx_1_1core_1_1_arc_tan.html#a3511153bbd421e89fd9294cdb3f79b44',1,'mlx::core::ArcTan']]],
-  ['arctan_47',['arctan',['../group__ops.html#gaa041f3f070e68f4946db07516b7d092e',1,'mlx::core']]],
-  ['arctan2_48',['ArcTan2',['../classmlx_1_1core_1_1_arc_tan2.html#aa1a4ebab9924b6bcc80df5b52ed0121a',1,'mlx::core::ArcTan2']]],
-  ['arctan2_49',['arctan2',['../group__ops.html#ga6caba9c92b5989123501f909cc7da354',1,'mlx::core']]],
-  ['arctanh_50',['ArcTanh',['../classmlx_1_1core_1_1_arc_tanh.html#a17857bd0e2a3ecf1f7bf8e1a3d354358',1,'mlx::core::ArcTanh']]],
-  ['arctanh_51',['arctanh',['../group__ops.html#gab46a35925a04c5a9d2ec7898ee55358e',1,'mlx::core']]],
-  ['argmax_52',['argmax',['../group__ops.html#gae60b0b5339b9c50b9970260faf613e83',1,'mlx::core::argmax(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#gae6f6c5a840320b336fdc9687e0ed56c8',1,'mlx::core::argmax(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga2efa67466510fc26ab9ea8dff30f2ba5',1,'mlx::core::argmax(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['argmin_53',['argmin',['../group__ops.html#ga7c3bd5ef430a71dfd298e626741e3c71',1,'mlx::core::argmin(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga6bc577c5ab10cd9c848ba81321595070',1,'mlx::core::argmin(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaf66dc3c77b88e4009e0678eda41eca81',1,'mlx::core::argmin(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['argpartition_54',['ArgPartition',['../classmlx_1_1core_1_1_arg_partition.html#ab54b13dbf92351ba1ac06fd3e5a802df',1,'mlx::core::ArgPartition']]],
-  ['argpartition_55',['argpartition',['../group__ops.html#gaf301c49c10fa9b95a9e8dc52ead1a8dd',1,'mlx::core::argpartition(const array &amp;a, int kth, StreamOrDevice s={})'],['../group__ops.html#ga7b15c654c7463def57857a0e239989a3',1,'mlx::core::argpartition(const array &amp;a, int kth, int axis, StreamOrDevice s={})']]],
-  ['argreduce_56',['ArgReduce',['../classmlx_1_1core_1_1_arg_reduce.html#aaccf8021dc24895656e25142eb65aa03',1,'mlx::core::ArgReduce']]],
-  ['argsort_57',['ArgSort',['../classmlx_1_1core_1_1_arg_sort.html#a38507a8445302a81cb44674c4a5fc0b0',1,'mlx::core::ArgSort']]],
-  ['argsort_58',['argsort',['../group__ops.html#ga8df3b2703bf671457422894dd870cdc5',1,'mlx::core::argsort(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga7878e0daa5a75f44e57b5fe948fa3ef6',1,'mlx::core::argsort(const array &amp;a, int axis, StreamOrDevice s={})']]],
-  ['argument_5fencoder_59',['argument_encoder',['../classmlx_1_1core_1_1metal_1_1_device.html#a6e33e2b1287324fb4a6575e0da5e5881',1,'mlx::core::metal::Device']]],
-  ['arr_60',['arr',['../classpocketfft_1_1detail_1_1arr.html#a961a24410638b35129cd6b81850d2a42',1,'pocketfft::detail::arr::arr()'],['../classpocketfft_1_1detail_1_1arr.html#a04f832b780a4453fdf3b69bf75b182bd',1,'pocketfft::detail::arr::arr(size_t n)'],['../classpocketfft_1_1detail_1_1arr.html#a0cd8fb4a588a74d428a7349d38b477d0',1,'pocketfft::detail::arr::arr(arr &amp;&amp;other)']]],
-  ['arr_5finfo_61',['arr_info',['../classpocketfft_1_1detail_1_1arr__info.html#a0dbddb7d86ca306159fc9ef9a453b21e',1,'pocketfft::detail::arr_info']]],
-  ['array_62',['array',['../classmlx_1_1core_1_1array.html#a75fac72da3ce214fa3737df92a64b232',1,'mlx::core::array::array(T val, Dtype dtype=TypeToDtype&lt; T &gt;())'],['../classmlx_1_1core_1_1array.html#a6db4b8c28c767cc16ad2785ece496dca',1,'mlx::core::array::array(const std::complex&lt; float &gt; &amp;val, Dtype dtype=complex64)'],['../classmlx_1_1core_1_1array.html#a3e506a53b9c7567448f7809dda680210',1,'mlx::core::array::array(It data, std::vector&lt; int &gt; shape, Dtype dtype=TypeToDtype&lt; typename std::iterator_traits&lt; It &gt;::value_type &gt;())'],['../classmlx_1_1core_1_1array.html#a87f170384f4fb93decf2b80ae7280f00',1,'mlx::core::array::array(std::initializer_list&lt; T &gt; data, Dtype dtype=TypeToDtype&lt; T &gt;())'],['../classmlx_1_1core_1_1array.html#a46642301da11e3eb4312c37349fbc9d7',1,'mlx::core::array::array(std::initializer_list&lt; float &gt; data)'],['../classmlx_1_1core_1_1array.html#a5e1812029394bfb1a706c83611286f49',1,'mlx::core::array::array(std::initializer_list&lt; int &gt; data, Dtype dtype)'],['../classmlx_1_1core_1_1array.html#a44e57a41819321e0d796e08cb9a06e4b',1,'mlx::core::array::array(std::initializer_list&lt; T &gt; data, std::vector&lt; int &gt; shape, Dtype dtype=TypeToDtype&lt; T &gt;())'],['../classmlx_1_1core_1_1array.html#a5b5f562ff14c150842cb61628e531663',1,'mlx::core::array::array(allocator::Buffer data, std::vector&lt; int &gt; shape, Dtype dtype, deleter_t deleter=allocator::free)'],['../classmlx_1_1core_1_1array.html#a297df274e2da5cb884257bbeffd6b187',1,'mlx::core::array::array(const array &amp;other)=default'],['../classmlx_1_1core_1_1array.html#ab6cbccbba66cc54acda4390b19f0397c',1,'mlx::core::array::array(array &amp;&amp;other)=default'],['../classmlx_1_1core_1_1array.html#adaade8f4bb7f8ecc0ba07efb17cd2620',1,'mlx::core::array::array(std::vector&lt; int &gt; shape, Dtype dtype, std::shared_ptr&lt; Primitive &gt; primitive, std::vector&lt; array &gt; inputs)']]],
-  ['array_5fequal_63',['array_equal',['../group__ops.html#ga8f3059336ee0c87207b1f8c6ab312645',1,'mlx::core::array_equal(const array &amp;a, const array &amp;b, bool equal_nan, StreamOrDevice s={})'],['../group__ops.html#gaf79cf0271ca0105d7b14295a90d0ed14',1,'mlx::core::array_equal(const array &amp;a, const array &amp;b, StreamOrDevice s={})']]],
-  ['arrayiterator_64',['ArrayIterator',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ad3afcb24c6db7642bbc06835f7f3e27a',1,'mlx::core::array::ArrayIterator']]],
-  ['as_5fstrided_65',['as_strided',['../group__ops.html#ga8de80ecef30fc560003d40f61a38b99d',1,'mlx::core']]],
-  ['asin_66',['asin',['../namespacemetal.html#a16e843194df3fd136404bf80ba5ac95c',1,'metal::asin()'],['../namespacemetal_1_1fast.html#a769455a283da99654b6e42c3acf13eb1',1,'metal::fast::asin()'],['../namespacemetal_1_1precise.html#adc7b8b6e12e320cb32030f728dcbf438',1,'metal::precise::asin()']]],
-  ['asinh_67',['asinh',['../namespacemetal.html#abcc3251866930cfe880f89e7473d0e63',1,'metal::asinh()'],['../namespacemetal_1_1fast.html#a4367034b7b3e14310803bb2be975a556',1,'metal::fast::asinh()'],['../namespacemetal_1_1precise.html#aaad1cdde6687c8011fbc5fda1bb13424',1,'metal::precise::asinh()']]],
-  ['asstrided_68',['AsStrided',['../classmlx_1_1core_1_1_as_strided.html#a80c0547f72ed53374eafc57d57b5d4af',1,'mlx::core::AsStrided']]],
-  ['astype_69',['AsType',['../classmlx_1_1core_1_1_as_type.html#a8c3241d402a8977bb4db037e225f5b47',1,'mlx::core::AsType']]],
-  ['astype_70',['astype',['../group__ops.html#ga0e58c24fc5668e5a521e5b45e8370a62',1,'mlx::core']]],
-  ['async_5feval_71',['async_eval',['../namespacemlx_1_1core.html#a15dda19aa7fa1fc5fca35df5cf963297',1,'mlx::core']]],
-  ['atan_72',['atan',['../namespacemetal.html#a80a771553d9a0012b93620d19c48b00f',1,'metal::atan()'],['../namespacemetal_1_1fast.html#a769503b4b7f89071d0983258c5a3ac5a',1,'metal::fast::atan()'],['../namespacemetal_1_1precise.html#aaaf4b5f4786a912089bbf0ae7619a6be',1,'metal::precise::atan()']]],
-  ['atan2_73',['atan2',['../namespacemetal.html#a1d430793eaa38ccf0d07145e3fcd1e61',1,'metal::atan2()'],['../namespacemetal_1_1fast.html#a00e687ea46f5affe26e6aef8fd62b89a',1,'metal::fast::atan2()'],['../namespacemetal_1_1precise.html#a6f161b049cc6884f87b09b33c2d1cd7f',1,'metal::precise::atan2()']]],
-  ['atanh_74',['atanh',['../namespacemetal.html#a57116427997ba71dd3863bfb15de33bf',1,'metal::atanh()'],['../namespacemetal_1_1fast.html#af24608fc605db9a14427d37c36dc1c53',1,'metal::fast::atanh()'],['../namespacemetal_1_1precise.html#a902994837653b90c47f4285673e712c4',1,'metal::precise::atanh()']]],
-  ['atleast_5f1d_75',['atleast_1d',['../group__ops.html#gaba4d25e7a2bf87ba4feb7837ec7fa396',1,'mlx::core::atleast_1d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga08ca172ce80157c916c89dd0b45b95c5',1,'mlx::core::atleast_1d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
-  ['atleast_5f2d_76',['atleast_2d',['../group__ops.html#gaeeb7f5bb88aa32a3ac2be1f39c5f8087',1,'mlx::core::atleast_2d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga9950299a80c2562f13448758f856d1f5',1,'mlx::core::atleast_2d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
-  ['atleast_5f3d_77',['atleast_3d',['../group__ops.html#ga4afd919601e67782ff964465919956a0',1,'mlx::core::atleast_3d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaffdf742ad79440a60dda40062a8074fe',1,'mlx::core::atleast_3d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
-  ['atomic_5fupdate_78',['atomic_update',['../struct_none.html#aecbce7c97e8b1d5dc4afd2e788c24e06',1,'None']]],
-  ['attach_5fevent_79',['attach_event',['../classmlx_1_1core_1_1array.html#a000c3cfe13cb378bf0523b62816190da',1,'mlx::core::array']]]
+  ['affine_5fquantize_13',['affine_quantize',['../quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59',1,'affine_quantize():&#160;quantized.h'],['../namespacemlx_1_1core_1_1fast.html#aa4b5f6886b2288cb6dfdd8598579f080',1,'mlx::core::fast::affine_quantize()']]],
+  ['affinequantize_14',['AffineQuantize',['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a84d5fa9e8c3de407fbcc5f38d2ed1473',1,'mlx::core::fast::AffineQuantize']]],
+  ['aligned_5falloc_15',['aligned_alloc',['../namespacepocketfft_1_1detail.html#ae397445c61400f47a8fe3f8e1b6d0b76',1,'pocketfft::detail']]],
+  ['aligned_5fallocator_16',['aligned_allocator',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a57c07047ac09c6cf48a269429de2b0fb',1,'pocketfft::detail::threading::aligned_allocator::aligned_allocator(const aligned_allocator&lt; U &gt; &amp;)'],['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a0c390851ec37c5cdc5c1e7c6232a0b94',1,'pocketfft::detail::threading::aligned_allocator::aligned_allocator()=default']]],
+  ['aligned_5fdealloc_17',['aligned_dealloc',['../namespacepocketfft_1_1detail.html#aec7820e36a33e0a8bb83aa03b04b81e8',1,'pocketfft::detail']]],
+  ['all_18',['all',['../group__ops.html#ga3b1b90ef1275ca17655b6d7f25d3ee68',1,'mlx::core::all(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga3689e12e8f42dadb4cbe2b07dc4099f4',1,'mlx::core::all(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gac0919c6ba53aea35a7683dea7e9a9a59',1,'mlx::core::all(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gae2d5fcc5b62d673cca76c08b7b4afbbc',1,'mlx::core::all(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['all_5fgather_19',['all_gather',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#aeb5a1726358213bc75756506f7b54d04',1,'mlx::core::distributed::detail::all_gather()'],['../namespacemlx_1_1core_1_1distributed.html#a82ef5e8cc7ac62cd228e51b1c1b77cb7',1,'mlx::core::distributed::all_gather()']]],
+  ['all_5freduce_20',['all_reduce',['../reduce__all_8h.html#a9086a585eda5a887160ee24baae0a7b8',1,'reduce_all.h']]],
+  ['all_5freduce_5fdispatch_21',['all_reduce_dispatch',['../namespacemlx_1_1core.html#a3ab0fd997d9a35782106ff083a72e098',1,'mlx::core']]],
+  ['all_5fsum_22',['all_sum',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#aa1d225b25f7b6426c48c5e35860ee960',1,'mlx::core::distributed::detail::all_sum()'],['../namespacemlx_1_1core_1_1distributed.html#a67ccb1a5445fc6f5db49dd36a15e5980',1,'mlx::core::distributed::all_sum()']]],
+  ['allclose_23',['allclose',['../group__ops.html#gaf0cd4257de7542daf9faf5e605e31020',1,'mlx::core']]],
+  ['allgather_24',['AllGather',['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#af4b10a5b61f160fb64353057c185b661',1,'mlx::core::distributed::AllGather']]],
+  ['alloc_5ftmp_25',['alloc_tmp',['../namespacepocketfft_1_1detail.html#a4db03cbcd9d43d9e0b0b9067713c80e9',1,'pocketfft::detail::alloc_tmp(const shape_t &amp;shape, size_t axsize, size_t elemsize)'],['../namespacepocketfft_1_1detail.html#a13832735696303b9559c4663631d5475',1,'pocketfft::detail::alloc_tmp(const shape_t &amp;shape, const shape_t &amp;axes, size_t elemsize)']]],
+  ['allocate_26',['allocate',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#a4f785747b898980756af9e5515363826',1,'pocketfft::detail::threading::aligned_allocator']]],
+  ['allocator_27',['Allocator',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a5803678a418fef687fc65fa9d5c37b65',1,'mlx::core::allocator::Allocator::Allocator()=default'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#aa05c081ce80dc036f9d3dd8c195259d2',1,'mlx::core::allocator::Allocator::Allocator(const Allocator &amp;other)=delete'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a8e8ce346a16cf0c62847bed9289f9959',1,'mlx::core::allocator::Allocator::Allocator(Allocator &amp;&amp;other)=delete']]],
+  ['allocator_28',['allocator',['../namespacemlx_1_1core_1_1allocator.html#aa23e2f20a336d0b159c097087194634e',1,'mlx::core::allocator::allocator()'],['../namespacemlx_1_1core_1_1metal.html#a74b3558bd518aecde6b14b0ba5e1a0d5',1,'mlx::core::metal::allocator()']]],
+  ['allreduce_29',['AllReduce',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a2d1ea56cbf72a316680ea90aa6da1c2d',1,'mlx::core::distributed::AllReduce']]],
+  ['any_30',['any',['../group__ops.html#ga8598dd718fb05cb28535e250372d4e6f',1,'mlx::core::any(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#gad37df97f253a963bece124198dbaf9ba',1,'mlx::core::any(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaf240618fc8b06debf5f56e97e84f18ef',1,'mlx::core::any(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gab1d56277d468a55227f4dad6bc2fc1ce',1,'mlx::core::any(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['apply_31',['apply',['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply()'],['../struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16',1,'TransformScale::apply()'],['../struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e',1,'MaxOp::apply()'],['../struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d',1,'SumOp::apply()'],['../struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756',1,'MulOp::apply()'],['../struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143',1,'SubOp::apply()'],['../struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334',1,'ExpSubOp::apply()'],['../struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221',1,'DivOp::apply()'],['../structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75',1,'mlx::steel::TransformNone::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90',1,'mlx::steel::TransformNone::apply(InT x, OutT)'],['../structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf',1,'mlx::steel::TransformAdd::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19',1,'mlx::steel::TransformAdd::apply(InT x, OutT c)'],['../structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87',1,'mlx::steel::TransformAxpby::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba',1,'mlx::steel::TransformAxpby::apply(InT x, OutT c) const'],['../struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218',1,'ScaleOp::apply()'],['../structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75',1,'mlx::steel::TransformNone::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90',1,'mlx::steel::TransformNone::apply(InT x, OutT)'],['../structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf',1,'mlx::steel::TransformAdd::apply(InT 
x)'],['../structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19',1,'mlx::steel::TransformAdd::apply(InT x, OutT c)'],['../structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87',1,'mlx::steel::TransformAxpby::apply(InT x)'],['../structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba',1,'mlx::steel::TransformAxpby::apply(InT x, OutT c) const']]],
+  ['apply_5fepilogue_32',['apply_epilogue',['../structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff',1,'mlx::steel::BlockMMA::apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae',1,'mlx::steel::BlockMMA::apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff',1,'mlx::steel::BlockMMA::apply_epilogue(thread const UnaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae',1,'mlx::steel::BlockMMA::apply_epilogue(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)']]],
+  ['apply_5fepilogue_5fsafe_33',['apply_epilogue_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a',1,'mlx::steel::BlockMMA::apply_epilogue_safe(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a',1,'mlx::steel::BlockMMA::apply_epilogue_safe(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)']]],
+  ['apply_5finplace_5fop_34',['apply_inplace_op',['../structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf',1,'mlx::steel::BlockLoader::apply_inplace_op()'],['../structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97',1,'mlx::steel::BlockLoaderT::apply_inplace_op()'],['../structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf',1,'mlx::steel::BlockLoader::apply_inplace_op()']]],
+  ['arange_35',['Arange',['../classmlx_1_1core_1_1_arange.html#a1a70c3b0b9c67d5a9446c141c5b7c574',1,'mlx::core::Arange']]],
+  ['arange_36',['arange',['../namespacemlx_1_1core.html#a369aa886219b83cf219e7a7862ce260b',1,'mlx::core::arange()'],['../namespacemlx_1_1core_1_1metal.html#a272c36f0faf2570cbb2f36030e9a3f26',1,'mlx::core::metal::arange()'],['../metal_2kernels_2arange_8h.html#a1e5126ee6ae0164c2343230c4d87c03e',1,'arange():&#160;arange.h'],['../group__ops.html#ga7ca088b8090b9f84f2e08345cf3f835a',1,'mlx::core::arange(double start, double stop, double step, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga4c36b841dc5cba391dad029be5a0ad98',1,'mlx::core::arange(double start, double stop, double step, StreamOrDevice s={})'],['../group__ops.html#ga8d7cf9eb15e2daf1469058907e8abc85',1,'mlx::core::arange(double start, double stop, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga74566a14e69ba6a25f5a35e7ade5c282',1,'mlx::core::arange(double start, double stop, StreamOrDevice s={})'],['../group__ops.html#ga345aa27af3dae3646b8b4b1068e89a3e',1,'mlx::core::arange(double stop, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#gaae179075d0fe23f4bd53fdf8c41f4c70',1,'mlx::core::arange(double stop, StreamOrDevice s={})'],['../group__ops.html#ga6b945f513077c2978afc1a952c884860',1,'mlx::core::arange(int start, int stop, int step, StreamOrDevice s={})'],['../group__ops.html#ga1c39fcc6eaa1c1867735c7f849d708d6',1,'mlx::core::arange(int start, int stop, StreamOrDevice s={})'],['../group__ops.html#gafe6e4580452c873cac294f16129e633f',1,'mlx::core::arange(int stop, StreamOrDevice s={})']]],
+  ['arccos_37',['ArcCos',['../classmlx_1_1core_1_1_arc_cos.html#a66f4ee841d17923d93241b71ea5103e9',1,'mlx::core::ArcCos']]],
+  ['arccos_38',['arccos',['../group__ops.html#ga08bec7cb10c84466487b507fc5bf9776',1,'mlx::core']]],
+  ['arccosh_39',['ArcCosh',['../classmlx_1_1core_1_1_arc_cosh.html#a34597054db467941a2a883c653ba4d71',1,'mlx::core::ArcCosh']]],
+  ['arccosh_40',['arccosh',['../group__ops.html#gaafafcfcebdf7248679c8543d0c0497e5',1,'mlx::core']]],
+  ['arcsin_41',['ArcSin',['../classmlx_1_1core_1_1_arc_sin.html#a97cb8c3d4d9d6abc627dec49a404f013',1,'mlx::core::ArcSin']]],
+  ['arcsin_42',['arcsin',['../group__ops.html#ga8770e8c8f23f13343911f4c9d6e1c619',1,'mlx::core']]],
+  ['arcsinh_43',['ArcSinh',['../classmlx_1_1core_1_1_arc_sinh.html#a30076b222788deeaaf9ad92d3c535f20',1,'mlx::core::ArcSinh']]],
+  ['arcsinh_44',['arcsinh',['../group__ops.html#gac62e2cedc49ef2c90dd8584000317450',1,'mlx::core']]],
+  ['arctan_45',['ArcTan',['../classmlx_1_1core_1_1_arc_tan.html#a3511153bbd421e89fd9294cdb3f79b44',1,'mlx::core::ArcTan']]],
+  ['arctan_46',['arctan',['../group__ops.html#gaa041f3f070e68f4946db07516b7d092e',1,'mlx::core']]],
+  ['arctan2_47',['ArcTan2',['../classmlx_1_1core_1_1_arc_tan2.html#aa1a4ebab9924b6bcc80df5b52ed0121a',1,'mlx::core::ArcTan2']]],
+  ['arctan2_48',['arctan2',['../group__ops.html#ga6caba9c92b5989123501f909cc7da354',1,'mlx::core']]],
+  ['arctanh_49',['ArcTanh',['../classmlx_1_1core_1_1_arc_tanh.html#a17857bd0e2a3ecf1f7bf8e1a3d354358',1,'mlx::core::ArcTanh']]],
+  ['arctanh_50',['arctanh',['../group__ops.html#gab46a35925a04c5a9d2ec7898ee55358e',1,'mlx::core']]],
+  ['argmax_51',['argmax',['../group__ops.html#gae60b0b5339b9c50b9970260faf613e83',1,'mlx::core::argmax(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#gae6f6c5a840320b336fdc9687e0ed56c8',1,'mlx::core::argmax(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga2efa67466510fc26ab9ea8dff30f2ba5',1,'mlx::core::argmax(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['argmin_52',['argmin',['../group__ops.html#ga7c3bd5ef430a71dfd298e626741e3c71',1,'mlx::core::argmin(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga6bc577c5ab10cd9c848ba81321595070',1,'mlx::core::argmin(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaf66dc3c77b88e4009e0678eda41eca81',1,'mlx::core::argmin(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['argpartition_53',['ArgPartition',['../classmlx_1_1core_1_1_arg_partition.html#ab54b13dbf92351ba1ac06fd3e5a802df',1,'mlx::core::ArgPartition']]],
+  ['argpartition_54',['argpartition',['../group__ops.html#gaf301c49c10fa9b95a9e8dc52ead1a8dd',1,'mlx::core::argpartition(const array &amp;a, int kth, StreamOrDevice s={})'],['../group__ops.html#ga7b15c654c7463def57857a0e239989a3',1,'mlx::core::argpartition(const array &amp;a, int kth, int axis, StreamOrDevice s={})']]],
+  ['argreduce_55',['ArgReduce',['../classmlx_1_1core_1_1_arg_reduce.html#aaccf8021dc24895656e25142eb65aa03',1,'mlx::core::ArgReduce']]],
+  ['argsort_56',['ArgSort',['../classmlx_1_1core_1_1_arg_sort.html#a38507a8445302a81cb44674c4a5fc0b0',1,'mlx::core::ArgSort']]],
+  ['argsort_57',['argsort',['../group__ops.html#ga8df3b2703bf671457422894dd870cdc5',1,'mlx::core::argsort(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga7878e0daa5a75f44e57b5fe948fa3ef6',1,'mlx::core::argsort(const array &amp;a, int axis, StreamOrDevice s={})']]],
+  ['argument_5fencoder_58',['argument_encoder',['../classmlx_1_1core_1_1metal_1_1_device.html#a6e33e2b1287324fb4a6575e0da5e5881',1,'mlx::core::metal::Device']]],
+  ['arr_59',['arr',['../classpocketfft_1_1detail_1_1arr.html#a961a24410638b35129cd6b81850d2a42',1,'pocketfft::detail::arr::arr()'],['../classpocketfft_1_1detail_1_1arr.html#a04f832b780a4453fdf3b69bf75b182bd',1,'pocketfft::detail::arr::arr(size_t n)'],['../classpocketfft_1_1detail_1_1arr.html#a0cd8fb4a588a74d428a7349d38b477d0',1,'pocketfft::detail::arr::arr(arr &amp;&amp;other)']]],
+  ['arr_5finfo_60',['arr_info',['../classpocketfft_1_1detail_1_1arr__info.html#a0dbddb7d86ca306159fc9ef9a453b21e',1,'pocketfft::detail::arr_info']]],
+  ['array_61',['array',['../classmlx_1_1core_1_1array.html#a75fac72da3ce214fa3737df92a64b232',1,'mlx::core::array::array(T val, Dtype dtype=TypeToDtype&lt; T &gt;())'],['../classmlx_1_1core_1_1array.html#a6db4b8c28c767cc16ad2785ece496dca',1,'mlx::core::array::array(const std::complex&lt; float &gt; &amp;val, Dtype dtype=complex64)'],['../classmlx_1_1core_1_1array.html#a3e506a53b9c7567448f7809dda680210',1,'mlx::core::array::array(It data, std::vector&lt; int &gt; shape, Dtype dtype=TypeToDtype&lt; typename std::iterator_traits&lt; It &gt;::value_type &gt;())'],['../classmlx_1_1core_1_1array.html#a87f170384f4fb93decf2b80ae7280f00',1,'mlx::core::array::array(std::initializer_list&lt; T &gt; data, Dtype dtype=TypeToDtype&lt; T &gt;())'],['../classmlx_1_1core_1_1array.html#a46642301da11e3eb4312c37349fbc9d7',1,'mlx::core::array::array(std::initializer_list&lt; float &gt; data)'],['../classmlx_1_1core_1_1array.html#a5e1812029394bfb1a706c83611286f49',1,'mlx::core::array::array(std::initializer_list&lt; int &gt; data, Dtype dtype)'],['../classmlx_1_1core_1_1array.html#a44e57a41819321e0d796e08cb9a06e4b',1,'mlx::core::array::array(std::initializer_list&lt; T &gt; data, std::vector&lt; int &gt; shape, Dtype dtype=TypeToDtype&lt; T &gt;())'],['../classmlx_1_1core_1_1array.html#a5b5f562ff14c150842cb61628e531663',1,'mlx::core::array::array(allocator::Buffer data, std::vector&lt; int &gt; shape, Dtype dtype, deleter_t deleter=allocator::free)'],['../classmlx_1_1core_1_1array.html#a297df274e2da5cb884257bbeffd6b187',1,'mlx::core::array::array(const array &amp;other)=default'],['../classmlx_1_1core_1_1array.html#ab6cbccbba66cc54acda4390b19f0397c',1,'mlx::core::array::array(array &amp;&amp;other)=default'],['../classmlx_1_1core_1_1array.html#adaade8f4bb7f8ecc0ba07efb17cd2620',1,'mlx::core::array::array(std::vector&lt; int &gt; shape, Dtype dtype, std::shared_ptr&lt; Primitive &gt; primitive, std::vector&lt; array &gt; inputs)']]],
+  ['array_5fequal_62',['array_equal',['../group__ops.html#ga8f3059336ee0c87207b1f8c6ab312645',1,'mlx::core::array_equal(const array &amp;a, const array &amp;b, bool equal_nan, StreamOrDevice s={})'],['../group__ops.html#gaf79cf0271ca0105d7b14295a90d0ed14',1,'mlx::core::array_equal(const array &amp;a, const array &amp;b, StreamOrDevice s={})']]],
+  ['arrayiterator_63',['ArrayIterator',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ad3afcb24c6db7642bbc06835f7f3e27a',1,'mlx::core::array::ArrayIterator']]],
+  ['as_5fstrided_64',['as_strided',['../group__ops.html#ga8de80ecef30fc560003d40f61a38b99d',1,'mlx::core']]],
+  ['asin_65',['asin',['../namespacemetal.html#a16e843194df3fd136404bf80ba5ac95c',1,'metal::asin()'],['../namespacemetal_1_1fast.html#a769455a283da99654b6e42c3acf13eb1',1,'metal::fast::asin()'],['../namespacemetal_1_1precise.html#adc7b8b6e12e320cb32030f728dcbf438',1,'metal::precise::asin()']]],
+  ['asinh_66',['asinh',['../namespacemetal.html#abcc3251866930cfe880f89e7473d0e63',1,'metal::asinh()'],['../namespacemetal_1_1fast.html#a4367034b7b3e14310803bb2be975a556',1,'metal::fast::asinh()'],['../namespacemetal_1_1precise.html#aaad1cdde6687c8011fbc5fda1bb13424',1,'metal::precise::asinh()']]],
+  ['asstrided_67',['AsStrided',['../classmlx_1_1core_1_1_as_strided.html#a80c0547f72ed53374eafc57d57b5d4af',1,'mlx::core::AsStrided']]],
+  ['astype_68',['AsType',['../classmlx_1_1core_1_1_as_type.html#a8c3241d402a8977bb4db037e225f5b47',1,'mlx::core::AsType']]],
+  ['astype_69',['astype',['../group__ops.html#ga0e58c24fc5668e5a521e5b45e8370a62',1,'mlx::core']]],
+  ['async_5feval_70',['async_eval',['../namespacemlx_1_1core.html#a15dda19aa7fa1fc5fca35df5cf963297',1,'mlx::core']]],
+  ['atan_71',['atan',['../namespacemetal.html#a80a771553d9a0012b93620d19c48b00f',1,'metal::atan()'],['../namespacemetal_1_1fast.html#a769503b4b7f89071d0983258c5a3ac5a',1,'metal::fast::atan()'],['../namespacemetal_1_1precise.html#aaaf4b5f4786a912089bbf0ae7619a6be',1,'metal::precise::atan()']]],
+  ['atan2_72',['atan2',['../namespacemetal.html#a1d430793eaa38ccf0d07145e3fcd1e61',1,'metal::atan2()'],['../namespacemetal_1_1fast.html#a00e687ea46f5affe26e6aef8fd62b89a',1,'metal::fast::atan2()'],['../namespacemetal_1_1precise.html#a6f161b049cc6884f87b09b33c2d1cd7f',1,'metal::precise::atan2()']]],
+  ['atanh_73',['atanh',['../namespacemetal.html#a57116427997ba71dd3863bfb15de33bf',1,'metal::atanh()'],['../namespacemetal_1_1fast.html#af24608fc605db9a14427d37c36dc1c53',1,'metal::fast::atanh()'],['../namespacemetal_1_1precise.html#a902994837653b90c47f4285673e712c4',1,'metal::precise::atanh()']]],
+  ['atleast_5f1d_74',['atleast_1d',['../group__ops.html#gaba4d25e7a2bf87ba4feb7837ec7fa396',1,'mlx::core::atleast_1d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga08ca172ce80157c916c89dd0b45b95c5',1,'mlx::core::atleast_1d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
+  ['atleast_5f2d_75',['atleast_2d',['../group__ops.html#gaeeb7f5bb88aa32a3ac2be1f39c5f8087',1,'mlx::core::atleast_2d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga9950299a80c2562f13448758f856d1f5',1,'mlx::core::atleast_2d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
+  ['atleast_5f3d_76',['atleast_3d',['../group__ops.html#ga4afd919601e67782ff964465919956a0',1,'mlx::core::atleast_3d(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaffdf742ad79440a60dda40062a8074fe',1,'mlx::core::atleast_3d(const std::vector&lt; array &gt; &amp;a, StreamOrDevice s={})']]],
+  ['atomic_5fupdate_77',['atomic_update',['../struct_none.html#aecbce7c97e8b1d5dc4afd2e788c24e06',1,'None']]],
+  ['attach_5fevent_78',['attach_event',['../classmlx_1_1core_1_1array.html#a000c3cfe13cb378bf0523b62816190da',1,'mlx::core::array']]],
+  ['attention_79',['attention',['../steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33',1,'steel_attention.h']]]
 ];
diff --git a/docs/build/html/search/functions_10.js b/docs/build/html/search/functions_10.js
index 33c9ec6d7..2bbadabe6 100644
--- a/docs/build/html/search/functions_10.js
+++ b/docs/build/html/search/functions_10.js
@@ -25,7 +25,7 @@ var searchData=
   ['primitive_22',['primitive',['../classmlx_1_1core_1_1array.html#a790548666511d8c6d9f92ee79d2ce14c',1,'mlx::core::array']]],
   ['primitive_5fid_23',['primitive_id',['../classmlx_1_1core_1_1array.html#af5ad83605d4eea81561246873bee1d7c',1,'mlx::core::array']]],
   ['primitive_5fptr_24',['primitive_ptr',['../classmlx_1_1core_1_1array.html#a5119cd616ec3c05d65878944b8889469',1,'mlx::core::array']]],
-  ['print_25',['print',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab',1,'mlx::core::distributed::AllReduce::print()'],['../classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb',1,'mlx::core::Primitive::print()'],['../classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107',1,'mlx::core::Abs::print()'],['../classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d',1,'mlx::core::Add::print()'],['../classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9',1,'mlx::core::AddMM::print()'],['../classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e',1,'mlx::core::Arange::print()'],['../classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739',1,'mlx::core::ArcCos::print()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630',1,'mlx::core::ArcCosh::print()'],['../classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87',1,'mlx::core::ArcSin::print()'],['../classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430',1,'mlx::core::ArcSinh::print()'],['../classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05',1,'mlx::core::ArcTan::print()'],['../classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361',1,'mlx::core::ArcTan2::print()'],['../classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523',1,'mlx::core::ArcTanh::print()'],['../classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63',1,'mlx::core::ArgPartition::print()'],['../classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f',1,'mlx::core::ArgReduce::print()'],['../classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd',1,'mlx::core::ArgSort::print()'],['../classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c',1,'mlx::core::AsType::print()'],['../classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf',1,'mlx::core::AsStrided::print()'],['../classm
lx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d',1,'mlx::core::BitwiseBinary::print()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159',1,'mlx::core::BlockMaskedMM::print()'],['../classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758',1,'mlx::core::GatherMM::print()'],['../classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11',1,'mlx::core::Broadcast::print()'],['../classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee',1,'mlx::core::Ceil::print()'],['../classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b',1,'mlx::core::Compiled::print()'],['../classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33',1,'mlx::core::Concatenate::print()'],['../classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4',1,'mlx::core::Conjugate::print()'],['../classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd',1,'mlx::core::Convolution::print()'],['../classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008',1,'mlx::core::Copy::print()'],['../classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696',1,'mlx::core::Cos::print()'],['../classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2',1,'mlx::core::Cosh::print()'],['../classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298',1,'mlx::core::CustomTransforms::print()'],['../classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82',1,'mlx::core::Depends::print()'],['../classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6',1,'mlx::core::Divide::print()'],['../classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1',1,'mlx::core::DivMod::print()'],['../classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7',1,'mlx::core::Select::print()'],['../classmlx_1_1core_1_1_remainder.html#aeaecac5ea8e606d7ecd393d8019029e4',1,'mlx::core::Remainder::print()'],['../classmlx_1_1core_1_1_equal.htm
l#a0787bf32f0b405a8b2ac809d2d990774',1,'mlx::core::Equal::print()'],['../classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c',1,'mlx::core::Erf::print()'],['../classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9',1,'mlx::core::ErfInv::print()'],['../classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a',1,'mlx::core::Exp::print()'],['../classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1',1,'mlx::core::Expm1::print()'],['../classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf',1,'mlx::core::FFT::print()'],['../classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6',1,'mlx::core::Floor::print()'],['../classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013',1,'mlx::core::Full::print()'],['../classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91',1,'mlx::core::Gather::print()'],['../classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04',1,'mlx::core::Greater::print()'],['../classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef',1,'mlx::core::GreaterEqual::print()'],['../classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6',1,'mlx::core::Hadamard::print()'],['../classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d',1,'mlx::core::Imag::print()'],['../classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78',1,'mlx::core::Less::print()'],['../classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950',1,'mlx::core::LessEqual::print()'],['../classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa',1,'mlx::core::Load::print()'],['../classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d',1,'mlx::core::Log::print()'],['../classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4',1,'mlx::core::Log1p::print()'],['../classmlx_1_1core_1_1_logical_not.html#a001ff3eca46440f0d8a287e0b98a8a2c',1,'mlx::core::LogicalNot::print()'],['../classmlx_1_1core_1_1_logical_and.html#a9a5220
eb56e1fd94fd879394ee5ad397',1,'mlx::core::LogicalAnd::print()'],['../classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003',1,'mlx::core::LogicalOr::print()'],['../classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9',1,'mlx::core::LogAddExp::print()'],['../classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd',1,'mlx::core::Matmul::print()'],['../classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca',1,'mlx::core::Maximum::print()'],['../classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512',1,'mlx::core::Minimum::print()'],['../classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909',1,'mlx::core::Multiply::print()'],['../classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91',1,'mlx::core::Negative::print()'],['../classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09',1,'mlx::core::NotEqual::print()'],['../classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52',1,'mlx::core::NumberOfElements::print()'],['../classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a',1,'mlx::core::Pad::print()'],['../classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0',1,'mlx::core::Partition::print()'],['../classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60',1,'mlx::core::Power::print()'],['../classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db',1,'mlx::core::QuantizedMatmul::print()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0',1,'mlx::core::GatherQMM::print()'],['../classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271',1,'mlx::core::RandomBits::print()'],['../classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b',1,'mlx::core::Real::print()'],['../classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb25ecff565b21862',1,'mlx::core::Reshape::print()'],['../classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a6877068
81f03cd',1,'mlx::core::Reduce::print()'],['../classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72',1,'mlx::core::Round::print()'],['../classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22',1,'mlx::core::Scan::print()'],['../classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa',1,'mlx::core::Scatter::print()'],['../classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2',1,'mlx::core::Sigmoid::print()'],['../classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a',1,'mlx::core::Sign::print()'],['../classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4',1,'mlx::core::Sin::print()'],['../classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77',1,'mlx::core::Sinh::print()'],['../classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504',1,'mlx::core::Slice::print()'],['../classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b',1,'mlx::core::SliceUpdate::print()'],['../classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83',1,'mlx::core::Softmax::print()'],['../classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2',1,'mlx::core::Sort::print()'],['../classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2',1,'mlx::core::Split::print()'],['../classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384',1,'mlx::core::Square::print()'],['../classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f',1,'mlx::core::Sqrt::print()'],['../classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50',1,'mlx::core::StopGradient::print()'],['../classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b',1,'mlx::core::Subtract::print()'],['../classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f',1,'mlx::core::Tan::print()'],['../classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773d78e',1,'mlx::core::Tanh::print()'],['../classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d',1,'
mlx::core::Uniform::print()'],['../classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c',1,'mlx::core::View::print()'],['../classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04',1,'mlx::core::Transpose::print()'],['../classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b',1,'mlx::core::QRF::print()'],['../classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53',1,'mlx::core::SVD::print()'],['../classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9',1,'mlx::core::Inverse::print()'],['../classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84',1,'mlx::core::Cholesky::print()'],['../classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84',1,'mlx::core::Eigh::print()'],['../structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bool val)'],['../structmlx_1_1core_1_1_print_formatter.html#a8da448a8adae671b26359341ea514316',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9d750c134a6fbfa8251c5b1d01d73287',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#adbbb9cbff767f9db73c659a0c07ba633',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a520adb07fafd911b22bc24b295e4f6cf',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ab0c702f1ae201e17cd328c9855cf522e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int64_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac59a5137ddd8b32aae057bb9826ee80d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint64_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac4b7895d1168cfc1a3d1186d8a414d2f',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, 
float16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ae21005f92bc641f2d657096f5d176a6d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bfloat16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a57af5c32561b95d6ac2a3a1dc4f5d43e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, float val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9e1dc67c9afb0a09966336504790823d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, complex64_t val)']]],
+  ['print_25',['print',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a6814f9008a683c6911d5b8991ef770ab',1,'mlx::core::distributed::AllReduce::print()'],['../classmlx_1_1core_1_1_primitive.html#ae1aff91354ce036596088a3e19474ecb',1,'mlx::core::Primitive::print()'],['../classmlx_1_1core_1_1_abs.html#a643d6db5116eed978e3208804a992107',1,'mlx::core::Abs::print()'],['../classmlx_1_1core_1_1_add.html#a8a96345aa63724f22b68bca7b861211d',1,'mlx::core::Add::print()'],['../classmlx_1_1core_1_1_add_m_m.html#a1262ac2c4c6e9ff6b6047bf7605e5cc9',1,'mlx::core::AddMM::print()'],['../classmlx_1_1core_1_1_arange.html#abd73d2b793da796dc7cf04c9f7d5c19e',1,'mlx::core::Arange::print()'],['../classmlx_1_1core_1_1_arc_cos.html#aa48d8bec4efbac569d809cf11648b739',1,'mlx::core::ArcCos::print()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6a9a2ab0cc360d7e2f9676db17f8e630',1,'mlx::core::ArcCosh::print()'],['../classmlx_1_1core_1_1_arc_sin.html#a895a35c9dd22fdb06e7b971bfd6fde87',1,'mlx::core::ArcSin::print()'],['../classmlx_1_1core_1_1_arc_sinh.html#aa8b2934a8a0b2eedec8257bbb5726430',1,'mlx::core::ArcSinh::print()'],['../classmlx_1_1core_1_1_arc_tan.html#ab0309e4feca36f221b3d672dc92cac05',1,'mlx::core::ArcTan::print()'],['../classmlx_1_1core_1_1_arc_tan2.html#abdfef9f572d06df1251c28222756a361',1,'mlx::core::ArcTan2::print()'],['../classmlx_1_1core_1_1_arc_tanh.html#aa9549311240d7ba225b84e1df9ad8523',1,'mlx::core::ArcTanh::print()'],['../classmlx_1_1core_1_1_arg_partition.html#aa8678d94fa1571ea71a7bf790cdb8d63',1,'mlx::core::ArgPartition::print()'],['../classmlx_1_1core_1_1_arg_reduce.html#a153a6d8dba7301c4fcd0e429154ead8f',1,'mlx::core::ArgReduce::print()'],['../classmlx_1_1core_1_1_arg_sort.html#a0b59ce43e0982d634a01631728b419bd',1,'mlx::core::ArgSort::print()'],['../classmlx_1_1core_1_1_as_type.html#aa617e29147c14bd5d1fa8ad0bf65af0c',1,'mlx::core::AsType::print()'],['../classmlx_1_1core_1_1_as_strided.html#af2e21b77ea9e6c70bca45224967745bf',1,'mlx::core::AsStrided::print()'],['../classm
lx_1_1core_1_1_bitwise_binary.html#a69b28e239da7fdb89f0a9f9467dd797d',1,'mlx::core::BitwiseBinary::print()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a37ecf6fa296d28efb7651a3c510fe159',1,'mlx::core::BlockMaskedMM::print()'],['../classmlx_1_1core_1_1_gather_m_m.html#ae7a6f4eecb15e95b21e6c87068ebd758',1,'mlx::core::GatherMM::print()'],['../classmlx_1_1core_1_1_broadcast.html#a6a610412861c6e472f930b6721b99a11',1,'mlx::core::Broadcast::print()'],['../classmlx_1_1core_1_1_ceil.html#a14a0048dd6496341cacaddada68276ee',1,'mlx::core::Ceil::print()'],['../classmlx_1_1core_1_1_compiled.html#a271521f92eef49c39799f38e26b64a9b',1,'mlx::core::Compiled::print()'],['../classmlx_1_1core_1_1_concatenate.html#a56f29b585a6d1d958954a68dcc893f33',1,'mlx::core::Concatenate::print()'],['../classmlx_1_1core_1_1_conjugate.html#a40281539bbd543ac8fd8e28650de17e4',1,'mlx::core::Conjugate::print()'],['../classmlx_1_1core_1_1_contiguous.html#aca8a4ba9a58cc10f063e6b082fa2fc23',1,'mlx::core::Contiguous::print()'],['../classmlx_1_1core_1_1_convolution.html#a844eab7c4cc99e775cfb561265ed14fd',1,'mlx::core::Convolution::print()'],['../classmlx_1_1core_1_1_copy.html#acfa1a02ab9cdab593e928faa515a8008',1,'mlx::core::Copy::print()'],['../classmlx_1_1core_1_1_cos.html#a81858457e4bea931a4bc6f6e38b0f696',1,'mlx::core::Cos::print()'],['../classmlx_1_1core_1_1_cosh.html#ac247faad68c1050cda9f72d7d6d040e2',1,'mlx::core::Cosh::print()'],['../classmlx_1_1core_1_1_custom_transforms.html#a2ddbacbc468271b11caee0ad97005298',1,'mlx::core::CustomTransforms::print()'],['../classmlx_1_1core_1_1_depends.html#aed575b0d927f4341f60442c70adeeb82',1,'mlx::core::Depends::print()'],['../classmlx_1_1core_1_1_divide.html#af3c15337ac15522cc34ed98b97895bb6',1,'mlx::core::Divide::print()'],['../classmlx_1_1core_1_1_div_mod.html#a7edbed50d07869d921e529157931b7a1',1,'mlx::core::DivMod::print()'],['../classmlx_1_1core_1_1_select.html#a678285f2c0b9dae85692399c3aa692a7',1,'mlx::core::Select::print()'],['../classmlx_1_1core_1_1_remaind
er.html#aeaecac5ea8e606d7ecd393d8019029e4',1,'mlx::core::Remainder::print()'],['../classmlx_1_1core_1_1_equal.html#a0787bf32f0b405a8b2ac809d2d990774',1,'mlx::core::Equal::print()'],['../classmlx_1_1core_1_1_erf.html#a186af7b783cf832c3b25eec3a09f5a0c',1,'mlx::core::Erf::print()'],['../classmlx_1_1core_1_1_erf_inv.html#a0acb31bd5780abf61877bd1a3e0fd4f9',1,'mlx::core::ErfInv::print()'],['../classmlx_1_1core_1_1_exp.html#ad87cc1b2ae595a613b03b0fdca63ae6a',1,'mlx::core::Exp::print()'],['../classmlx_1_1core_1_1_expm1.html#af1a99266fc50aa5948cdd298e2916ef1',1,'mlx::core::Expm1::print()'],['../classmlx_1_1core_1_1_f_f_t.html#a15a2a5f7647f5fb78611a251d3270edf',1,'mlx::core::FFT::print()'],['../classmlx_1_1core_1_1_floor.html#ac289e87c5fac15e2f491e2513be610f6',1,'mlx::core::Floor::print()'],['../classmlx_1_1core_1_1_full.html#a68e08303f4960ab373b84a3312edc013',1,'mlx::core::Full::print()'],['../classmlx_1_1core_1_1_gather.html#a9d57637a8a65008683c3847251bdcf91',1,'mlx::core::Gather::print()'],['../classmlx_1_1core_1_1_greater.html#aa2980e45cd2c79ebfb394012d3108a04',1,'mlx::core::Greater::print()'],['../classmlx_1_1core_1_1_greater_equal.html#ab98045c861d2d2ffb0398c2c1d671cef',1,'mlx::core::GreaterEqual::print()'],['../classmlx_1_1core_1_1_hadamard.html#a3df6e7e3b3b71bf50be5f1a05d0870b6',1,'mlx::core::Hadamard::print()'],['../classmlx_1_1core_1_1_imag.html#a0c8d48e2a1474d80a314ea9b96dbaa8d',1,'mlx::core::Imag::print()'],['../classmlx_1_1core_1_1_less.html#ad67e6f66d7b75546fd98dbee6b631d78',1,'mlx::core::Less::print()'],['../classmlx_1_1core_1_1_less_equal.html#a409842d3862113c53cbbdf7467a06950',1,'mlx::core::LessEqual::print()'],['../classmlx_1_1core_1_1_load.html#a54e08a0ca41b7c9f1a76b00c889f0bfa',1,'mlx::core::Load::print()'],['../classmlx_1_1core_1_1_log.html#a7b946d98d4a228c6be9f606a3bd8a30d',1,'mlx::core::Log::print()'],['../classmlx_1_1core_1_1_log1p.html#a8a1569dde30440ce11ea466ccc69d2d4',1,'mlx::core::Log1p::print()'],['../classmlx_1_1core_1_1_logical_not.html#a001ff3e
ca46440f0d8a287e0b98a8a2c',1,'mlx::core::LogicalNot::print()'],['../classmlx_1_1core_1_1_logical_and.html#a9a5220eb56e1fd94fd879394ee5ad397',1,'mlx::core::LogicalAnd::print()'],['../classmlx_1_1core_1_1_logical_or.html#a6becc5fbfadde850de9857099dcd5003',1,'mlx::core::LogicalOr::print()'],['../classmlx_1_1core_1_1_log_add_exp.html#a702a2eff0bd1ae7b6fb829dd0b0b11b9',1,'mlx::core::LogAddExp::print()'],['../classmlx_1_1core_1_1_matmul.html#abb4a16a265a05d56a2f5d2e89d6f9dfd',1,'mlx::core::Matmul::print()'],['../classmlx_1_1core_1_1_maximum.html#a3b708a1d6b526719c62850294776f8ca',1,'mlx::core::Maximum::print()'],['../classmlx_1_1core_1_1_minimum.html#a137677bf32c626a768b732a7b8575512',1,'mlx::core::Minimum::print()'],['../classmlx_1_1core_1_1_multiply.html#aa4f1f7af68346ce80c2636df415c9909',1,'mlx::core::Multiply::print()'],['../classmlx_1_1core_1_1_negative.html#a0d5c30e267ff6468d64f1987f9f83f91',1,'mlx::core::Negative::print()'],['../classmlx_1_1core_1_1_not_equal.html#a12aa2f764880d29e627540610b63af09',1,'mlx::core::NotEqual::print()'],['../classmlx_1_1core_1_1_number_of_elements.html#aecde30826970938f3aa688979a668f52',1,'mlx::core::NumberOfElements::print()'],['../classmlx_1_1core_1_1_pad.html#af87754daaf51f6a6cf8bd4949ca1e70a',1,'mlx::core::Pad::print()'],['../classmlx_1_1core_1_1_partition.html#ab5c7aa4fed325475b33d4004649f0dc0',1,'mlx::core::Partition::print()'],['../classmlx_1_1core_1_1_power.html#a33e2d7ff078426fe66ea2370ceb5af60',1,'mlx::core::Power::print()'],['../classmlx_1_1core_1_1_quantized_matmul.html#aaef8c96d4d40b4fa08ced540d341a4db',1,'mlx::core::QuantizedMatmul::print()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a53c3fa7beb51ce2e1c2da28633406fe0',1,'mlx::core::GatherQMM::print()'],['../classmlx_1_1core_1_1_random_bits.html#a8a5593c34fd868d94b36a8ced1390271',1,'mlx::core::RandomBits::print()'],['../classmlx_1_1core_1_1_real.html#a740a0dfb54c2a4467a0a59f11fe69e1b',1,'mlx::core::Real::print()'],['../classmlx_1_1core_1_1_reshape.html#a0f2323d5d67ece0eb2
5ecff565b21862',1,'mlx::core::Reshape::print()'],['../classmlx_1_1core_1_1_reduce.html#a399be3a89553787a0a687706881f03cd',1,'mlx::core::Reduce::print()'],['../classmlx_1_1core_1_1_round.html#af0dfe8943109c936b35ab0082f566f72',1,'mlx::core::Round::print()'],['../classmlx_1_1core_1_1_scan.html#ad5b6308c79e9b985a49df35eadd15b22',1,'mlx::core::Scan::print()'],['../classmlx_1_1core_1_1_scatter.html#aa9d45cbfb27b814517f6016092b30efa',1,'mlx::core::Scatter::print()'],['../classmlx_1_1core_1_1_sigmoid.html#ad4cd19938e5159754aa7516f405580c2',1,'mlx::core::Sigmoid::print()'],['../classmlx_1_1core_1_1_sign.html#a2aa0720fe0a6d2408eb43c25d3d45b0a',1,'mlx::core::Sign::print()'],['../classmlx_1_1core_1_1_sin.html#a73b31005551015897f15c00e8b0222e4',1,'mlx::core::Sin::print()'],['../classmlx_1_1core_1_1_sinh.html#a5b4753d52e80799d4fea0b9172d25a77',1,'mlx::core::Sinh::print()'],['../classmlx_1_1core_1_1_slice.html#a50851148948d924b71817cfbd4401504',1,'mlx::core::Slice::print()'],['../classmlx_1_1core_1_1_slice_update.html#a751eefb9922c56479b4b0de2ad45439b',1,'mlx::core::SliceUpdate::print()'],['../classmlx_1_1core_1_1_softmax.html#aa783610ef6b82b92681e78fc99412d83',1,'mlx::core::Softmax::print()'],['../classmlx_1_1core_1_1_sort.html#ada81b9343f80958174eba708452927a2',1,'mlx::core::Sort::print()'],['../classmlx_1_1core_1_1_split.html#ad0c31fe5972643cc75fde10445fc47f2',1,'mlx::core::Split::print()'],['../classmlx_1_1core_1_1_square.html#a75feb558cd1d615e96309dd7d1e81384',1,'mlx::core::Square::print()'],['../classmlx_1_1core_1_1_sqrt.html#a8681c8de2f50049848d320c47f713c0f',1,'mlx::core::Sqrt::print()'],['../classmlx_1_1core_1_1_stop_gradient.html#acc7a7d51cbf014dae8ba3d20bedcad50',1,'mlx::core::StopGradient::print()'],['../classmlx_1_1core_1_1_subtract.html#a3834fd305435fb5a8e512566832e372b',1,'mlx::core::Subtract::print()'],['../classmlx_1_1core_1_1_tan.html#aeea7c284d595a2a928d5f28a55e9be7f',1,'mlx::core::Tan::print()'],['../classmlx_1_1core_1_1_tanh.html#a73f4976d641daf697cc1a231d773
d78e',1,'mlx::core::Tanh::print()'],['../classmlx_1_1core_1_1_uniform.html#a01510998719b19df137451cc37850b8d',1,'mlx::core::Uniform::print()'],['../classmlx_1_1core_1_1_view.html#a513b034919a8a494add3155f910a360c',1,'mlx::core::View::print()'],['../classmlx_1_1core_1_1_transpose.html#ac6c87b850f4e5560aa13a5e1e9f9fe04',1,'mlx::core::Transpose::print()'],['../classmlx_1_1core_1_1_q_r_f.html#aba3526722b3a52b41fa8103b909f7f3b',1,'mlx::core::QRF::print()'],['../classmlx_1_1core_1_1_s_v_d.html#ab87a4e7ef857936bea66ba9e24662f53',1,'mlx::core::SVD::print()'],['../classmlx_1_1core_1_1_inverse.html#a543f18f1ce5c06c897141091e95a66e9',1,'mlx::core::Inverse::print()'],['../classmlx_1_1core_1_1_cholesky.html#a0a8b51ff7f5369d22bdc58910d4aaf84',1,'mlx::core::Cholesky::print()'],['../classmlx_1_1core_1_1_eigh.html#a2b8e47ecd60cd7330716761c5fb1fe84',1,'mlx::core::Eigh::print()'],['../structmlx_1_1core_1_1_print_formatter.html#a79fad4cf5844db8c92b066539146281b',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bool val)'],['../structmlx_1_1core_1_1_print_formatter.html#a8da448a8adae671b26359341ea514316',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9d750c134a6fbfa8251c5b1d01d73287',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#adbbb9cbff767f9db73c659a0c07ba633',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a520adb07fafd911b22bc24b295e4f6cf',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint32_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ab0c702f1ae201e17cd328c9855cf522e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, int64_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac59a5137ddd8b32aae057bb9826ee80d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, uint64_t 
val)'],['../structmlx_1_1core_1_1_print_formatter.html#ac4b7895d1168cfc1a3d1186d8a414d2f',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, float16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#ae21005f92bc641f2d657096f5d176a6d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, bfloat16_t val)'],['../structmlx_1_1core_1_1_print_formatter.html#a57af5c32561b95d6ac2a3a1dc4f5d43e',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, float val)'],['../structmlx_1_1core_1_1_print_formatter.html#a9e1dc67c9afb0a09966336504790823d',1,'mlx::core::PrintFormatter::print(std::ostream &amp;os, complex64_t val)']]],
   ['print_5fcomplex_5fconstant_26',['print_complex_constant',['../namespacemlx_1_1core.html#a2b78f270942c6eb185e8045f1c5b4286',1,'mlx::core']]],
   ['print_5fconstant_27',['print_constant',['../namespacemlx_1_1core.html#a7d11b000895d44d183260634f4192d92',1,'mlx::core']]],
   ['print_5ffloat_5fconstant_28',['print_float_constant',['../namespacemlx_1_1core.html#a93a8ac59c644b801ec8881a58368caf2',1,'mlx::core']]],
diff --git a/docs/build/html/search/functions_11.js b/docs/build/html/search/functions_11.js
index f07193647..592d7f95a 100644
--- a/docs/build/html/search/functions_11.js
+++ b/docs/build/html/search/functions_11.js
@@ -18,7 +18,7 @@ var searchData=
   ['quantize_15',['quantize',['../group__ops.html#gab43cc28690da7cdd43b43065adbd31da',1,'mlx::core']]],
   ['quantized_16',['quantized',['../namespacemlx_1_1core_1_1metal.html#a949f029424218ab5c5588563d2e076f5',1,'mlx::core::metal']]],
   ['quantized_5fmatmul_17',['quantized_matmul',['../group__ops.html#gabfa4208fb1f9b1cdd0abc563b19175af',1,'mlx::core']]],
-  ['quantizedblockloader_18',['QuantizedBlockLoader',['../struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93',1,'QuantizedBlockLoader']]],
+  ['quantizedblockloader_18',['QuantizedBlockLoader',['../struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589',1,'QuantizedBlockLoader']]],
   ['quantizedmatmul_19',['QuantizedMatmul',['../classmlx_1_1core_1_1_quantized_matmul.html#a5bd164d038d9dc21919f7e0bfdeaa25c',1,'mlx::core::QuantizedMatmul']]],
   ['quiet_5fnan_20',['quiet_NaN',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#aebeb07c01984be246bc2d1b8f8e4ac7b',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
   ['qvm_21',['qvm',['../quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5',1,'quantized.h']]],
diff --git a/docs/build/html/search/functions_12.js b/docs/build/html/search/functions_12.js
index a859e8b37..99c84c46a 100644
--- a/docs/build/html/search/functions_12.js
+++ b/docs/build/html/search/functions_12.js
@@ -73,10 +73,12 @@ var searchData=
   ['round_70',['Round',['../classmlx_1_1core_1_1_round.html#a1327a359b2aed91f576145a0e70d1dde',1,'mlx::core::Round']]],
   ['round_71',['round',['../namespacemetal.html#a46c667e169ff9d51a9204a045305442f',1,'metal::round()'],['../namespacemetal_1_1fast.html#a4cb687257a004726d49e496417eaa40f',1,'metal::fast::round()'],['../namespacemetal_1_1precise.html#a5295ab08055d12534cc3775da855ac12',1,'metal::precise::round()'],['../group__ops.html#ga2d74d43f007a069384e89d8416525331',1,'mlx::core::round(const array &amp;a, int decimals, StreamOrDevice s={})'],['../group__ops.html#gaf18fb7e98bf8cf3b7fbc5e64c988a95b',1,'mlx::core::round(const array &amp;a, StreamOrDevice s={})']]],
   ['round_5ferror_72',['round_error',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#afa223448fa4f04c1113a85345dd720c3',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['row_5freduce_5fgeneral_5fdispatch_73',['row_reduce_general_dispatch',['../namespacemlx_1_1core.html#ab1eeca8ec6fa31819ee108fa6ed2c41b',1,'mlx::core']]],
-  ['row_5freduce_5flooped_74',['row_reduce_looped',['../reduce__row_8h.html#ad98332d74a6824aa7499df3e2f2246ae',1,'reduce_row.h']]],
-  ['row_5freduce_5fsimple_75',['row_reduce_simple',['../reduce__row_8h.html#ac01d30987668930c8b38900e47b8308b',1,'reduce_row.h']]],
-  ['row_5freduce_5fsmall_76',['row_reduce_small',['../reduce__row_8h.html#a27e75312086e31f6bd1bbf4b366679da',1,'reduce_row.h']]],
-  ['rsqrt_77',['rsqrt',['../namespacemetal.html#a1cf4b605c0aa7ff5bfe5e979a16f5157',1,'metal::rsqrt()'],['../namespacemetal_1_1fast.html#aa62097c750f1e4b69d09277f19976ab1',1,'metal::fast::rsqrt()'],['../namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac',1,'metal::precise::rsqrt()'],['../group__ops.html#ga102f23aa0b0c3d3296a321c694617aa1',1,'mlx::core::rsqrt()']]],
-  ['run_78',['run',['../struct_g_e_m_v_kernel.html#ac4a7b5011a0ea938ab1949bb1767fc1a',1,'GEMVKernel::run()'],['../struct_g_e_m_v_t_kernel.html#a5d68656832de892f33db939005713927',1,'GEMVTKernel::run()'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5',1,'mlx::steel::GEMMKernel::run()']]]
+  ['row_5fbin_5fop_73',['row_bin_op',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::row_bin_op()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2',1,'mlx::steel::MMATile::row_bin_op()']]],
+  ['row_5freduce_74',['row_reduce',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::row_reduce()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88',1,'mlx::steel::MMATile::row_reduce()']]],
+  ['row_5freduce_5fgeneral_5fdispatch_75',['row_reduce_general_dispatch',['../namespacemlx_1_1core.html#ab1eeca8ec6fa31819ee108fa6ed2c41b',1,'mlx::core']]],
+  ['row_5freduce_5flooped_76',['row_reduce_looped',['../reduce__row_8h.html#afba85f5a1c935c124ef52e986d4b2c49',1,'reduce_row.h']]],
+  ['row_5freduce_5fsimple_77',['row_reduce_simple',['../reduce__row_8h.html#aef628dfccdb1361da5546f8b17c510bf',1,'reduce_row.h']]],
+  ['row_5freduce_5fsmall_78',['row_reduce_small',['../reduce__row_8h.html#aeb49e89f1163cb3093770bb710df9f5e',1,'reduce_row.h']]],
+  ['rsqrt_79',['rsqrt',['../namespacemetal.html#a1cf4b605c0aa7ff5bfe5e979a16f5157',1,'metal::rsqrt()'],['../namespacemetal_1_1fast.html#aa62097c750f1e4b69d09277f19976ab1',1,'metal::fast::rsqrt()'],['../namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac',1,'metal::precise::rsqrt()'],['../group__ops.html#ga102f23aa0b0c3d3296a321c694617aa1',1,'mlx::core::rsqrt()']]],
+  ['run_80',['run',['../struct_g_e_m_v_kernel.html#ac4a7b5011a0ea938ab1949bb1767fc1a',1,'GEMVKernel::run()'],['../struct_g_e_m_v_t_kernel.html#a5d68656832de892f33db939005713927',1,'GEMVTKernel::run()'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5',1,'mlx::steel::GEMMKernel::run(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5',1,'mlx::steel::GEMMKernel::run(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)']]]
 ];
diff --git a/docs/build/html/search/functions_13.js b/docs/build/html/search/functions_13.js
index 5342f6271..d6e60eb7d 100644
--- a/docs/build/html/search/functions_13.js
+++ b/docs/build/html/search/functions_13.js
@@ -11,131 +11,136 @@ var searchData=
   ['scatter_8',['Scatter',['../classmlx_1_1core_1_1_scatter.html#ac9b3eff67389ef9aa820753379ffeaa3',1,'mlx::core::Scatter']]],
   ['scatter_9',['scatter',['../namespacemlx_1_1core_1_1metal.html#a32e902c6cd6d35fcc3119ed6685a170f',1,'mlx::core::metal::scatter()'],['../group__ops.html#gad438be8f90bae9d37c6853b8f4225d61',1,'mlx::core::scatter(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gac2c2b379a3ce959dbe1c4a68f112edfe',1,'mlx::core::scatter(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
   ['scatter_5fadd_10',['scatter_add',['../group__ops.html#gacd14c2b5cfebf343fc2d672722f8d174',1,'mlx::core::scatter_add(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gac13318518e5703f1273c5366eb523a5a',1,'mlx::core::scatter_add(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
-  ['scatter_5fimpl_11',['scatter_impl',['../scatter_8h.html#ad1ce39d0b6d733a95e739121fcc61bd1',1,'scatter.h']]],
+  ['scatter_5fimpl_11',['scatter_impl',['../scatter_8h.html#a0df7206d4519defb48a6275afc12f87c',1,'scatter.h']]],
   ['scatter_5fmax_12',['scatter_max',['../group__ops.html#ga05881a4157cd113c9392d168a79e6673',1,'mlx::core::scatter_max(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga9adda5f9202bb3486e4d9e1114e3a56f',1,'mlx::core::scatter_max(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
   ['scatter_5fmin_13',['scatter_min',['../group__ops.html#ga0ca16b7579dfc899f3f7fd40245ba7c5',1,'mlx::core::scatter_min(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga51fa762a997c243ca7a19e1ed3e83199',1,'mlx::core::scatter_min(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
   ['scatter_5fprod_14',['scatter_prod',['../group__ops.html#ga3708b5bcb61e2c63d213c4ce6ad0ffc0',1,'mlx::core::scatter_prod(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const array &amp;updates, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#gaf83c53c453faa9083ba27e4b97539339',1,'mlx::core::scatter_prod(const array &amp;a, const array &amp;indices, const array &amp;updates, int axis, StreamOrDevice s={})']]],
   ['scheduler_15',['Scheduler',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a3ae42aed78a2200e9d02776fcd2316ba',1,'mlx::core::scheduler::Scheduler::Scheduler()'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a61a74e3628899e66dde600e24a750648',1,'mlx::core::scheduler::Scheduler::Scheduler(const Scheduler &amp;)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ac3f77b7c93220dadd0b3bb2e903b7059',1,'mlx::core::scheduler::Scheduler::Scheduler(Scheduler &amp;&amp;)=delete']]],
   ['scheduler_16',['scheduler',['../namespacemlx_1_1core_1_1scheduler.html#ae856e468c2f7c8f8ec672522cc13730b',1,'mlx::core::scheduler']]],
   ['sdpa_5fvector_17',['sdpa_vector',['../sdpa__vector_8h.html#a4bf36f16e16c1c62d9b243573568e5ae',1,'sdpa_vector.h']]],
-  ['seed_18',['seed',['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a9f19c5da2031cba50d0ff996924347d8',1,'mlx::core::random::KeySequence::seed()'],['../namespacemlx_1_1core_1_1random.html#ac4ad325b613257306df74595d3d0e23b',1,'mlx::core::random::seed()']]],
-  ['seek_19',['seek',['../structmlx_1_1core_1_1_contiguous_iterator.html#a24719ee9e8667885d29c2ad74445520c',1,'mlx::core::ContiguousIterator::seek()'],['../classmlx_1_1core_1_1io_1_1_reader.html#acea55078bd39ccaa27a9a36f17a39cd1',1,'mlx::core::io::Reader::seek()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a9c1716dda53aa36faea9c8fb1a3e34d4',1,'mlx::core::io::Writer::seek()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a673c16b669f3cee13f387b7b0a1f39f7',1,'mlx::core::io::ParallelFileReader::seek()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9646f4ea048ae58719daeb588e2de433',1,'mlx::core::io::FileWriter::seek()']]],
-  ['select_20',['Select',['../classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9',1,'mlx::core::Select']]],
-  ['send_21',['Send',['../classmlx_1_1core_1_1distributed_1_1_send.html#a2481dd876b14d4a13ac466cbca9c4eac',1,'mlx::core::distributed::Send']]],
-  ['send_22',['send',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#abf33511660ac71df5fc92f2aad6c6e08',1,'mlx::core::distributed::detail::send()'],['../namespacemlx_1_1core_1_1distributed.html#a5a8360edaa3a528a3927fce4d2cf1777',1,'mlx::core::distributed::send()']]],
-  ['set_23',['Set',['../structpocketfft_1_1detail_1_1cmplx.html#a647fece372b64b13c4a7e5877d09a807',1,'pocketfft::detail::cmplx::Set(T r_, T i_)'],['../structpocketfft_1_1detail_1_1cmplx.html#a447d26b2e07f6e45f29d865e906c0a98',1,'pocketfft::detail::cmplx::Set(T r_)']]],
-  ['set_5fcache_5flimit_24',['set_cache_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#af392bced29d9e4e3f1a7cc4725d83764',1,'mlx::core::metal::MetalAllocator::set_cache_limit()'],['../namespacemlx_1_1core_1_1metal.html#ab09c9b60f1e886ab859e6a066c9a5b9d',1,'mlx::core::metal::set_cache_limit()']]],
-  ['set_5fcompile_5fmode_25',['set_compile_mode',['../namespacemlx_1_1core.html#a49445a55f976c4397f25ea18e1e92bef',1,'mlx::core']]],
-  ['set_5fdata_26',['set_data',['../classmlx_1_1core_1_1array.html#a631acd8e318189640b8338f9ae1a554d',1,'mlx::core::array::set_data(allocator::Buffer buffer, deleter_t d=allocator::free)'],['../classmlx_1_1core_1_1array.html#a2112af5fba37b3135cd2e6ac9e851606',1,'mlx::core::array::set_data(allocator::Buffer buffer, size_t data_size, std::vector&lt; size_t &gt; strides, Flags flags, deleter_t d=allocator::free)']]],
-  ['set_5fdefault_5fdevice_27',['set_default_device',['../namespacemlx_1_1core.html#a312a2de41367fe52caeaf8c0f596a120',1,'mlx::core']]],
-  ['set_5fdefault_5fstream_28',['set_default_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a6d15314ac9cf25efc9bd1278de9a66bb',1,'mlx::core::scheduler::Scheduler::set_default_stream()'],['../namespacemlx_1_1core.html#af35a2b06517d8bb7dbb469692b4f841c',1,'mlx::core::set_default_stream()']]],
-  ['set_5finput_5farray_29',['set_input_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4',1,'mlx::core::metal::CommandEncoder']]],
-  ['set_5fmemory_5flimit_30',['set_memory_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a179e3127ef9377ce54295f771c34ba1b',1,'mlx::core::metal::MetalAllocator::set_memory_limit()'],['../namespacemlx_1_1core_1_1metal.html#a3fb2c4a237fa4bfdff798156146c4937',1,'mlx::core::metal::set_memory_limit()']]],
-  ['set_5foutput_5farray_31',['set_output_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522',1,'mlx::core::metal::CommandEncoder']]],
-  ['set_5fresidency_5fset_32',['set_residency_set',['../classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f',1,'mlx::core::metal::Device']]],
-  ['set_5fsiblings_33',['set_siblings',['../classmlx_1_1core_1_1array.html#a8fccbe7a4edfd8cca168161124e263b1',1,'mlx::core::array']]],
-  ['set_5fstatus_34',['set_status',['../classmlx_1_1core_1_1array.html#a63598018999b49f1340b183cb303f05c',1,'mlx::core::array']]],
-  ['set_5ftracer_35',['set_tracer',['../classmlx_1_1core_1_1array.html#af26e6be1a9e6239471a4c24310c0c7c8',1,'mlx::core::array']]],
-  ['set_5fvalue_36',['set_value',['../classmlx_1_1core_1_1_event.html#a0d077b11f4b28f882b42440b7ac6d40d',1,'mlx::core::Event']]],
-  ['set_5fvector_5fbytes_37',['set_vector_bytes',['../namespacemlx_1_1core.html#a62340bbaa8b216539688a60adcb568bf',1,'mlx::core::set_vector_bytes(CommandEncoder &amp;enc, const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)'],['../namespacemlx_1_1core.html#ae309cb543dfb0239cfccc53a8ad0408e',1,'mlx::core::set_vector_bytes(CommandEncoder &amp;enc, const std::vector&lt; T &gt; &amp;vec, int idx)']]],
-  ['set_5fwired_5flimit_38',['set_wired_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a84fa0347da18055bc13ba0a5c4b57253',1,'mlx::core::metal::MetalAllocator::set_wired_limit()'],['../namespacemlx_1_1core_1_1metal.html#a31eab4828d31d292bc84e07b0d961e1e',1,'mlx::core::metal::set_wired_limit()']]],
-  ['shape_39',['shape',['../classpocketfft_1_1detail_1_1arr__info.html#accada8146cb8d3ab7facb4c1e3413ec0',1,'pocketfft::detail::arr_info::shape() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac601c660c64a4c252aa8be4ae7dfa7a8',1,'pocketfft::detail::arr_info::shape(size_t i) const'],['../classmlx_1_1core_1_1array.html#a4a2a2c8a4a5beafd723fc13f2055d55d',1,'mlx::core::array::shape() const'],['../classmlx_1_1core_1_1array.html#a51ed0c45666264dc172d06fba159eb8f',1,'mlx::core::array::shape(int dim) const']]],
-  ['shapes_5fwithout_5freduction_5faxes_40',['shapes_without_reduction_axes',['../namespacemlx_1_1core.html#a44c3ea6db6553c3f6552b9ba64a69494',1,'mlx::core']]],
-  ['shared_5fbuffer_5fslice_41',['shared_buffer_slice',['../namespacemlx_1_1core.html#aea2a6a4eddfd4cfac89d20786059de2a',1,'mlx::core']]],
-  ['shutdown_42',['shutdown',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a0c7c29290fde806031c497f24c4ad411',1,'pocketfft::detail::threading::thread_pool']]],
-  ['siblings_43',['siblings',['../classmlx_1_1core_1_1array.html#acf80fde8f743f65ad5b4be69fcb7a74d',1,'mlx::core::array::siblings() const'],['../classmlx_1_1core_1_1array.html#a7263f23e70a580a9bc2129fbcde36e6c',1,'mlx::core::array::siblings()']]],
-  ['sigmoid_44',['Sigmoid',['../classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b',1,'mlx::core::Sigmoid']]],
-  ['sigmoid_45',['sigmoid',['../group__ops.html#ga708abf8f79609cd6831db7c38cafac0e',1,'mlx::core']]],
-  ['sign_46',['Sign',['../classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763',1,'mlx::core::Sign']]],
-  ['sign_47',['sign',['../group__ops.html#ga20f1a1a8c0cd6206485f9363f3915faa',1,'mlx::core']]],
-  ['signal_48',['signal',['../classmlx_1_1core_1_1_event.html#a65a858445506a61be5889ae0e3651b89',1,'mlx::core::Event']]],
-  ['signaling_5fnan_49',['signaling_NaN',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['simd_5fbroadcast_50',['simd_broadcast',['../namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0',1,'metal']]],
-  ['simd_5fexclusive_5fscan_51',['simd_exclusive_scan',['../struct_cum_prod_3_01bool_01_4.html#a1a86e9398bae24182b7be0a6577bf223',1,'CumProd&lt; bool &gt;::simd_exclusive_scan()'],['../struct_cum_max.html#ae11b67aa6c998e9a01615b2a79af4403',1,'CumMax::simd_exclusive_scan()'],['../struct_cum_min.html#a83e65017ff33018b585c043fb803773b',1,'CumMin::simd_exclusive_scan()']]],
-  ['simd_5fmax_52',['simd_max',['../namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49',1,'metal']]],
-  ['simd_5fmin_53',['simd_min',['../namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b',1,'metal']]],
-  ['simd_5fprefix_5fexclusive_5fproduct_54',['simd_prefix_exclusive_product',['../namespacemetal.html#a5ca40242390b632f737e29636829b2e4',1,'metal']]],
-  ['simd_5fprefix_5fexclusive_5fsum_55',['simd_prefix_exclusive_sum',['../namespacemetal.html#abfbb70c7471f28bf7ff36a612ad014b2',1,'metal']]],
-  ['simd_5fprefix_5finclusive_5fproduct_56',['simd_prefix_inclusive_product',['../namespacemetal.html#a6ca6a7e1996228fa536e969e9e45c446',1,'metal']]],
-  ['simd_5fprefix_5finclusive_5fsum_57',['simd_prefix_inclusive_sum',['../namespacemetal.html#a567acb18199ac0107712eb8cb8aeb8e9',1,'metal']]],
-  ['simd_5fproduct_58',['simd_product',['../namespacemetal.html#ac6e883a04e2265a9790d7db76059e1b4',1,'metal']]],
-  ['simd_5fscan_59',['simd_scan',['../struct_cum_prod_3_01bool_01_4.html#abeb5ec4237b330e7219f4e881cf10d7a',1,'CumProd&lt; bool &gt;::simd_scan()'],['../struct_cum_max.html#adc9ec8bb09b4433d4c2f03022c43d781',1,'CumMax::simd_scan()'],['../struct_cum_min.html#a0a1005d91b1c90e90e2c6dbd6c296649',1,'CumMin::simd_scan()']]],
-  ['simd_5fshuffle_60',['simd_shuffle',['../namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4',1,'metal::simd_shuffle()'],['../backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2',1,'simd_shuffle(uint64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a3bdbdfeb7a1dde40cd3ce1df8d9213b5',1,'simd_shuffle(int64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab4cbcdb054f9165130da91a3334da0cf',1,'simd_shuffle(bool data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab8175b66bcc080fb89f738143568c30b',1,'simd_shuffle(complex64_t data, uint16_t lane):&#160;utils.h']]],
-  ['simd_5fshuffle_5fand_5ffill_5fdown_61',['simd_shuffle_and_fill_down',['../namespacemetal.html#ae29a06f0eac636ad7af21dea5b04938b',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a0ee6239fa29a5f9ee0201e0dc5ddc8e0',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta)']]],
-  ['simd_5fshuffle_5fand_5ffill_5fup_62',['simd_shuffle_and_fill_up',['../namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a5138d5cdc18139e135707916a243cd8e',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta)'],['../backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4',1,'simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a7bb56415c5412a6a26f70a990915f064',1,'simd_shuffle_and_fill_up(int64_t data, int64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad55bd473647f2c6c68e65e5312c132d1',1,'simd_shuffle_and_fill_up(bool data, bool filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a94e02a6ae8c39cbf4cb23aa44df9dbd5',1,'simd_shuffle_and_fill_up(complex64_t data, complex64_t filling, uint16_t delta):&#160;utils.h']]],
-  ['simd_5fshuffle_5fdown_63',['simd_shuffle_down',['../namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c',1,'metal::simd_shuffle_down()'],['../backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c',1,'simd_shuffle_down(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a0c1e4d782fcc56e1ab5565cef12430dd',1,'simd_shuffle_down(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a48ae83a8caf5c74810df60b6c6cdb062',1,'simd_shuffle_down(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad9a671a5f9aaa729ae7a77026f16bcb0',1,'simd_shuffle_down(complex64_t data, uint16_t delta):&#160;utils.h']]],
-  ['simd_5fshuffle_5frotate_5fdown_64',['simd_shuffle_rotate_down',['../namespacemetal.html#a4bb203647a421032db47e73cd649841b',1,'metal']]],
-  ['simd_5fshuffle_5frotate_5fup_65',['simd_shuffle_rotate_up',['../namespacemetal.html#a729b22077d6c944491a6027c18ea80c9',1,'metal']]],
-  ['simd_5fshuffle_5fup_66',['simd_shuffle_up',['../namespacemetal.html#afe81c5fbde3f4890458b081909242c55',1,'metal::simd_shuffle_up()'],['../backend_2metal_2kernels_2utils_8h.html#a39e436e0a942912266aae7e0bd82d7c0',1,'simd_shuffle_up(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a617f3857caf33c569afa6148135f8b7a',1,'simd_shuffle_up(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ae0f5c42020275a588234e69f1eb7a485',1,'simd_shuffle_up(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a92b455bac6a23af51c35ea83de2383eb',1,'simd_shuffle_up(complex64_t data, uint16_t delta):&#160;utils.h']]],
-  ['simd_5fshuffle_5fxor_67',['simd_shuffle_xor',['../namespacemetal.html#a5017efc9605e069cfb507137cd1a1852',1,'metal']]],
-  ['simd_5fsum_68',['simd_sum',['../namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5',1,'metal']]],
-  ['simd_5fxor_69',['simd_xor',['../namespacemetal.html#a1308decbf2d5c33d34d6be523ea1c30f',1,'metal']]],
-  ['simple_5fiter_70',['simple_iter',['../classpocketfft_1_1detail_1_1simple__iter.html#a1e455c615825bebd5f1f62665027b398',1,'pocketfft::detail::simple_iter']]],
-  ['sin_71',['Sin',['../classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea',1,'mlx::core::Sin']]],
-  ['sin_72',['sin',['../namespacepocketfft_1_1detail.html#a07745f4a069f811859308281b2982258',1,'pocketfft::detail::sin()'],['../namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c',1,'metal::sin()'],['../namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270',1,'metal::fast::sin()'],['../namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c',1,'metal::precise::sin()'],['../group__ops.html#gaebf0a73ad3732fba39df37826c235692',1,'mlx::core::sin()']]],
-  ['sincos_5f2pibyn_73',['sincos_2pibyn',['../classpocketfft_1_1detail_1_1sincos__2pibyn.html#a88518f2182d854c557edacd4ab8cbc40',1,'pocketfft::detail::sincos_2pibyn']]],
-  ['sinh_74',['Sinh',['../classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96',1,'mlx::core::Sinh']]],
-  ['sinh_75',['sinh',['../namespacemetal.html#a83ba4235ae350ab8880a9df09158620b',1,'metal::sinh()'],['../namespacemetal_1_1fast.html#a990d90b3440e38d1fb4ff5065c6c189b',1,'metal::fast::sinh()'],['../namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c',1,'metal::precise::sinh()'],['../group__ops.html#gaf532375c6563dbd6e329bdedf0224dd7',1,'mlx::core::sinh()']]],
-  ['sinpi_76',['sinpi',['../namespacemetal.html#ae9655f7fa2ba6c0625ca25fbb278e269',1,'metal::sinpi()'],['../namespacemetal_1_1fast.html#ab07a32fe544aa304577d29e0251e87b2',1,'metal::fast::sinpi()'],['../namespacemetal_1_1precise.html#a78b17dab93519d9c82c2575dafec49c9',1,'metal::precise::sinpi()']]],
-  ['size_77',['size',['../classpocketfft_1_1detail_1_1arr.html#a95bca00060957f540ff25b69632c6952',1,'pocketfft::detail::arr::size()'],['../classpocketfft_1_1detail_1_1arr__info.html#a003a7106f7fa59a3c55ac1f0116313a5',1,'pocketfft::detail::arr_info::size()'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a2adf9a9c968f113dde830cc0dc27dcc6',1,'mlx::core::allocator::Allocator::size()'],['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#aafa92e8310db089b1ac72b840777e26b',1,'mlx::core::allocator::CommonAllocator::size()'],['../classmlx_1_1core_1_1array.html#a598f87161926d9e0b516860f0ea2c8f6',1,'mlx::core::array::size()'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a51f6587e8065be16f0418ca42a796e05',1,'mlx::core::metal::MetalAllocator::size()'],['../structmlx_1_1core_1_1distributed_1_1_group.html#abd96a09217e3d1bcc522888257d22cef',1,'mlx::core::distributed::Group::size()'],['../structmlx_1_1core_1_1_dtype.html#ab54051563d85212c7f0f049166bc9971',1,'mlx::core::Dtype::size()']]],
-  ['size_5fof_78',['size_of',['../namespacemlx_1_1core.html#add4794cc0ffe5d717fc146084a235d95',1,'mlx::core']]],
-  ['slice_79',['Slice',['../classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f',1,'mlx::core::Slice']]],
-  ['slice_80',['slice',['../group__ops.html#gad66135407dbb41b3c5d2cdfd51226c21',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#gaa97ce866c5e38b92b093e9321affcc57',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
-  ['slice_5fgpu_81',['slice_gpu',['../namespacemlx_1_1core.html#a59048c5ff114c101a496bf33f62e3de9',1,'mlx::core']]],
-  ['slice_5fupdate_82',['slice_update',['../group__ops.html#ga3875660e4ce2c8add8bfcf8144078708',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#ga03ffbbb4d989a463ef43f41ebf7eabef',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
-  ['sliceupdate_83',['SliceUpdate',['../classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990',1,'mlx::core::SliceUpdate']]],
-  ['softmax_84',['Softmax',['../classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb',1,'mlx::core::Softmax']]],
-  ['softmax_85',['softmax',['../namespacemlx_1_1core_1_1metal.html#a4fe937c2c584fd646926057f31d54ca6',1,'mlx::core::metal::softmax()'],['../group__ops.html#ga7e9bb08b43c8fd0444b7d3c9e09dc1c6',1,'mlx::core::softmax(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga1ae3614d07d873892a530d14c3857d0b',1,'mlx::core::softmax(const array &amp;a, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga06f570d73716a24303e6de3aaba4457b',1,'mlx::core::softmax(const array &amp;a, int axis, bool precise=false, StreamOrDevice s={})']]],
-  ['softmax_5fexp_86',['softmax_exp',['../kernels_2softmax_8h.html#a440d4031ee5e86159a4dd715e44a438b',1,'softmax.h']]],
-  ['softmax_5flooped_87',['softmax_looped',['../kernels_2softmax_8h.html#a8c47b0924ebfeebcca25f3dd17373276',1,'softmax.h']]],
-  ['softmax_5fsingle_5frow_88',['softmax_single_row',['../kernels_2softmax_8h.html#a815fe70f879f318e5d6e99acf043f52b',1,'softmax.h']]],
-  ['sort_89',['Sort',['../classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44',1,'mlx::core::Sort']]],
-  ['sort_90',['sort',['../struct_thread_sort.html#ad9ab3e6b47f7e9b91c0f3b773596986d',1,'ThreadSort::sort()'],['../struct_block_merge_sort.html#acc970f5eb963f7f2010f5ae5ea8b8bc0',1,'BlockMergeSort::sort()'],['../namespacemlx_1_1core_1_1metal.html#ab77c9a9ecaeeab8c66b712862777c24b',1,'mlx::core::metal::sort()'],['../group__ops.html#ga7fb616054665b3c2d61fa234f501f079',1,'mlx::core::sort(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaae1bc47aa737f705d0e5884270063fea',1,'mlx::core::sort(const array &amp;a, int axis, StreamOrDevice s={})']]],
-  ['special_5fmul_91',['special_mul',['../structpocketfft_1_1detail_1_1cmplx.html#a2e79f5c73c1d926361ad126cf57c8874',1,'pocketfft::detail::cmplx::special_mul()'],['../namespacepocketfft_1_1detail.html#a8da1f3d4a0b712a0285529f24187fe76',1,'pocketfft::detail::special_mul()']]],
-  ['split_92',['Split',['../classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385',1,'mlx::core::Split']]],
-  ['split_93',['split',['../structmlx_1_1core_1_1distributed_1_1_group.html#abbf40f8979488806bc5bca9ecc4130e9',1,'mlx::core::distributed::Group::split()'],['../group__ops.html#ga7534290bceab5fb3831a05d67bebce7d',1,'mlx::core::split(const array &amp;a, int num_splits, int axis, StreamOrDevice s={})'],['../group__ops.html#ga56882d24e5fde59c266774624c892d41',1,'mlx::core::split(const array &amp;a, int num_splits, StreamOrDevice s={})'],['../group__ops.html#ga2cfcb1a53924882e30476c9016c5de74',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, int axis, StreamOrDevice s={})'],['../group__ops.html#gac324dfa3e26d3a14a35ab7962e36f0e1',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a42847b435d037a977592e355eed072af',1,'mlx::core::random::split(const array &amp;key, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a7ec057064c7326c41b536f08178861e5',1,'mlx::core::random::split(const array &amp;key, int num, StreamOrDevice s={})']]],
-  ['sqrt_94',['Sqrt',['../classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29',1,'mlx::core::Sqrt']]],
-  ['sqrt_95',['sqrt',['../namespacepocketfft_1_1detail.html#a774f8b73f28259d4276bd188b540a3e3',1,'pocketfft::detail::sqrt()'],['../namespacemetal.html#ab3f4d4852ca0e591104fbd8e5b50d31b',1,'metal::sqrt()'],['../namespacemetal_1_1fast.html#a4218a85c7d8a74cb8055b4755205627e',1,'metal::fast::sqrt()'],['../namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd',1,'metal::precise::sqrt()'],['../group__ops.html#ga297f853b3d90ec8ae81263977ba2ddb1',1,'mlx::core::sqrt()']]],
-  ['square_96',['Square',['../classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4',1,'mlx::core::Square']]],
-  ['square_97',['square',['../group__ops.html#ga1234e4c39cfa79f19d4bdb5b8ea4d45e',1,'mlx::core']]],
-  ['squeeze_98',['squeeze',['../group__ops.html#ga710daa7ec721bd4d3f326082cb195576',1,'mlx::core::squeeze(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga700dd51b77379a3d2260a55783e8ebf3',1,'mlx::core::squeeze(const array &amp;a, int axis, StreamOrDevice s={})'],['../group__ops.html#ga58bad3c61fd85b95927a987ba1cf5dad',1,'mlx::core::squeeze(const array &amp;a, StreamOrDevice s={})']]],
-  ['stack_99',['stack',['../group__ops.html#gaf8f2ec2b98a4b59eca73d7471df6e032',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#ga82216209dce901296fc737fe8efa5c94',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
-  ['start_5fcapture_100',['start_capture',['../namespacemlx_1_1core_1_1metal.html#aa47cb5651bf3b65c46ab216b7e504d77',1,'mlx::core::metal']]],
-  ['start_5fconcurrent_101',['start_concurrent',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034',1,'mlx::core::metal::CommandEncoder']]],
-  ['status_102',['status',['../classmlx_1_1core_1_1array.html#a7102659be87e9ef62966696ab9b07dad',1,'mlx::core::array']]],
-  ['std_103',['std',['../group__ops.html#ga2a466024f8061febc0a64be557644cb0',1,'mlx::core::std(const array &amp;a, bool keepdims, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#gafdcb04d77c64405a3990078a77dd984c',1,'mlx::core::std(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga7f649970bf38b987b6ef847054f3c2f8',1,'mlx::core::std(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga62721a206df671ef5797449eea97af9f',1,'mlx::core::std(const array &amp;a, int axis, bool keepdims=false, int ddof=0, StreamOrDevice s={})']]],
-  ['steel_5fconv_104',['steel_conv',['../namespacemlx_1_1core_1_1metal.html#a92f1e559b1121d545746f81ff86eaca1',1,'mlx::core::metal']]],
-  ['steel_5fconv_5fgeneral_105',['steel_conv_general',['../namespacemlx_1_1core_1_1metal.html#a02edb6a90bdf30f4c9f0d6c25b0267b5',1,'mlx::core::metal']]],
-  ['steel_5fgemm_5ffused_106',['steel_gemm_fused',['../namespacemlx_1_1core_1_1metal.html#a17764366deed71c160fb26091400a803',1,'mlx::core::metal']]],
-  ['steel_5fgemm_5fmasked_107',['steel_gemm_masked',['../namespacemlx_1_1core_1_1metal.html#a962272ca73d26c08f76f706a128fd71f',1,'mlx::core::metal']]],
-  ['steel_5fgemm_5fsplitk_108',['steel_gemm_splitk',['../namespacemlx_1_1core_1_1metal.html#ad0dfd40ba7c09755711ceb731e57a5ac',1,'mlx::core::metal']]],
-  ['steel_5fmatmul_109',['steel_matmul',['../namespacemlx_1_1core.html#ab43a7633794498e1c6775cca829eb886',1,'mlx::core']]],
-  ['steel_5fmatmul_5fregular_110',['steel_matmul_regular',['../namespacemlx_1_1core.html#a227588758ccc9ee869dba147e830bb74',1,'mlx::core']]],
-  ['step_111',['step',['../structmlx_1_1core_1_1_contiguous_iterator.html#ae230bd52b70a0bbdf560090f8a6589ef',1,'mlx::core::ContiguousIterator']]],
-  ['stop_5fcapture_112',['stop_capture',['../namespacemlx_1_1core_1_1metal.html#ac90714424e36fb01e04550de69b8314f',1,'mlx::core::metal']]],
-  ['stop_5fgradient_113',['stop_gradient',['../group__ops.html#ga36bc28f1deb2fe668ca9ae1e447b6b1f',1,'mlx::core']]],
-  ['stopgradient_114',['StopGradient',['../classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f',1,'mlx::core::StopGradient']]],
-  ['store_115',['store',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98',1,'mlx::steel::MMATile::store(threadgroup U *dst) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f',1,'mlx::steel::MMATile::store(device U *dst, const int ld) const']]],
-  ['store_5fresult_116',['store_result',['../structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const']]],
-  ['store_5fresult_5fsafe_117',['store_result_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const']]],
-  ['store_5fsafe_118',['store_safe',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba',1,'mlx::steel::MMATile::store_safe()']]],
-  ['stream_119',['Stream',['../structmlx_1_1core_1_1_stream.html#a7f0815ff4886da74cbbff5f93d82dd3e',1,'mlx::core::Stream']]],
-  ['stream_120',['stream',['../classmlx_1_1core_1_1_event.html#a193143bad31b68c699fa27f135b45614',1,'mlx::core::Event::stream()'],['../classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a',1,'mlx::core::Primitive::stream()']]],
-  ['streamcontext_121',['StreamContext',['../structmlx_1_1core_1_1_stream_context.html#a89d803151e9d7dce29382aa83d5c6ef1',1,'mlx::core::StreamContext']]],
-  ['streamthread_122',['StreamThread',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#ac528109a11abcb82e6e221c5efa4493c',1,'mlx::core::scheduler::StreamThread']]],
-  ['stride_123',['stride',['../classpocketfft_1_1detail_1_1arr__info.html#a9d10aa83a1117e75d36f7396b8c2a093',1,'pocketfft::detail::arr_info::stride() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac1f6a9bd6703eceef6003f5f6315d39b',1,'pocketfft::detail::arr_info::stride(size_t i) const']]],
-  ['stride_5fin_124',['stride_in',['../classpocketfft_1_1detail_1_1multi__iter.html#ac947f03b1cfcb63436a7e61ff020a88c',1,'pocketfft::detail::multi_iter']]],
-  ['stride_5fout_125',['stride_out',['../classpocketfft_1_1detail_1_1multi__iter.html#a81d71a13bf0b85e556fbb9834167ecc7',1,'pocketfft::detail::multi_iter']]],
-  ['strided_5freduce_5fgeneral_5fdispatch_126',['strided_reduce_general_dispatch',['../namespacemlx_1_1core.html#aa0332c64ee9965f05026c30a0b778000',1,'mlx::core']]],
-  ['strided_5fscan_127',['strided_scan',['../scan_8h.html#a7abb6ffb6c3b96b88c2a63cd4cc2f7ae',1,'scan.h']]],
-  ['strides_128',['strides',['../classmlx_1_1core_1_1array.html#a186cf2648da92584d5c1c8b24e69629b',1,'mlx::core::array::strides() const'],['../classmlx_1_1core_1_1array.html#a919f850ca087d1c40aa68f854cb30be2',1,'mlx::core::array::strides(int dim) const']]],
-  ['submit_129',['submit',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a8698d49e8f406cdb88006aac6a91f9a4',1,'pocketfft::detail::threading::thread_pool']]],
-  ['subtract_130',['Subtract',['../classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c',1,'mlx::core::Subtract']]],
-  ['subtract_131',['subtract',['../group__ops.html#ga196c240d3d0fcbb4713802c485e15133',1,'mlx::core']]],
-  ['sum_132',['sum',['../namespacemlx_1_1steel.html#ab4a6ddea4beb7c447cf5b69b9d46cc3b',1,'mlx::steel::sum(T x)'],['../namespacemlx_1_1steel.html#acd6e194d37b617d7a5818bc384a97fe4',1,'mlx::steel::sum(T x, Us... us)'],['../group__ops.html#gade905ee92eb6ab7edfc312aeddfbaeb6',1,'mlx::core::sum(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga3627754d7868487bdab1bd83f05d9c81',1,'mlx::core::sum(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaccd0a6be2c5b5128fdc2d87b5c8e67f4',1,'mlx::core::sum(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gafcd39b0bf39a56c26a967981c7ab8a8d',1,'mlx::core::sum(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['svd_133',['SVD',['../classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1',1,'mlx::core::SVD']]],
-  ['svd_134',['svd',['../namespacemlx_1_1core_1_1linalg.html#a64364b880e99914cf47bf756fa8dbaf0',1,'mlx::core::linalg']]],
-  ['swapaxes_135',['swapaxes',['../group__ops.html#gabc46eed81ab6c6247903e4ec0c4ec1fb',1,'mlx::core']]],
-  ['swizzle_136',['swizzle',['../structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760',1,'mlx::steel::BlockSwizzle']]],
-  ['synchronize_137',['synchronize',['../namespacemlx_1_1core.html#a14287949d82ffefad0306cef5eb5f9e4',1,'mlx::core::synchronize()'],['../namespacemlx_1_1core.html#a6648a71937b055e5ff513d98056c2fb5',1,'mlx::core::synchronize(Stream)']]]
+  ['sdpa_5fvector_5f2pass_5f1_18',['sdpa_vector_2pass_1',['../sdpa__vector_8h.html#ae070ec482c79c5b3bd19dd03ea42ec74',1,'sdpa_vector.h']]],
+  ['sdpa_5fvector_5f2pass_5f2_19',['sdpa_vector_2pass_2',['../sdpa__vector_8h.html#a1368cf3618a4e03dbf743b3463205efe',1,'sdpa_vector.h']]],
+  ['seed_20',['seed',['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a9f19c5da2031cba50d0ff996924347d8',1,'mlx::core::random::KeySequence::seed()'],['../namespacemlx_1_1core_1_1random.html#ac4ad325b613257306df74595d3d0e23b',1,'mlx::core::random::seed()']]],
+  ['seek_21',['seek',['../structmlx_1_1core_1_1_contiguous_iterator.html#a24719ee9e8667885d29c2ad74445520c',1,'mlx::core::ContiguousIterator::seek()'],['../classmlx_1_1core_1_1io_1_1_reader.html#acea55078bd39ccaa27a9a36f17a39cd1',1,'mlx::core::io::Reader::seek()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a9c1716dda53aa36faea9c8fb1a3e34d4',1,'mlx::core::io::Writer::seek()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a673c16b669f3cee13f387b7b0a1f39f7',1,'mlx::core::io::ParallelFileReader::seek()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9646f4ea048ae58719daeb588e2de433',1,'mlx::core::io::FileWriter::seek()']]],
+  ['select_22',['Select',['../classmlx_1_1core_1_1_select.html#a6f833fe55dd68ad3726bbf9a8f75eec9',1,'mlx::core::Select']]],
+  ['send_23',['Send',['../classmlx_1_1core_1_1distributed_1_1_send.html#a2481dd876b14d4a13ac466cbca9c4eac',1,'mlx::core::distributed::Send']]],
+  ['send_24',['send',['../namespacemlx_1_1core_1_1distributed_1_1detail.html#abf33511660ac71df5fc92f2aad6c6e08',1,'mlx::core::distributed::detail::send()'],['../namespacemlx_1_1core_1_1distributed.html#a5a8360edaa3a528a3927fce4d2cf1777',1,'mlx::core::distributed::send()']]],
+  ['set_25',['Set',['../structpocketfft_1_1detail_1_1cmplx.html#a647fece372b64b13c4a7e5877d09a807',1,'pocketfft::detail::cmplx::Set(T r_, T i_)'],['../structpocketfft_1_1detail_1_1cmplx.html#a447d26b2e07f6e45f29d865e906c0a98',1,'pocketfft::detail::cmplx::Set(T r_)']]],
+  ['set_5fbytes_26',['set_bytes',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5',1,'mlx::core::metal::CommandEncoder::set_bytes(const T *v, int n, int idx)'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#abc52d18ea87d213c47fd26062c829849',1,'mlx::core::metal::CommandEncoder::set_bytes(const T &amp;v, int idx)']]],
+  ['set_5fcache_5flimit_27',['set_cache_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#af392bced29d9e4e3f1a7cc4725d83764',1,'mlx::core::metal::MetalAllocator::set_cache_limit()'],['../namespacemlx_1_1core_1_1metal.html#ab09c9b60f1e886ab859e6a066c9a5b9d',1,'mlx::core::metal::set_cache_limit()']]],
+  ['set_5fcompile_5fmode_28',['set_compile_mode',['../namespacemlx_1_1core.html#a49445a55f976c4397f25ea18e1e92bef',1,'mlx::core']]],
+  ['set_5fcompute_5fpipeline_5fstate_29',['set_compute_pipeline_state',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef',1,'mlx::core::metal::CommandEncoder']]],
+  ['set_5fdata_30',['set_data',['../classmlx_1_1core_1_1array.html#a631acd8e318189640b8338f9ae1a554d',1,'mlx::core::array::set_data(allocator::Buffer buffer, deleter_t d=allocator::free)'],['../classmlx_1_1core_1_1array.html#a2112af5fba37b3135cd2e6ac9e851606',1,'mlx::core::array::set_data(allocator::Buffer buffer, size_t data_size, std::vector&lt; size_t &gt; strides, Flags flags, deleter_t d=allocator::free)']]],
+  ['set_5fdefault_5fdevice_31',['set_default_device',['../namespacemlx_1_1core.html#a312a2de41367fe52caeaf8c0f596a120',1,'mlx::core']]],
+  ['set_5fdefault_5fstream_32',['set_default_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a6d15314ac9cf25efc9bd1278de9a66bb',1,'mlx::core::scheduler::Scheduler::set_default_stream()'],['../namespacemlx_1_1core.html#af35a2b06517d8bb7dbb469692b4f841c',1,'mlx::core::set_default_stream()']]],
+  ['set_5finput_5farray_33',['set_input_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4',1,'mlx::core::metal::CommandEncoder']]],
+  ['set_5fmemory_5flimit_34',['set_memory_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a179e3127ef9377ce54295f771c34ba1b',1,'mlx::core::metal::MetalAllocator::set_memory_limit()'],['../namespacemlx_1_1core_1_1metal.html#a3fb2c4a237fa4bfdff798156146c4937',1,'mlx::core::metal::set_memory_limit()']]],
+  ['set_5foutput_5farray_35',['set_output_array',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522',1,'mlx::core::metal::CommandEncoder']]],
+  ['set_5fresidency_5fset_36',['set_residency_set',['../classmlx_1_1core_1_1metal_1_1_device.html#a03a2f0c712660a1bd437cb16e4aba79f',1,'mlx::core::metal::Device']]],
+  ['set_5fsiblings_37',['set_siblings',['../classmlx_1_1core_1_1array.html#a8fccbe7a4edfd8cca168161124e263b1',1,'mlx::core::array']]],
+  ['set_5fstatus_38',['set_status',['../classmlx_1_1core_1_1array.html#a63598018999b49f1340b183cb303f05c',1,'mlx::core::array']]],
+  ['set_5ftracer_39',['set_tracer',['../classmlx_1_1core_1_1array.html#af26e6be1a9e6239471a4c24310c0c7c8',1,'mlx::core::array']]],
+  ['set_5fvalue_40',['set_value',['../classmlx_1_1core_1_1_event.html#a0d077b11f4b28f882b42440b7ac6d40d',1,'mlx::core::Event']]],
+  ['set_5fvector_5fbytes_41',['set_vector_bytes',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b',1,'mlx::core::metal::CommandEncoder::set_vector_bytes(const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a7375adf9ee5355bcf4b7f5f210efd115',1,'mlx::core::metal::CommandEncoder::set_vector_bytes(const std::vector&lt; T &gt; &amp;vec, int idx)']]],
+  ['set_5fwired_5flimit_42',['set_wired_limit',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a84fa0347da18055bc13ba0a5c4b57253',1,'mlx::core::metal::MetalAllocator::set_wired_limit()'],['../namespacemlx_1_1core_1_1metal.html#a31eab4828d31d292bc84e07b0d961e1e',1,'mlx::core::metal::set_wired_limit()']]],
+  ['shape_43',['shape',['../classpocketfft_1_1detail_1_1arr__info.html#accada8146cb8d3ab7facb4c1e3413ec0',1,'pocketfft::detail::arr_info::shape() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac601c660c64a4c252aa8be4ae7dfa7a8',1,'pocketfft::detail::arr_info::shape(size_t i) const'],['../classmlx_1_1core_1_1array.html#a4a2a2c8a4a5beafd723fc13f2055d55d',1,'mlx::core::array::shape() const'],['../classmlx_1_1core_1_1array.html#a51ed0c45666264dc172d06fba159eb8f',1,'mlx::core::array::shape(int dim) const']]],
+  ['shape2d_44',['Shape2D',['../structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c',1,'mlx::steel::Shape2D']]],
+  ['shapes_5fwithout_5freduction_5faxes_45',['shapes_without_reduction_axes',['../namespacemlx_1_1core.html#a44c3ea6db6553c3f6552b9ba64a69494',1,'mlx::core']]],
+  ['shared_5fbuffer_5fslice_46',['shared_buffer_slice',['../namespacemlx_1_1core.html#aea2a6a4eddfd4cfac89d20786059de2a',1,'mlx::core']]],
+  ['shutdown_47',['shutdown',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a0c7c29290fde806031c497f24c4ad411',1,'pocketfft::detail::threading::thread_pool']]],
+  ['siblings_48',['siblings',['../classmlx_1_1core_1_1array.html#acf80fde8f743f65ad5b4be69fcb7a74d',1,'mlx::core::array::siblings() const'],['../classmlx_1_1core_1_1array.html#a7263f23e70a580a9bc2129fbcde36e6c',1,'mlx::core::array::siblings()']]],
+  ['sigmoid_49',['Sigmoid',['../classmlx_1_1core_1_1_sigmoid.html#a47eca99113ec19f0eb60b6a0472c592b',1,'mlx::core::Sigmoid']]],
+  ['sigmoid_50',['sigmoid',['../group__ops.html#ga708abf8f79609cd6831db7c38cafac0e',1,'mlx::core']]],
+  ['sign_51',['Sign',['../classmlx_1_1core_1_1_sign.html#afe951e50907bc23a601ec5fa9eae5763',1,'mlx::core::Sign']]],
+  ['sign_52',['sign',['../group__ops.html#ga20f1a1a8c0cd6206485f9363f3915faa',1,'mlx::core']]],
+  ['signal_53',['signal',['../classmlx_1_1core_1_1_event.html#a65a858445506a61be5889ae0e3651b89',1,'mlx::core::Event']]],
+  ['signaling_5fnan_54',['signaling_NaN',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ad1f76a43c7d51a3765174aa6e0dd9f80',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['simd_5fbroadcast_55',['simd_broadcast',['../namespacemetal.html#a498f1e85107eb5f01ba4435977f8efe0',1,'metal']]],
+  ['simd_5fexclusive_5fscan_56',['simd_exclusive_scan',['../struct_cum_prod_3_01bool_01_4.html#a1a86e9398bae24182b7be0a6577bf223',1,'CumProd&lt; bool &gt;::simd_exclusive_scan()'],['../struct_cum_max.html#ae11b67aa6c998e9a01615b2a79af4403',1,'CumMax::simd_exclusive_scan()'],['../struct_cum_min.html#a83e65017ff33018b585c043fb803773b',1,'CumMin::simd_exclusive_scan()']]],
+  ['simd_5fmax_57',['simd_max',['../namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49',1,'metal']]],
+  ['simd_5fmin_58',['simd_min',['../namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b',1,'metal']]],
+  ['simd_5fprefix_5fexclusive_5fproduct_59',['simd_prefix_exclusive_product',['../namespacemetal.html#a5ca40242390b632f737e29636829b2e4',1,'metal']]],
+  ['simd_5fprefix_5fexclusive_5fsum_60',['simd_prefix_exclusive_sum',['../namespacemetal.html#abfbb70c7471f28bf7ff36a612ad014b2',1,'metal']]],
+  ['simd_5fprefix_5finclusive_5fproduct_61',['simd_prefix_inclusive_product',['../namespacemetal.html#a6ca6a7e1996228fa536e969e9e45c446',1,'metal']]],
+  ['simd_5fprefix_5finclusive_5fsum_62',['simd_prefix_inclusive_sum',['../namespacemetal.html#a567acb18199ac0107712eb8cb8aeb8e9',1,'metal']]],
+  ['simd_5fproduct_63',['simd_product',['../namespacemetal.html#ac6e883a04e2265a9790d7db76059e1b4',1,'metal']]],
+  ['simd_5fscan_64',['simd_scan',['../struct_cum_prod_3_01bool_01_4.html#abeb5ec4237b330e7219f4e881cf10d7a',1,'CumProd&lt; bool &gt;::simd_scan()'],['../struct_cum_max.html#adc9ec8bb09b4433d4c2f03022c43d781',1,'CumMax::simd_scan()'],['../struct_cum_min.html#a0a1005d91b1c90e90e2c6dbd6c296649',1,'CumMin::simd_scan()']]],
+  ['simd_5fshuffle_65',['simd_shuffle',['../namespacemetal.html#a259ed115bc3c58f88eb35830916b26d4',1,'metal::simd_shuffle()'],['../backend_2metal_2kernels_2utils_8h.html#a71986ecdd7d18f975dd22c3df7421ce2',1,'simd_shuffle(uint64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a3bdbdfeb7a1dde40cd3ce1df8d9213b5',1,'simd_shuffle(int64_t data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab4cbcdb054f9165130da91a3334da0cf',1,'simd_shuffle(bool data, uint16_t lane):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ab8175b66bcc080fb89f738143568c30b',1,'simd_shuffle(complex64_t data, uint16_t lane):&#160;utils.h']]],
+  ['simd_5fshuffle_5fand_5ffill_5fdown_66',['simd_shuffle_and_fill_down',['../namespacemetal.html#ae29a06f0eac636ad7af21dea5b04938b',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a0ee6239fa29a5f9ee0201e0dc5ddc8e0',1,'metal::simd_shuffle_and_fill_down(bfloat16_t data, bfloat16_t filling_data, ushort delta)']]],
+  ['simd_5fshuffle_5fand_5ffill_5fup_67',['simd_shuffle_and_fill_up',['../namespacemetal.html#a1ca14116bf50639b214d8414b5bbaaa6',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta, ushort modulo)'],['../namespacemetal.html#a5138d5cdc18139e135707916a243cd8e',1,'metal::simd_shuffle_and_fill_up(bfloat16_t data, bfloat16_t filling_data, ushort delta)'],['../backend_2metal_2kernels_2utils_8h.html#a5862d5ea154c9b76cf56a630cf6385b4',1,'simd_shuffle_and_fill_up(uint64_t data, uint64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a7bb56415c5412a6a26f70a990915f064',1,'simd_shuffle_and_fill_up(int64_t data, int64_t filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad55bd473647f2c6c68e65e5312c132d1',1,'simd_shuffle_and_fill_up(bool data, bool filling, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a94e02a6ae8c39cbf4cb23aa44df9dbd5',1,'simd_shuffle_and_fill_up(complex64_t data, complex64_t filling, uint16_t delta):&#160;utils.h']]],
+  ['simd_5fshuffle_5fdown_68',['simd_shuffle_down',['../namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c',1,'metal::simd_shuffle_down()'],['../backend_2metal_2kernels_2utils_8h.html#aba6279624b1d30c525efee856a222b5c',1,'simd_shuffle_down(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a0c1e4d782fcc56e1ab5565cef12430dd',1,'simd_shuffle_down(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a48ae83a8caf5c74810df60b6c6cdb062',1,'simd_shuffle_down(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ad9a671a5f9aaa729ae7a77026f16bcb0',1,'simd_shuffle_down(complex64_t data, uint16_t delta):&#160;utils.h']]],
+  ['simd_5fshuffle_5frotate_5fdown_69',['simd_shuffle_rotate_down',['../namespacemetal.html#a4bb203647a421032db47e73cd649841b',1,'metal']]],
+  ['simd_5fshuffle_5frotate_5fup_70',['simd_shuffle_rotate_up',['../namespacemetal.html#a729b22077d6c944491a6027c18ea80c9',1,'metal']]],
+  ['simd_5fshuffle_5fup_71',['simd_shuffle_up',['../namespacemetal.html#afe81c5fbde3f4890458b081909242c55',1,'metal::simd_shuffle_up()'],['../backend_2metal_2kernels_2utils_8h.html#a39e436e0a942912266aae7e0bd82d7c0',1,'simd_shuffle_up(uint64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a617f3857caf33c569afa6148135f8b7a',1,'simd_shuffle_up(int64_t data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#ae0f5c42020275a588234e69f1eb7a485',1,'simd_shuffle_up(bool data, uint16_t delta):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a92b455bac6a23af51c35ea83de2383eb',1,'simd_shuffle_up(complex64_t data, uint16_t delta):&#160;utils.h']]],
+  ['simd_5fshuffle_5fxor_72',['simd_shuffle_xor',['../namespacemetal.html#a5017efc9605e069cfb507137cd1a1852',1,'metal']]],
+  ['simd_5fsum_73',['simd_sum',['../namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5',1,'metal']]],
+  ['simd_5fxor_74',['simd_xor',['../namespacemetal.html#a1308decbf2d5c33d34d6be523ea1c30f',1,'metal']]],
+  ['simple_5fiter_75',['simple_iter',['../classpocketfft_1_1detail_1_1simple__iter.html#a1e455c615825bebd5f1f62665027b398',1,'pocketfft::detail::simple_iter']]],
+  ['sin_76',['Sin',['../classmlx_1_1core_1_1_sin.html#a10d1ecc0ca96e79cdf55b57073d126ea',1,'mlx::core::Sin']]],
+  ['sin_77',['sin',['../namespacepocketfft_1_1detail.html#a07745f4a069f811859308281b2982258',1,'pocketfft::detail::sin()'],['../namespacemetal.html#a619a159ca5f2ddfe3647d3a6bb6e804c',1,'metal::sin()'],['../namespacemetal_1_1fast.html#a3af771cfe7a135104f9d063147dba270',1,'metal::fast::sin()'],['../namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c',1,'metal::precise::sin()'],['../group__ops.html#gaebf0a73ad3732fba39df37826c235692',1,'mlx::core::sin()']]],
+  ['sincos_5f2pibyn_78',['sincos_2pibyn',['../classpocketfft_1_1detail_1_1sincos__2pibyn.html#a88518f2182d854c557edacd4ab8cbc40',1,'pocketfft::detail::sincos_2pibyn']]],
+  ['sinh_79',['Sinh',['../classmlx_1_1core_1_1_sinh.html#a4a4f6814d403c2ce5d6c574b0dca3c96',1,'mlx::core::Sinh']]],
+  ['sinh_80',['sinh',['../namespacemetal.html#a83ba4235ae350ab8880a9df09158620b',1,'metal::sinh()'],['../namespacemetal_1_1fast.html#a990d90b3440e38d1fb4ff5065c6c189b',1,'metal::fast::sinh()'],['../namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c',1,'metal::precise::sinh()'],['../group__ops.html#gaf532375c6563dbd6e329bdedf0224dd7',1,'mlx::core::sinh()']]],
+  ['sinpi_81',['sinpi',['../namespacemetal.html#ae9655f7fa2ba6c0625ca25fbb278e269',1,'metal::sinpi()'],['../namespacemetal_1_1fast.html#ab07a32fe544aa304577d29e0251e87b2',1,'metal::fast::sinpi()'],['../namespacemetal_1_1precise.html#a78b17dab93519d9c82c2575dafec49c9',1,'metal::precise::sinpi()']]],
+  ['size_82',['size',['../classpocketfft_1_1detail_1_1arr.html#a95bca00060957f540ff25b69632c6952',1,'pocketfft::detail::arr::size()'],['../classpocketfft_1_1detail_1_1arr__info.html#a003a7106f7fa59a3c55ac1f0116313a5',1,'pocketfft::detail::arr_info::size()'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a2adf9a9c968f113dde830cc0dc27dcc6',1,'mlx::core::allocator::Allocator::size()'],['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#aafa92e8310db089b1ac72b840777e26b',1,'mlx::core::allocator::CommonAllocator::size()'],['../classmlx_1_1core_1_1array.html#a598f87161926d9e0b516860f0ea2c8f6',1,'mlx::core::array::size()'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a51f6587e8065be16f0418ca42a796e05',1,'mlx::core::metal::MetalAllocator::size()'],['../structmlx_1_1core_1_1distributed_1_1_group.html#abd96a09217e3d1bcc522888257d22cef',1,'mlx::core::distributed::Group::size()'],['../structmlx_1_1core_1_1_dtype.html#ab54051563d85212c7f0f049166bc9971',1,'mlx::core::Dtype::size()']]],
+  ['size_5fof_83',['size_of',['../namespacemlx_1_1core.html#add4794cc0ffe5d717fc146084a235d95',1,'mlx::core']]],
+  ['slice_84',['Slice',['../classmlx_1_1core_1_1_slice.html#a8a38feb7bb6b72bdeebb83f053e2fd7f',1,'mlx::core::Slice']]],
+  ['slice_85',['slice',['../group__ops.html#gad66135407dbb41b3c5d2cdfd51226c21',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#gaa97ce866c5e38b92b093e9321affcc57',1,'mlx::core::slice(const array &amp;a, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
+  ['slice_5fgpu_86',['slice_gpu',['../namespacemlx_1_1core.html#a59048c5ff114c101a496bf33f62e3de9',1,'mlx::core']]],
+  ['slice_5fupdate_87',['slice_update',['../group__ops.html#ga3875660e4ce2c8add8bfcf8144078708',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, std::vector&lt; int &gt; strides, StreamOrDevice s={})'],['../group__ops.html#ga03ffbbb4d989a463ef43f41ebf7eabef',1,'mlx::core::slice_update(const array &amp;src, const array &amp;update, std::vector&lt; int &gt; start, std::vector&lt; int &gt; stop, StreamOrDevice s={})']]],
+  ['sliceupdate_88',['SliceUpdate',['../classmlx_1_1core_1_1_slice_update.html#aa30a7f22f557c56e1a2b5fcf44488990',1,'mlx::core::SliceUpdate']]],
+  ['softmax_89',['Softmax',['../classmlx_1_1core_1_1_softmax.html#a4ec686aac4e06f0dfe2cbd6801af40eb',1,'mlx::core::Softmax']]],
+  ['softmax_90',['softmax',['../namespacemlx_1_1core_1_1metal.html#a4fe937c2c584fd646926057f31d54ca6',1,'mlx::core::metal::softmax()'],['../group__ops.html#ga7e9bb08b43c8fd0444b7d3c9e09dc1c6',1,'mlx::core::softmax(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga1ae3614d07d873892a530d14c3857d0b',1,'mlx::core::softmax(const array &amp;a, bool precise=false, StreamOrDevice s={})'],['../group__ops.html#ga06f570d73716a24303e6de3aaba4457b',1,'mlx::core::softmax(const array &amp;a, int axis, bool precise=false, StreamOrDevice s={})']]],
+  ['softmax_5fexp_91',['softmax_exp',['../kernels_2softmax_8h.html#a440d4031ee5e86159a4dd715e44a438b',1,'softmax.h']]],
+  ['softmax_5flooped_92',['softmax_looped',['../kernels_2softmax_8h.html#a8c47b0924ebfeebcca25f3dd17373276',1,'softmax.h']]],
+  ['softmax_5fsingle_5frow_93',['softmax_single_row',['../kernels_2softmax_8h.html#a815fe70f879f318e5d6e99acf043f52b',1,'softmax.h']]],
+  ['sort_94',['Sort',['../classmlx_1_1core_1_1_sort.html#a62943032dbd72e85ceb9b4b7211f4a44',1,'mlx::core::Sort']]],
+  ['sort_95',['sort',['../struct_thread_sort.html#ad9ab3e6b47f7e9b91c0f3b773596986d',1,'ThreadSort::sort()'],['../struct_block_merge_sort.html#acc970f5eb963f7f2010f5ae5ea8b8bc0',1,'BlockMergeSort::sort()'],['../namespacemlx_1_1core_1_1metal.html#ab77c9a9ecaeeab8c66b712862777c24b',1,'mlx::core::metal::sort()'],['../group__ops.html#ga7fb616054665b3c2d61fa234f501f079',1,'mlx::core::sort(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaae1bc47aa737f705d0e5884270063fea',1,'mlx::core::sort(const array &amp;a, int axis, StreamOrDevice s={})']]],
+  ['special_5fmul_96',['special_mul',['../structpocketfft_1_1detail_1_1cmplx.html#a2e79f5c73c1d926361ad126cf57c8874',1,'pocketfft::detail::cmplx::special_mul()'],['../namespacepocketfft_1_1detail.html#a8da1f3d4a0b712a0285529f24187fe76',1,'pocketfft::detail::special_mul()']]],
+  ['split_97',['Split',['../classmlx_1_1core_1_1_split.html#a897c746ecfdff5119cc5ae3f20499385',1,'mlx::core::Split']]],
+  ['split_98',['split',['../structmlx_1_1core_1_1distributed_1_1_group.html#abbf40f8979488806bc5bca9ecc4130e9',1,'mlx::core::distributed::Group::split()'],['../group__ops.html#ga7534290bceab5fb3831a05d67bebce7d',1,'mlx::core::split(const array &amp;a, int num_splits, int axis, StreamOrDevice s={})'],['../group__ops.html#ga56882d24e5fde59c266774624c892d41',1,'mlx::core::split(const array &amp;a, int num_splits, StreamOrDevice s={})'],['../group__ops.html#ga2cfcb1a53924882e30476c9016c5de74',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, int axis, StreamOrDevice s={})'],['../group__ops.html#gac324dfa3e26d3a14a35ab7962e36f0e1',1,'mlx::core::split(const array &amp;a, const std::vector&lt; int &gt; &amp;indices, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a42847b435d037a977592e355eed072af',1,'mlx::core::random::split(const array &amp;key, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a7ec057064c7326c41b536f08178861e5',1,'mlx::core::random::split(const array &amp;key, int num, StreamOrDevice s={})']]],
+  ['sqrt_99',['Sqrt',['../classmlx_1_1core_1_1_sqrt.html#a6682a7c31ca427c9d2c5ddb6a479bf29',1,'mlx::core::Sqrt']]],
+  ['sqrt_100',['sqrt',['../namespacepocketfft_1_1detail.html#a774f8b73f28259d4276bd188b540a3e3',1,'pocketfft::detail::sqrt()'],['../namespacemetal.html#ab3f4d4852ca0e591104fbd8e5b50d31b',1,'metal::sqrt()'],['../namespacemetal_1_1fast.html#a4218a85c7d8a74cb8055b4755205627e',1,'metal::fast::sqrt()'],['../namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd',1,'metal::precise::sqrt()'],['../group__ops.html#ga297f853b3d90ec8ae81263977ba2ddb1',1,'mlx::core::sqrt()']]],
+  ['square_101',['Square',['../classmlx_1_1core_1_1_square.html#ab94e28d5c92e6febc1c74e525f730dc4',1,'mlx::core::Square']]],
+  ['square_102',['square',['../group__ops.html#ga1234e4c39cfa79f19d4bdb5b8ea4d45e',1,'mlx::core']]],
+  ['squeeze_103',['squeeze',['../group__ops.html#ga710daa7ec721bd4d3f326082cb195576',1,'mlx::core::squeeze(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../group__ops.html#ga700dd51b77379a3d2260a55783e8ebf3',1,'mlx::core::squeeze(const array &amp;a, int axis, StreamOrDevice s={})'],['../group__ops.html#ga58bad3c61fd85b95927a987ba1cf5dad',1,'mlx::core::squeeze(const array &amp;a, StreamOrDevice s={})']]],
+  ['stack_104',['stack',['../group__ops.html#gaf8f2ec2b98a4b59eca73d7471df6e032',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#ga82216209dce901296fc737fe8efa5c94',1,'mlx::core::stack(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
+  ['start_5fcapture_105',['start_capture',['../namespacemlx_1_1core_1_1metal.html#aa47cb5651bf3b65c46ab216b7e504d77',1,'mlx::core::metal']]],
+  ['start_5fconcurrent_106',['start_concurrent',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034',1,'mlx::core::metal::CommandEncoder']]],
+  ['status_107',['status',['../classmlx_1_1core_1_1array.html#a7102659be87e9ef62966696ab9b07dad',1,'mlx::core::array']]],
+  ['std_108',['std',['../group__ops.html#ga2a466024f8061febc0a64be557644cb0',1,'mlx::core::std(const array &amp;a, bool keepdims, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#gafdcb04d77c64405a3990078a77dd984c',1,'mlx::core::std(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga7f649970bf38b987b6ef847054f3c2f8',1,'mlx::core::std(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga62721a206df671ef5797449eea97af9f',1,'mlx::core::std(const array &amp;a, int axis, bool keepdims=false, int ddof=0, StreamOrDevice s={})']]],
+  ['steel_5fconv_109',['steel_conv',['../namespacemlx_1_1core_1_1metal.html#a92f1e559b1121d545746f81ff86eaca1',1,'mlx::core::metal']]],
+  ['steel_5fconv_5fgeneral_110',['steel_conv_general',['../namespacemlx_1_1core_1_1metal.html#a02edb6a90bdf30f4c9f0d6c25b0267b5',1,'mlx::core::metal']]],
+  ['steel_5fgemm_5ffused_111',['steel_gemm_fused',['../namespacemlx_1_1core_1_1metal.html#a17764366deed71c160fb26091400a803',1,'mlx::core::metal']]],
+  ['steel_5fgemm_5fmasked_112',['steel_gemm_masked',['../namespacemlx_1_1core_1_1metal.html#a962272ca73d26c08f76f706a128fd71f',1,'mlx::core::metal']]],
+  ['steel_5fgemm_5fsplitk_113',['steel_gemm_splitk',['../namespacemlx_1_1core_1_1metal.html#ad0dfd40ba7c09755711ceb731e57a5ac',1,'mlx::core::metal']]],
+  ['steel_5fmatmul_114',['steel_matmul',['../namespacemlx_1_1core.html#ab43a7633794498e1c6775cca829eb886',1,'mlx::core']]],
+  ['steel_5fmatmul_5fregular_115',['steel_matmul_regular',['../namespacemlx_1_1core.html#a227588758ccc9ee869dba147e830bb74',1,'mlx::core']]],
+  ['step_116',['step',['../structmlx_1_1core_1_1_contiguous_iterator.html#ae230bd52b70a0bbdf560090f8a6589ef',1,'mlx::core::ContiguousIterator']]],
+  ['stop_5fcapture_117',['stop_capture',['../namespacemlx_1_1core_1_1metal.html#ac90714424e36fb01e04550de69b8314f',1,'mlx::core::metal']]],
+  ['stop_5fgradient_118',['stop_gradient',['../group__ops.html#ga36bc28f1deb2fe668ca9ae1e447b6b1f',1,'mlx::core']]],
+  ['stopgradient_119',['StopGradient',['../classmlx_1_1core_1_1_stop_gradient.html#ac70d1ab819d04e00f76bc25aeebaf84f',1,'mlx::core::StopGradient']]],
+  ['store_120',['store',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98',1,'mlx::steel::MMATile::store(threadgroup U *dst) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f',1,'mlx::steel::MMATile::store(device U *dst, const int ld) const'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98',1,'mlx::steel::MMATile::store(threadgroup U *dst) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f',1,'mlx::steel::MMATile::store(device U *dst, const int ld) const']]],
+  ['store_5fresult_121',['store_result',['../structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3',1,'mlx::steel::BlockMMA::store_result(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const']]],
+  ['store_5fresult_5fsafe_122',['store_result_safe',['../structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, short2 dst_tile_dims)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391',1,'mlx::steel::BlockMMA::store_result_safe(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const']]],
+  ['store_5fsafe_123',['store_safe',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba',1,'mlx::steel::MMATile::store_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::store_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba',1,'mlx::steel::MMATile::store_safe()']]],
+  ['stream_124',['Stream',['../structmlx_1_1core_1_1_stream.html#a7f0815ff4886da74cbbff5f93d82dd3e',1,'mlx::core::Stream']]],
+  ['stream_125',['stream',['../classmlx_1_1core_1_1_event.html#a193143bad31b68c699fa27f135b45614',1,'mlx::core::Event::stream()'],['../classmlx_1_1core_1_1_primitive.html#a46e6257397a662528f9f831842ac456a',1,'mlx::core::Primitive::stream()']]],
+  ['streamcontext_126',['StreamContext',['../structmlx_1_1core_1_1_stream_context.html#a89d803151e9d7dce29382aa83d5c6ef1',1,'mlx::core::StreamContext']]],
+  ['streamthread_127',['StreamThread',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#ac528109a11abcb82e6e221c5efa4493c',1,'mlx::core::scheduler::StreamThread']]],
+  ['stride_128',['stride',['../classpocketfft_1_1detail_1_1arr__info.html#a9d10aa83a1117e75d36f7396b8c2a093',1,'pocketfft::detail::arr_info::stride() const'],['../classpocketfft_1_1detail_1_1arr__info.html#ac1f6a9bd6703eceef6003f5f6315d39b',1,'pocketfft::detail::arr_info::stride(size_t i) const']]],
+  ['stride_5fin_129',['stride_in',['../classpocketfft_1_1detail_1_1multi__iter.html#ac947f03b1cfcb63436a7e61ff020a88c',1,'pocketfft::detail::multi_iter']]],
+  ['stride_5fout_130',['stride_out',['../classpocketfft_1_1detail_1_1multi__iter.html#a81d71a13bf0b85e556fbb9834167ecc7',1,'pocketfft::detail::multi_iter']]],
+  ['strided_5freduce_5fgeneral_5fdispatch_131',['strided_reduce_general_dispatch',['../namespacemlx_1_1core.html#aa0332c64ee9965f05026c30a0b778000',1,'mlx::core']]],
+  ['strided_5fscan_132',['strided_scan',['../scan_8h.html#a7abb6ffb6c3b96b88c2a63cd4cc2f7ae',1,'scan.h']]],
+  ['strides_133',['strides',['../classmlx_1_1core_1_1array.html#a186cf2648da92584d5c1c8b24e69629b',1,'mlx::core::array::strides() const'],['../classmlx_1_1core_1_1array.html#a919f850ca087d1c40aa68f854cb30be2',1,'mlx::core::array::strides(int dim) const']]],
+  ['submit_134',['submit',['../classpocketfft_1_1detail_1_1threading_1_1thread__pool.html#a8698d49e8f406cdb88006aac6a91f9a4',1,'pocketfft::detail::threading::thread_pool']]],
+  ['subtract_135',['Subtract',['../classmlx_1_1core_1_1_subtract.html#a834854757394f8de7082af65bf86ed9c',1,'mlx::core::Subtract']]],
+  ['subtract_136',['subtract',['../group__ops.html#ga196c240d3d0fcbb4713802c485e15133',1,'mlx::core']]],
+  ['sum_137',['sum',['../namespacemlx_1_1steel.html#ab4a6ddea4beb7c447cf5b69b9d46cc3b',1,'mlx::steel::sum(T x)'],['../namespacemlx_1_1steel.html#acd6e194d37b617d7a5818bc384a97fe4',1,'mlx::steel::sum(T x, Us... us)'],['../group__ops.html#gade905ee92eb6ab7edfc312aeddfbaeb6',1,'mlx::core::sum(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga3627754d7868487bdab1bd83f05d9c81',1,'mlx::core::sum(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gaccd0a6be2c5b5128fdc2d87b5c8e67f4',1,'mlx::core::sum(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gafcd39b0bf39a56c26a967981c7ab8a8d',1,'mlx::core::sum(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['svd_138',['SVD',['../classmlx_1_1core_1_1_s_v_d.html#ae89ff583e34fa894cccb8e7a475ee6d1',1,'mlx::core::SVD']]],
+  ['svd_139',['svd',['../namespacemlx_1_1core_1_1linalg.html#a64364b880e99914cf47bf756fa8dbaf0',1,'mlx::core::linalg']]],
+  ['swapaxes_140',['swapaxes',['../group__ops.html#gabc46eed81ab6c6247903e4ec0c4ec1fb',1,'mlx::core']]],
+  ['swizzle_141',['swizzle',['../structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760',1,'mlx::steel::BlockSwizzle::swizzle(uint3 tid, const int swizzle_log)'],['../structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760',1,'mlx::steel::BlockSwizzle::swizzle(uint3 tid, const int swizzle_log)']]],
+  ['synchronize_142',['synchronize',['../namespacemlx_1_1core.html#a14287949d82ffefad0306cef5eb5f9e4',1,'mlx::core::synchronize()'],['../namespacemlx_1_1core.html#a6648a71937b055e5ff513d98056c2fb5',1,'mlx::core::synchronize(Stream)']]]
 ];
diff --git a/docs/build/html/search/functions_14.js b/docs/build/html/search/functions_14.js
index 132353d8c..884f5242e 100644
--- a/docs/build/html/search/functions_14.js
+++ b/docs/build/html/search/functions_14.js
@@ -14,10 +14,10 @@ var searchData=
   ['tell_11',['tell',['../classmlx_1_1core_1_1io_1_1_reader.html#a27697ccc1ce45da0233db3bd4f298aed',1,'mlx::core::io::Reader::tell()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a11ad80749894993232fbb5c70fd7b282',1,'mlx::core::io::Writer::tell()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a2e92131428f0ffa98fff781b8c35d9e5',1,'mlx::core::io::ParallelFileReader::tell()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#aa883a722789c962164fd0ddcc5f6ffc5',1,'mlx::core::io::FileWriter::tell()']]],
   ['tensordot_12',['tensordot',['../group__ops.html#gaf5c9735f4690327e1500e04e728fae70',1,'mlx::core::tensordot(const array &amp;a, const array &amp;b, const int axis=2, StreamOrDevice s={})'],['../group__ops.html#gad7fe00b566f89d607639c1a497cabbc6',1,'mlx::core::tensordot(const array &amp;a, const array &amp;b, const std::vector&lt; int &gt; &amp;axes_a, const std::vector&lt; int &gt; &amp;axes_b, StreamOrDevice s={})']]],
   ['ternary_13',['ternary',['../namespacemlx_1_1core_1_1metal.html#a2d1c92ba6897c0a7a428fed63279b61f',1,'mlx::core::metal']]],
-  ['ternary_5fg_14',['ternary_g',['../metal_2kernels_2ternary_8h.html#adf8b5989de971e43829875dc0097cdfb',1,'ternary.h']]],
+  ['ternary_5fg_14',['ternary_g',['../metal_2kernels_2ternary_8h.html#ab2051fd944c2e24c57d5b4af54894d72',1,'ternary.h']]],
   ['ternary_5fg_5fnd1_15',['ternary_g_nd1',['../metal_2kernels_2ternary_8h.html#a1bd5918559850f3f80e3adee2391fe6a',1,'ternary.h']]],
-  ['ternary_5fg_5fnd2_16',['ternary_g_nd2',['../metal_2kernels_2ternary_8h.html#afdf0d9d0cb21fcb3f176500785076af8',1,'ternary.h']]],
-  ['ternary_5fg_5fnd3_17',['ternary_g_nd3',['../metal_2kernels_2ternary_8h.html#a113df0c8a841b0e986900d580644e047',1,'ternary.h']]],
+  ['ternary_5fg_5fnd2_16',['ternary_g_nd2',['../metal_2kernels_2ternary_8h.html#adec9ca8a8bf527cb15d70da5857af15d',1,'ternary.h']]],
+  ['ternary_5fg_5fnd3_17',['ternary_g_nd3',['../metal_2kernels_2ternary_8h.html#a046dcbf67cd2318d45355dc7516e3ff4',1,'ternary.h']]],
   ['ternary_5fop_5fgpu_18',['ternary_op_gpu',['../namespacemlx_1_1core.html#aa63e62b6d3906e4cac871d498515a1cd',1,'mlx::core']]],
   ['ternary_5fop_5fgpu_5finplace_19',['ternary_op_gpu_inplace',['../namespacemlx_1_1core.html#a37645c0adccb3eb46844115def1a68d7',1,'mlx::core']]],
   ['ternary_5fops_20',['ternary_ops',['../namespacemlx_1_1core_1_1metal.html#a11b593b07e9a33e5f78fe4695fb99ec9',1,'mlx::core::metal']]],
@@ -39,16 +39,17 @@ var searchData=
   ['to_5fstream_36',['to_stream',['../namespacemlx_1_1core.html#a4734a596e57434492ddfe79f2cb9dbf9',1,'mlx::core']]],
   ['topk_37',['topk',['../group__ops.html#ga5487dd887c43e5341f3e68ffe47f0f5a',1,'mlx::core::topk(const array &amp;a, int k, StreamOrDevice s={})'],['../group__ops.html#ga35b8436c79ff953f6c809598b646f498',1,'mlx::core::topk(const array &amp;a, int k, int axis, StreamOrDevice s={})']]],
   ['trace_38',['trace',['../group__ops.html#gabf786129c7660ed8d5acb5499bc6fefd',1,'mlx::core::trace(const array &amp;a, int offset, int axis1, int axis2, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga5ed43c2dbf7d6cbddbaa2fd682deaafd',1,'mlx::core::trace(const array &amp;a, int offset, int axis1, int axis2, StreamOrDevice s={})'],['../group__ops.html#gaf25c00108feaafaa6350a4434cb0062e',1,'mlx::core::trace(const array &amp;a, StreamOrDevice s={})']]],
-  ['transformadd_39',['TransformAdd',['../structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae',1,'mlx::steel::TransformAdd']]],
-  ['transformaxpby_40',['TransformAxpby',['../structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9',1,'mlx::steel::TransformAxpby']]],
-  ['transpose_41',['Transpose',['../classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a',1,'mlx::core::Transpose']]],
-  ['transpose_42',['transpose',['../group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b',1,'mlx::core::transpose(const array &amp;a, std::vector&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga260ac332956f3a6bf1dfdb9095c84dc5',1,'mlx::core::transpose(const array &amp;a, std::initializer_list&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga68da0176fefbe0c0096783c6fd926c6a',1,'mlx::core::transpose(const array &amp;a, StreamOrDevice s={})']]],
-  ['tri_43',['tri',['../group__ops.html#ga4f3389e5b89e70e862e7d2b40d6c7f78',1,'mlx::core::tri(int n, int m, int k, Dtype type, StreamOrDevice s={})'],['../group__ops.html#gac19a1bd6ed6d5c7bc9d258820189dbb5',1,'mlx::core::tri(int n, Dtype type, StreamOrDevice s={})']]],
-  ['tri_5finv_44',['tri_inv',['../namespacemlx_1_1core_1_1linalg.html#aba1994571326326717b5b5e38c2e0661',1,'mlx::core::linalg']]],
-  ['tril_45',['tril',['../group__ops.html#ga83e0bb45dc770cf014531d873b78c5a2',1,'mlx::core']]],
-  ['triu_46',['triu',['../group__ops.html#gaa9df5917876eeb0cb28b7fa81f880412',1,'mlx::core']]],
-  ['trunc_47',['trunc',['../namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887',1,'metal::trunc()'],['../namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415',1,'metal::fast::trunc()'],['../namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27',1,'metal::precise::trunc()']]],
-  ['truncated_5fnormal_48',['truncated_normal',['../namespacemlx_1_1core_1_1random.html#a00aa5746bac6d729d2ba9465153bb279',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a39663eda0fd7b274d01499a7b1c9035f',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
-  ['try_5fpop_49',['try_pop',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#aa3807d46a126d229f9054c779105ea43',1,'pocketfft::detail::threading::concurrent_queue']]],
-  ['type_5fto_5fname_50',['type_to_name',['../namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae',1,'mlx::core']]]
+  ['transformadd_39',['TransformAdd',['../structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae',1,'mlx::steel::TransformAdd::TransformAdd(const float, const float)'],['../structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae',1,'mlx::steel::TransformAdd::TransformAdd(const float, const float)']]],
+  ['transformaxpby_40',['TransformAxpby',['../structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9',1,'mlx::steel::TransformAxpby::TransformAxpby(const float alpha_, const float beta_)'],['../structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9',1,'mlx::steel::TransformAxpby::TransformAxpby(const float alpha_, const float beta_)']]],
+  ['transformscale_41',['TransformScale',['../struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70',1,'TransformScale']]],
+  ['transpose_42',['Transpose',['../classmlx_1_1core_1_1_transpose.html#a1a9ba023584c61c7ac93d6dce536760a',1,'mlx::core::Transpose']]],
+  ['transpose_43',['transpose',['../group__ops.html#gac1869f3b7094869b44fe7ac4ce58638b',1,'mlx::core::transpose(const array &amp;a, std::vector&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga260ac332956f3a6bf1dfdb9095c84dc5',1,'mlx::core::transpose(const array &amp;a, std::initializer_list&lt; int &gt; axes, StreamOrDevice s={})'],['../group__ops.html#ga68da0176fefbe0c0096783c6fd926c6a',1,'mlx::core::transpose(const array &amp;a, StreamOrDevice s={})']]],
+  ['tri_44',['tri',['../group__ops.html#ga4f3389e5b89e70e862e7d2b40d6c7f78',1,'mlx::core::tri(int n, int m, int k, Dtype type, StreamOrDevice s={})'],['../group__ops.html#gac19a1bd6ed6d5c7bc9d258820189dbb5',1,'mlx::core::tri(int n, Dtype type, StreamOrDevice s={})']]],
+  ['tri_5finv_45',['tri_inv',['../namespacemlx_1_1core_1_1linalg.html#aba1994571326326717b5b5e38c2e0661',1,'mlx::core::linalg']]],
+  ['tril_46',['tril',['../group__ops.html#ga83e0bb45dc770cf014531d873b78c5a2',1,'mlx::core']]],
+  ['triu_47',['triu',['../group__ops.html#gaa9df5917876eeb0cb28b7fa81f880412',1,'mlx::core']]],
+  ['trunc_48',['trunc',['../namespacemetal.html#a93cb75a11a362bfc8310ea19c554c887',1,'metal::trunc()'],['../namespacemetal_1_1fast.html#aa62e1075e86c626d97038f16e9433415',1,'metal::fast::trunc()'],['../namespacemetal_1_1precise.html#a334183e7a2dd49b983d072d1e8ee2b27',1,'metal::precise::trunc()']]],
+  ['truncated_5fnormal_49',['truncated_normal',['../namespacemlx_1_1core_1_1random.html#a00aa5746bac6d729d2ba9465153bb279',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a39663eda0fd7b274d01499a7b1c9035f',1,'mlx::core::random::truncated_normal(const array &amp;lower, const array &amp;upper, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
+  ['try_5fpop_50',['try_pop',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#aa3807d46a126d229f9054c779105ea43',1,'pocketfft::detail::threading::concurrent_queue']]],
+  ['type_5fto_5fname_51',['type_to_name',['../namespacemlx_1_1core.html#aef60e3a8d9c987c9c338b193673d2164',1,'mlx::core::type_to_name(const Dtype &amp;t)'],['../namespacemlx_1_1core.html#af1fdfdaa5644394362e6baba30701bae',1,'mlx::core::type_to_name(const array &amp;a)']]]
 ];
diff --git a/docs/build/html/search/functions_15.js b/docs/build/html/search/functions_15.js
index 3f6a191f5..db9d76bdf 100644
--- a/docs/build/html/search/functions_15.js
+++ b/docs/build/html/search/functions_15.js
@@ -1,14 +1,16 @@
 var searchData=
 [
-  ['unary_0',['unary',['../namespacemlx_1_1core_1_1metal.html#afac64fd56ac492d6baf6de7e8a00b039',1,'mlx::core::metal']]],
-  ['unary_5fg_1',['unary_g',['../metal_2kernels_2unary_8h.html#ac965f8d3ed62f8580dbfb645e83d4ae5',1,'unary.h']]],
-  ['unary_5fop_5fgpu_2',['unary_op_gpu',['../namespacemlx_1_1core.html#aba2b4accc059f30d4dca88db9f7a6e13',1,'mlx::core']]],
-  ['unary_5fop_5fgpu_5finplace_3',['unary_op_gpu_inplace',['../namespacemlx_1_1core.html#a668fde2bd280a88f63a68b68a343d375',1,'mlx::core']]],
-  ['unary_5fops_4',['unary_ops',['../namespacemlx_1_1core_1_1metal.html#a17b471fa52ea5f24ee63e081f46528f5',1,'mlx::core::metal']]],
-  ['unary_5fv_5',['unary_v',['../metal_2kernels_2unary_8h.html#a64e4f6737edddb72122e262977ee3014',1,'unary.h']]],
-  ['unary_5fv2_6',['unary_v2',['../metal_2kernels_2unary_8h.html#a7c7690f0df9d2acc60b63be58d9c7777',1,'unary.h']]],
-  ['unaryprimitive_7',['UnaryPrimitive',['../classmlx_1_1core_1_1_unary_primitive.html#a189f6d4ed369f82a4b724a29eb056d4e',1,'mlx::core::UnaryPrimitive::UnaryPrimitive(Stream stream)'],['../classmlx_1_1core_1_1_unary_primitive.html#a9935cffc4f246d3d883bc3d26c5163f2',1,'mlx::core::UnaryPrimitive::UnaryPrimitive(const UnaryPrimitive &amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#a780281fb04e2daf1be630c124bd605e3',1,'mlx::core::UnaryPrimitive::UnaryPrimitive(UnaryPrimitive &amp;&amp;other)=delete']]],
-  ['uniform_8',['Uniform',['../classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1',1,'mlx::core::Uniform']]],
-  ['uniform_9',['uniform',['../namespacemlx_1_1core_1_1random.html#adaa626cf75ab891978954bd1eb79a38b',1,'mlx::core::random::uniform(const array &amp;low, const array &amp;high, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#ac0dd00f7e37377d621f9f5bfb5a3f8e4',1,'mlx::core::random::uniform(T low, U high, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a2922e133d9f82dcf925bae0a784cc4a7',1,'mlx::core::random::uniform(const std::vector&lt; int &gt; &amp;shape, Dtype dtype, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a133d2855ff4d8daf41029cffdf43cdf9',1,'mlx::core::random::uniform(const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
-  ['utils_10',['utils',['../namespacemlx_1_1core_1_1metal.html#a529dc6c2d4a37ba544b66b2c3cd792cc',1,'mlx::core::metal']]]
+  ['uint16_5fto_5fbfloat16_0',['uint16_to_bfloat16',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4',1,'uint16_to_bfloat16(const uint16_t x):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a8d066e48cf3e2a0583c71816fa40f7f4',1,'uint16_to_bfloat16(const uint16_t x):&#160;bf16.h']]],
+  ['unary_1',['unary',['../namespacemlx_1_1core_1_1metal.html#afac64fd56ac492d6baf6de7e8a00b039',1,'mlx::core::metal']]],
+  ['unary_5fg_2',['unary_g',['../metal_2kernels_2unary_8h.html#ac2a85fee50af49620ff62c1a71e2575d',1,'unary.h']]],
+  ['unary_5fop_5fgpu_3',['unary_op_gpu',['../namespacemlx_1_1core.html#aba2b4accc059f30d4dca88db9f7a6e13',1,'mlx::core']]],
+  ['unary_5fop_5fgpu_5finplace_4',['unary_op_gpu_inplace',['../namespacemlx_1_1core.html#a668fde2bd280a88f63a68b68a343d375',1,'mlx::core']]],
+  ['unary_5fops_5',['unary_ops',['../namespacemlx_1_1core_1_1metal.html#a17b471fa52ea5f24ee63e081f46528f5',1,'mlx::core::metal']]],
+  ['unary_5fv_6',['unary_v',['../metal_2kernels_2unary_8h.html#a64e4f6737edddb72122e262977ee3014',1,'unary.h']]],
+  ['unary_5fv2_7',['unary_v2',['../metal_2kernels_2unary_8h.html#a7c7690f0df9d2acc60b63be58d9c7777',1,'unary.h']]],
+  ['unaryprimitive_8',['UnaryPrimitive',['../classmlx_1_1core_1_1_unary_primitive.html#a189f6d4ed369f82a4b724a29eb056d4e',1,'mlx::core::UnaryPrimitive::UnaryPrimitive(Stream stream)'],['../classmlx_1_1core_1_1_unary_primitive.html#a9935cffc4f246d3d883bc3d26c5163f2',1,'mlx::core::UnaryPrimitive::UnaryPrimitive(const UnaryPrimitive &amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#a780281fb04e2daf1be630c124bd605e3',1,'mlx::core::UnaryPrimitive::UnaryPrimitive(UnaryPrimitive &amp;&amp;other)=delete']]],
+  ['uniform_9',['Uniform',['../classmlx_1_1core_1_1_uniform.html#a626aa1091aa77b4a32c02290106b85e1',1,'mlx::core::Uniform']]],
+  ['uniform_10',['uniform',['../namespacemlx_1_1core_1_1random.html#adaa626cf75ab891978954bd1eb79a38b',1,'mlx::core::random::uniform(const array &amp;low, const array &amp;high, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#ac0dd00f7e37377d621f9f5bfb5a3f8e4',1,'mlx::core::random::uniform(T low, U high, const std::vector&lt; int &gt; &amp;shape, Dtype dtype=float32, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a2922e133d9f82dcf925bae0a784cc4a7',1,'mlx::core::random::uniform(const std::vector&lt; int &gt; &amp;shape, Dtype dtype, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a133d2855ff4d8daf41029cffdf43cdf9',1,'mlx::core::random::uniform(const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
+  ['update_5ffence_11',['update_fence',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2',1,'mlx::core::metal::CommandEncoder']]],
+  ['utils_12',['utils',['../namespacemlx_1_1core_1_1metal.html#a529dc6c2d4a37ba544b66b2c3cd792cc',1,'mlx::core::metal']]]
 ];
diff --git a/docs/build/html/search/functions_16.js b/docs/build/html/search/functions_16.js
index 3642762f0..48658abe0 100644
--- a/docs/build/html/search/functions_16.js
+++ b/docs/build/html/search/functions_16.js
@@ -7,8 +7,8 @@ var searchData=
   ['var_4',['var',['../group__ops.html#ga7e133df686439588a8cd1fb10ce0c6e9',1,'mlx::core::var(const array &amp;a, bool keepdims, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga7d7b38d118fa2613214078ef0f7d5a42',1,'mlx::core::var(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga78ddeb966cbe7a5b0aa17e1de43025f2',1,'mlx::core::var(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, int ddof=0, StreamOrDevice s={})'],['../group__ops.html#ga4fbf3e3f98f2e4956faf87af320aa9d0',1,'mlx::core::var(const array &amp;a, int axis, bool keepdims=false, int ddof=0, StreamOrDevice s={})']]],
   ['view_5',['View',['../classmlx_1_1core_1_1_view.html#ad7eed156c308e9a29a8b41f965ec941e',1,'mlx::core::View']]],
   ['view_6',['view',['../group__ops.html#ga3602aa91b7b124a0b41ec1b2137a1b02',1,'mlx::core']]],
-  ['vjp_7',['vjp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225',1,'mlx::core::distributed::AllReduce::vjp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb',1,'mlx::core::distributed::AllGather::vjp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91',1,'mlx::core::fast::Custom::vjp()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb',1,'mlx::core::fast::RMSNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6',1,'mlx::core::fast::LayerNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533',1,'mlx::core::fast::RoPE::vjp()'],['../classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42',1,'mlx::core::Primitive::vjp()'],['../classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592',1,'mlx::core::Abs::vjp()'],['../classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607',1,'mlx::core::Add::vjp()'],['../classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6',1,'mlx::core::AddMM::vjp()'],['../classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92',1,'mlx::core::ArcCos::vjp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26',1,'mlx::core::ArcCosh::vjp()'],['../classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1',1,'mlx::core::ArcSin::vjp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e',1,'mlx::core::ArcSinh::vjp()'],['../classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2',1,'mlx::core::ArcTan::vjp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2',1,'mlx::core::ArcTan2::vjp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72',1,'mlx::core::ArcTanh::vjp()'],['../classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0',1,'ml
x::core::ArgPartition::vjp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a',1,'mlx::core::ArgReduce::vjp()'],['../classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18',1,'mlx::core::AsType::vjp()'],['../classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062',1,'mlx::core::AsStrided::vjp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61',1,'mlx::core::BitwiseBinary::vjp()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120',1,'mlx::core::BlockMaskedMM::vjp()'],['../classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda',1,'mlx::core::GatherMM::vjp()'],['../classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18',1,'mlx::core::Broadcast::vjp()'],['../classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb',1,'mlx::core::Ceil::vjp()'],['../classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133',1,'mlx::core::Compiled::vjp()'],['../classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0',1,'mlx::core::Concatenate::vjp()'],['../classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690',1,'mlx::core::Convolution::vjp()'],['../classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd',1,'mlx::core::Copy::vjp()'],['../classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00',1,'mlx::core::Cos::vjp()'],['../classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4',1,'mlx::core::Cosh::vjp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209',1,'mlx::core::CustomTransforms::vjp()'],['../classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0',1,'mlx::core::Depends::vjp()'],['../classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6',1,'mlx::core::Divide::vjp()'],['../classmlx_1_1core_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1',1,'mlx::core::DivMod::vjp()'],['../classmlx_1_1core_1_1_
select.html#a9b522487b78fceeca7f827cd1c29a9a3',1,'mlx::core::Select::vjp()'],['../classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6',1,'mlx::core::Remainder::vjp()'],['../classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736',1,'mlx::core::Equal::vjp()'],['../classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909',1,'mlx::core::Erf::vjp()'],['../classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189',1,'mlx::core::ErfInv::vjp()'],['../classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8',1,'mlx::core::Exp::vjp()'],['../classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43',1,'mlx::core::Expm1::vjp()'],['../classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090',1,'mlx::core::FFT::vjp()'],['../classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e',1,'mlx::core::Floor::vjp()'],['../classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969',1,'mlx::core::Full::vjp()'],['../classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426',1,'mlx::core::Gather::vjp()'],['../classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679',1,'mlx::core::Greater::vjp()'],['../classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee',1,'mlx::core::GreaterEqual::vjp()'],['../classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656',1,'mlx::core::Hadamard::vjp()'],['../classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b',1,'mlx::core::Imag::vjp()'],['../classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50',1,'mlx::core::Less::vjp()'],['../classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028',1,'mlx::core::LessEqual::vjp()'],['../classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280',1,'mlx::core::Log::vjp()'],['../classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a880',1,'mlx::core::Log1p::vjp()'],['../classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50',
1,'mlx::core::LogicalNot::vjp()'],['../classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54',1,'mlx::core::LogicalAnd::vjp()'],['../classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847',1,'mlx::core::LogicalOr::vjp()'],['../classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4',1,'mlx::core::LogAddExp::vjp()'],['../classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0',1,'mlx::core::Matmul::vjp()'],['../classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3',1,'mlx::core::Maximum::vjp()'],['../classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204',1,'mlx::core::Minimum::vjp()'],['../classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1',1,'mlx::core::Multiply::vjp()'],['../classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a',1,'mlx::core::Negative::vjp()'],['../classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b',1,'mlx::core::NotEqual::vjp()'],['../classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038',1,'mlx::core::Pad::vjp()'],['../classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9',1,'mlx::core::Partition::vjp()'],['../classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082',1,'mlx::core::Power::vjp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26',1,'mlx::core::QuantizedMatmul::vjp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab',1,'mlx::core::GatherQMM::vjp()'],['../classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe',1,'mlx::core::Real::vjp()'],['../classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365',1,'mlx::core::Reshape::vjp()'],['../classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e',1,'mlx::core::Reduce::vjp()'],['../classmlx_1_1core_1_1_round.html#af8f085e08b7fa8840c52a20b12ca35ce',1,'mlx::core::Round::vjp()'],['../classmlx_1_1core_1_1_scan.html#aaf13f72620b4
b5d6a20e1228930e848e',1,'mlx::core::Scan::vjp()'],['../classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a',1,'mlx::core::Scatter::vjp()'],['../classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf',1,'mlx::core::Sigmoid::vjp()'],['../classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce',1,'mlx::core::Sign::vjp()'],['../classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0',1,'mlx::core::Sin::vjp()'],['../classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0',1,'mlx::core::Sinh::vjp()'],['../classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f',1,'mlx::core::Slice::vjp()'],['../classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77',1,'mlx::core::SliceUpdate::vjp()'],['../classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b',1,'mlx::core::Softmax::vjp()'],['../classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358',1,'mlx::core::Sort::vjp()'],['../classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674',1,'mlx::core::Split::vjp()'],['../classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263',1,'mlx::core::Square::vjp()'],['../classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3',1,'mlx::core::Sqrt::vjp()'],['../classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b',1,'mlx::core::Subtract::vjp()'],['../classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7',1,'mlx::core::Tan::vjp()'],['../classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95',1,'mlx::core::Tanh::vjp()'],['../classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80',1,'mlx::core::Transpose::vjp()'],['../namespacemlx_1_1core.html#a1b33e2c2e3471420490cf0be2de6de18',1,'mlx::core::vjp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; 
&amp;cotangents)'],['../namespacemlx_1_1core.html#a2065a11249c3f4356ffd69b7a8c487ff',1,'mlx::core::vjp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;cotangent)']]],
-  ['vmap_8',['vmap',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a',1,'mlx::core::distributed::AllReduce::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031',1,'mlx::core::distributed::AllGather::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93',1,'mlx::core::distributed::Send::vmap()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d',1,'mlx::core::fast::Custom::vmap()'],['../classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103',1,'mlx::core::Primitive::vmap()'],['../classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f',1,'mlx::core::Abs::vmap()'],['../classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646',1,'mlx::core::Add::vmap()'],['../classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81',1,'mlx::core::AddMM::vmap()'],['../classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83',1,'mlx::core::ArcCos::vmap()'],['../classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461',1,'mlx::core::ArcCosh::vmap()'],['../classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82',1,'mlx::core::ArcSin::vmap()'],['../classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d',1,'mlx::core::ArcSinh::vmap()'],['../classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556',1,'mlx::core::ArcTan::vmap()'],['../classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634',1,'mlx::core::ArcTan2::vmap()'],['../classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040',1,'mlx::core::ArcTanh::vmap()'],['../classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a',1,'mlx::core::ArgPartition::vmap()'],['../classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba',1,'mlx::core::ArgReduce::vmap()'],['../classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e'
,1,'mlx::core::ArgSort::vmap()'],['../classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc',1,'mlx::core::AsType::vmap()'],['../classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965',1,'mlx::core::BitwiseBinary::vmap()'],['../classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f',1,'mlx::core::Broadcast::vmap()'],['../classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4',1,'mlx::core::Ceil::vmap()'],['../classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d',1,'mlx::core::Compiled::vmap()'],['../classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1',1,'mlx::core::Concatenate::vmap()'],['../classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60',1,'mlx::core::Conjugate::vmap()'],['../classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61',1,'mlx::core::Copy::vmap()'],['../classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6',1,'mlx::core::Cos::vmap()'],['../classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406',1,'mlx::core::Cosh::vmap()'],['../classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b',1,'mlx::core::CustomTransforms::vmap()'],['../classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242',1,'mlx::core::Divide::vmap()'],['../classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942',1,'mlx::core::DivMod::vmap()'],['../classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f',1,'mlx::core::Select::vmap()'],['../classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d',1,'mlx::core::Remainder::vmap()'],['../classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca',1,'mlx::core::Equal::vmap()'],['../classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa',1,'mlx::core::Erf::vmap()'],['../classmlx_1_1core_1_1_erf_inv.html#ad5d7634e8568af8cc4a54a558a48d0e9',1,'mlx::core::ErfInv::vmap()'],['../classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4
c3dbc72e514b81bb37',1,'mlx::core::Exp::vmap()'],['../classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296',1,'mlx::core::Expm1::vmap()'],['../classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1',1,'mlx::core::FFT::vmap()'],['../classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10',1,'mlx::core::Floor::vmap()'],['../classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95',1,'mlx::core::Full::vmap()'],['../classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275',1,'mlx::core::Gather::vmap()'],['../classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0',1,'mlx::core::Greater::vmap()'],['../classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d',1,'mlx::core::GreaterEqual::vmap()'],['../classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c',1,'mlx::core::Hadamard::vmap()'],['../classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3',1,'mlx::core::Imag::vmap()'],['../classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e',1,'mlx::core::Less::vmap()'],['../classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480',1,'mlx::core::LessEqual::vmap()'],['../classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49',1,'mlx::core::Log::vmap()'],['../classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71',1,'mlx::core::Log1p::vmap()'],['../classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d',1,'mlx::core::LogicalNot::vmap()'],['../classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5',1,'mlx::core::LogicalAnd::vmap()'],['../classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3',1,'mlx::core::LogicalOr::vmap()'],['../classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78',1,'mlx::core::LogAddExp::vmap()'],['../classmlx_1_1core_1_1_matmul.html#a3a1c6e70bac300240760fe41a58340c2',1,'mlx::core::Matmul::vmap()'],['../classmlx_1_1core_1_1_maximum.html#ab664918e0
d71cfec1318a9879e78c5d3',1,'mlx::core::Maximum::vmap()'],['../classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980',1,'mlx::core::Minimum::vmap()'],['../classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf',1,'mlx::core::Multiply::vmap()'],['../classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0',1,'mlx::core::Negative::vmap()'],['../classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5',1,'mlx::core::NotEqual::vmap()'],['../classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2',1,'mlx::core::NumberOfElements::vmap()'],['../classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf',1,'mlx::core::Pad::vmap()'],['../classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c',1,'mlx::core::Partition::vmap()'],['../classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f',1,'mlx::core::Power::vmap()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763',1,'mlx::core::QuantizedMatmul::vmap()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f',1,'mlx::core::GatherQMM::vmap()'],['../classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415',1,'mlx::core::RandomBits::vmap()'],['../classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6',1,'mlx::core::Real::vmap()'],['../classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d',1,'mlx::core::Reshape::vmap()'],['../classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38',1,'mlx::core::Reduce::vmap()'],['../classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd',1,'mlx::core::Round::vmap()'],['../classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804',1,'mlx::core::Scan::vmap()'],['../classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322',1,'mlx::core::Scatter::vmap()'],['../classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85',1,'mlx::core::Sigmoid::vmap()'],['.
./classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295',1,'mlx::core::Sign::vmap()'],['../classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba',1,'mlx::core::Sin::vmap()'],['../classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788',1,'mlx::core::Sinh::vmap()'],['../classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2',1,'mlx::core::Slice::vmap()'],['../classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3',1,'mlx::core::SliceUpdate::vmap()'],['../classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19',1,'mlx::core::Softmax::vmap()'],['../classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c',1,'mlx::core::Sort::vmap()'],['../classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6',1,'mlx::core::Split::vmap()'],['../classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5',1,'mlx::core::Square::vmap()'],['../classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e',1,'mlx::core::Sqrt::vmap()'],['../classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0',1,'mlx::core::StopGradient::vmap()'],['../classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098',1,'mlx::core::Subtract::vmap()'],['../classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7',1,'mlx::core::Tan::vmap()'],['../classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f',1,'mlx::core::Tanh::vmap()'],['../classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926',1,'mlx::core::Uniform::vmap()'],['../classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121',1,'mlx::core::View::vmap()'],['../classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe',1,'mlx::core::Transpose::vmap()'],['../classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8',1,'mlx::core::SVD::vmap()'],['../classmlx_1_1core_1_1_inverse.html#a98419b9f0b8a6c9185fe012d523552c2',1,'mlx::core::Inverse::vmap()'],['../classmlx_1_1core_1_1_chole
sky.html#ab5c3f6199ec3b399c91243a05d116aa5',1,'mlx::core::Cholesky::vmap()'],['../classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f',1,'mlx::core::Eigh::vmap()'],['../namespacemlx_1_1core.html#ac3caec2fa65375ed4c3bf1206177b84c',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;)&gt; &amp;fun, int in_axis=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a8481a3bb4c12c2b7dc6ba576c2be3d0d',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;, const array &amp;)&gt; &amp;fun, int in_axis_a=0, int in_axis_b=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a95a7757e8d18fced38acfc6a3e8d686a',1,'mlx::core::vmap(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;in_axes={}, const std::vector&lt; int &gt; &amp;out_axes={})']]],
+  ['vjp_7',['vjp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#abbf6d1d63dcda207ad7d9eeb4fc36225',1,'mlx::core::distributed::AllReduce::vjp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#aa5eff6fc128b71220899aab8ab9116fb',1,'mlx::core::distributed::AllGather::vjp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a74be4bcd0382f7f6400bf73fd5569c91',1,'mlx::core::fast::Custom::vjp()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#aacfbbbc15fcee0a5ce4f519ca3cca5eb',1,'mlx::core::fast::RMSNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#ae5e1b5df0705a6b1d141691a4396b0b6',1,'mlx::core::fast::LayerNorm::vjp()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#ad999105414badd66c8fd9e069454a533',1,'mlx::core::fast::RoPE::vjp()'],['../classmlx_1_1core_1_1_primitive.html#a1dcb6807326eeab62474c6a0e3836d42',1,'mlx::core::Primitive::vjp()'],['../classmlx_1_1core_1_1_abs.html#aa2dd8ec0989e716b77394ac349b34592',1,'mlx::core::Abs::vjp()'],['../classmlx_1_1core_1_1_add.html#ac28e581862880e24ed2b99bb6a916607',1,'mlx::core::Add::vjp()'],['../classmlx_1_1core_1_1_add_m_m.html#ac1562a37cec6928e01281926ebeb47c6',1,'mlx::core::AddMM::vjp()'],['../classmlx_1_1core_1_1_arc_cos.html#a78e73e5e639d1249c7fe9614bf157c92',1,'mlx::core::ArcCos::vjp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a856c677f16e2b3f2edd2491e35db2d26',1,'mlx::core::ArcCosh::vjp()'],['../classmlx_1_1core_1_1_arc_sin.html#ab4057cd5ef1a8359f97493018e10d3a1',1,'mlx::core::ArcSin::vjp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a7988ee5b9e1e7e498dcab73d61ba147e',1,'mlx::core::ArcSinh::vjp()'],['../classmlx_1_1core_1_1_arc_tan.html#a5fefc3634b96a67ff8ae011a8ee180c2',1,'mlx::core::ArcTan::vjp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a99840c282e37b2b2a9c312e6e8ade1d2',1,'mlx::core::ArcTan2::vjp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a07da5797f7aaf3dfe43bf24e8562ac72',1,'mlx::core::ArcTanh::vjp()'],['../classmlx_1_1core_1_1_arg_partition.html#ade23d014717a0b0235d00073503aeac0',1,'ml
x::core::ArgPartition::vjp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a60d272685a373e6fe879416481a1ce1a',1,'mlx::core::ArgReduce::vjp()'],['../classmlx_1_1core_1_1_as_type.html#ac38a4f889311a3b5e5be9a67dcb93e18',1,'mlx::core::AsType::vjp()'],['../classmlx_1_1core_1_1_as_strided.html#a34783284c9b2f5b4a62c3c3ee5dd4062',1,'mlx::core::AsStrided::vjp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a6131ed1c317ff8700a3e9b13fdaa9d61',1,'mlx::core::BitwiseBinary::vjp()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#a1adf20087ee2f685bf39c2724b8e7120',1,'mlx::core::BlockMaskedMM::vjp()'],['../classmlx_1_1core_1_1_gather_m_m.html#a76c9f27c57354f6230b43944882e1bda',1,'mlx::core::GatherMM::vjp()'],['../classmlx_1_1core_1_1_broadcast.html#a0318847c9be40f00b23907ad56037d18',1,'mlx::core::Broadcast::vjp()'],['../classmlx_1_1core_1_1_ceil.html#ac2f5a2bd84b8f013e5ce688419a88acb',1,'mlx::core::Ceil::vjp()'],['../classmlx_1_1core_1_1_compiled.html#a32462e65c52f84b708188130cc508133',1,'mlx::core::Compiled::vjp()'],['../classmlx_1_1core_1_1_concatenate.html#a8155db9100ec3b8bd0bc94baeaeee3b0',1,'mlx::core::Concatenate::vjp()'],['../classmlx_1_1core_1_1_contiguous.html#abf488f02057fd5852f38b2e8a600ad2a',1,'mlx::core::Contiguous::vjp()'],['../classmlx_1_1core_1_1_convolution.html#af8eb9c0c055ad20aa74b547016917690',1,'mlx::core::Convolution::vjp()'],['../classmlx_1_1core_1_1_copy.html#a6c4dee582001e9983e9517485ee37efd',1,'mlx::core::Copy::vjp()'],['../classmlx_1_1core_1_1_cos.html#a51d84113728e651ef9d4a1fe671c4d00',1,'mlx::core::Cos::vjp()'],['../classmlx_1_1core_1_1_cosh.html#a0791abd4305a333fb3b181a5357ce0f4',1,'mlx::core::Cosh::vjp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa1da36cef632df767cd9809d6cf06209',1,'mlx::core::CustomTransforms::vjp()'],['../classmlx_1_1core_1_1_depends.html#a02996fa45f01f7cb9f37074d5f8ccab0',1,'mlx::core::Depends::vjp()'],['../classmlx_1_1core_1_1_divide.html#ad3af7c70cad22c1a1a75b4a78ef793b6',1,'mlx::core::Divide::vjp()'],['../classmlx_1_1co
re_1_1_div_mod.html#a8c914a07f666a1d9377a27ed5d55e7c1',1,'mlx::core::DivMod::vjp()'],['../classmlx_1_1core_1_1_select.html#a9b522487b78fceeca7f827cd1c29a9a3',1,'mlx::core::Select::vjp()'],['../classmlx_1_1core_1_1_remainder.html#ab18f7bca1027ae71847a50da0933cec6',1,'mlx::core::Remainder::vjp()'],['../classmlx_1_1core_1_1_equal.html#af3c1bfcd1bf50922fc00e302bb193736',1,'mlx::core::Equal::vjp()'],['../classmlx_1_1core_1_1_erf.html#a1f529e95a42a2d69a8b18979d3ee2909',1,'mlx::core::Erf::vjp()'],['../classmlx_1_1core_1_1_erf_inv.html#a48afff12a58ddefae7ae0245c3580189',1,'mlx::core::ErfInv::vjp()'],['../classmlx_1_1core_1_1_exp.html#a94b9b7d137c3640d290b96c5e8b7e1a8',1,'mlx::core::Exp::vjp()'],['../classmlx_1_1core_1_1_expm1.html#af6ce416169190479c9792bb9cdbe2f43',1,'mlx::core::Expm1::vjp()'],['../classmlx_1_1core_1_1_f_f_t.html#aafc895614a6e368c0e6d64af20d01090',1,'mlx::core::FFT::vjp()'],['../classmlx_1_1core_1_1_floor.html#a589e2cf99b6fd1a5ba85534a2a31338e',1,'mlx::core::Floor::vjp()'],['../classmlx_1_1core_1_1_full.html#a49e76e7a8641f990701abc1b3bd49969',1,'mlx::core::Full::vjp()'],['../classmlx_1_1core_1_1_gather.html#aacf612a8f5f1cdbbfd19707d8d33c426',1,'mlx::core::Gather::vjp()'],['../classmlx_1_1core_1_1_greater.html#a341766a8a7e41d2a1160d35d4e781679',1,'mlx::core::Greater::vjp()'],['../classmlx_1_1core_1_1_greater_equal.html#a62f07a4ac54c708307c82aac0e5693ee',1,'mlx::core::GreaterEqual::vjp()'],['../classmlx_1_1core_1_1_hadamard.html#af4134775427b8998d66f489468b98656',1,'mlx::core::Hadamard::vjp()'],['../classmlx_1_1core_1_1_imag.html#a80da5fdd0fa549eebd7804c0e261848b',1,'mlx::core::Imag::vjp()'],['../classmlx_1_1core_1_1_less.html#aaf205d389b5e602e0814b68f66de8f50',1,'mlx::core::Less::vjp()'],['../classmlx_1_1core_1_1_less_equal.html#aab2aab7590c299885e815c18eedd1028',1,'mlx::core::LessEqual::vjp()'],['../classmlx_1_1core_1_1_log.html#a40885dccfbf928c4d035881be1d49280',1,'mlx::core::Log::vjp()'],['../classmlx_1_1core_1_1_log1p.html#a3113c1d2b4c5e73d0b470f42dc48a8
80',1,'mlx::core::Log1p::vjp()'],['../classmlx_1_1core_1_1_logical_not.html#af2c3c241cf3910fbaba013c69d052a50',1,'mlx::core::LogicalNot::vjp()'],['../classmlx_1_1core_1_1_logical_and.html#ae42f8fc454577b0fd6410cae9d5f3b54',1,'mlx::core::LogicalAnd::vjp()'],['../classmlx_1_1core_1_1_logical_or.html#a51aed488f52d5031998689af9cb17847',1,'mlx::core::LogicalOr::vjp()'],['../classmlx_1_1core_1_1_log_add_exp.html#ae231af0ed24a93eb647ee58c2d2b20b4',1,'mlx::core::LogAddExp::vjp()'],['../classmlx_1_1core_1_1_matmul.html#a524136cca481598ea20894d85ca66bb0',1,'mlx::core::Matmul::vjp()'],['../classmlx_1_1core_1_1_maximum.html#a7de15d7b28784e24bbfc7e85ddcbcff3',1,'mlx::core::Maximum::vjp()'],['../classmlx_1_1core_1_1_minimum.html#a48a0cbe3a6c4f7473c00e343f63b5204',1,'mlx::core::Minimum::vjp()'],['../classmlx_1_1core_1_1_multiply.html#a74b7556ec03e2c3d3f971666d06f5db1',1,'mlx::core::Multiply::vjp()'],['../classmlx_1_1core_1_1_negative.html#a889585f056d33bda30c30311257af52a',1,'mlx::core::Negative::vjp()'],['../classmlx_1_1core_1_1_not_equal.html#a0361f29f4ae1235bdf3f3304527e2d4b',1,'mlx::core::NotEqual::vjp()'],['../classmlx_1_1core_1_1_pad.html#ad8a7e547644f2717a24322968e971038',1,'mlx::core::Pad::vjp()'],['../classmlx_1_1core_1_1_partition.html#a7110772b6cd2d430a2b825cf5c952ca9',1,'mlx::core::Partition::vjp()'],['../classmlx_1_1core_1_1_power.html#a1453bb8307d6ff33134f1e00263bf082',1,'mlx::core::Power::vjp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#acb975e272b4a88ab232ef7f7c3a2bf26',1,'mlx::core::QuantizedMatmul::vjp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#ae08a4b7d28902d46f39e66beeb0e23ab',1,'mlx::core::GatherQMM::vjp()'],['../classmlx_1_1core_1_1_real.html#a29f6109339c5141a862ceae72c8b80fe',1,'mlx::core::Real::vjp()'],['../classmlx_1_1core_1_1_reshape.html#ab17294ecc6b5d4e89626fb48c7516365',1,'mlx::core::Reshape::vjp()'],['../classmlx_1_1core_1_1_reduce.html#a684883d2a96315f548ca769510e28e4e',1,'mlx::core::Reduce::vjp()'],['../classmlx_1_1core_1_1_round.html#af
8f085e08b7fa8840c52a20b12ca35ce',1,'mlx::core::Round::vjp()'],['../classmlx_1_1core_1_1_scan.html#aaf13f72620b4b5d6a20e1228930e848e',1,'mlx::core::Scan::vjp()'],['../classmlx_1_1core_1_1_scatter.html#a0b51287fba789bb139ed61d40a0c636a',1,'mlx::core::Scatter::vjp()'],['../classmlx_1_1core_1_1_sigmoid.html#aac2f56a4c8362e36a28e232758ca52cf',1,'mlx::core::Sigmoid::vjp()'],['../classmlx_1_1core_1_1_sign.html#aa60ac52edd739fbdf388a997acd01bce',1,'mlx::core::Sign::vjp()'],['../classmlx_1_1core_1_1_sin.html#aedefe550ab4b0687858981bc0bcfbfa0',1,'mlx::core::Sin::vjp()'],['../classmlx_1_1core_1_1_sinh.html#a6b39fdd429bbb4de389e7c904fd561f0',1,'mlx::core::Sinh::vjp()'],['../classmlx_1_1core_1_1_slice.html#a291746a527ff991b66249fb2b54b685f',1,'mlx::core::Slice::vjp()'],['../classmlx_1_1core_1_1_slice_update.html#aedcdc60a0477997a96306c02b66d3f77',1,'mlx::core::SliceUpdate::vjp()'],['../classmlx_1_1core_1_1_softmax.html#abb68c311c45ee422a7c966accde9041b',1,'mlx::core::Softmax::vjp()'],['../classmlx_1_1core_1_1_sort.html#a3a8900dce53ee4eb7a1b83806e629358',1,'mlx::core::Sort::vjp()'],['../classmlx_1_1core_1_1_split.html#a7e8730f9cffa9872fff6f8d577031674',1,'mlx::core::Split::vjp()'],['../classmlx_1_1core_1_1_square.html#abcd9516da7f02dc906368c23b0bca263',1,'mlx::core::Square::vjp()'],['../classmlx_1_1core_1_1_sqrt.html#a08a21bd2c3a016f042d95aca294e68f3',1,'mlx::core::Sqrt::vjp()'],['../classmlx_1_1core_1_1_subtract.html#a3a3322be7c3bcaa0397cf099091df16b',1,'mlx::core::Subtract::vjp()'],['../classmlx_1_1core_1_1_tan.html#a4639836cff03d73c769387d6943e92d7',1,'mlx::core::Tan::vjp()'],['../classmlx_1_1core_1_1_tanh.html#afe7b05e2b36b99c3a1b66f0cd3544e95',1,'mlx::core::Tanh::vjp()'],['../classmlx_1_1core_1_1_transpose.html#ac7805aa29b34afdf8852554f1e759f80',1,'mlx::core::Transpose::vjp()'],['../namespacemlx_1_1core.html#a1b33e2c2e3471420490cf0be2de6de18',1,'mlx::core::vjp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const 
std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;cotangents)'],['../namespacemlx_1_1core.html#a2065a11249c3f4356ffd69b7a8c487ff',1,'mlx::core::vjp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;cotangent)']]],
+  ['vmap_8',['vmap',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a3f2dc71859847ca675ec4bfbe125035a',1,'mlx::core::distributed::AllReduce::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ad532d1d51f089dec3c84799b724ea031',1,'mlx::core::distributed::AllGather::vmap()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a5cfb66191b9e8b86649da77af55b0f93',1,'mlx::core::distributed::Send::vmap()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#a7f4c3a4c48c6807faa36fb31e39dad8d',1,'mlx::core::fast::Custom::vmap()'],['../classmlx_1_1core_1_1_primitive.html#ac632b9619dd7a6a0f177bd36202e8103',1,'mlx::core::Primitive::vmap()'],['../classmlx_1_1core_1_1_abs.html#a4c9c98f1d71432fd3752ad9a6a8e7f2f',1,'mlx::core::Abs::vmap()'],['../classmlx_1_1core_1_1_add.html#a0e557d4d896153f84a25532562e4c646',1,'mlx::core::Add::vmap()'],['../classmlx_1_1core_1_1_add_m_m.html#a73ce80b3a37ec2523943028d50ebce81',1,'mlx::core::AddMM::vmap()'],['../classmlx_1_1core_1_1_arc_cos.html#a7548e23ace6827674aa6d284d44ccf83',1,'mlx::core::ArcCos::vmap()'],['../classmlx_1_1core_1_1_arc_cosh.html#af8ff78e910a9e485a203e1d3347bd461',1,'mlx::core::ArcCosh::vmap()'],['../classmlx_1_1core_1_1_arc_sin.html#a7cabb1e5a2bda44944378822c671ec82',1,'mlx::core::ArcSin::vmap()'],['../classmlx_1_1core_1_1_arc_sinh.html#a9e72b9751939387c333b5d4e19a37f6d',1,'mlx::core::ArcSinh::vmap()'],['../classmlx_1_1core_1_1_arc_tan.html#a1fb921554544a56498bc54f82e4a0556',1,'mlx::core::ArcTan::vmap()'],['../classmlx_1_1core_1_1_arc_tan2.html#ae02cb9fbf25e93dc1d7fbc9e3fb28634',1,'mlx::core::ArcTan2::vmap()'],['../classmlx_1_1core_1_1_arc_tanh.html#a6ddcae68873559211cb91e7740dfc040',1,'mlx::core::ArcTanh::vmap()'],['../classmlx_1_1core_1_1_arg_partition.html#a441093795bcc31495ab5fbc9957b740a',1,'mlx::core::ArgPartition::vmap()'],['../classmlx_1_1core_1_1_arg_reduce.html#abfec42fa06ea15edaf393593751fb1ba',1,'mlx::core::ArgReduce::vmap()'],['../classmlx_1_1core_1_1_arg_sort.html#a3522bbbe4626a467394c1a8a9d7ac34e'
,1,'mlx::core::ArgSort::vmap()'],['../classmlx_1_1core_1_1_as_type.html#a7ebaf86fd6cad4a1ecfd7cde1ee0b0cc',1,'mlx::core::AsType::vmap()'],['../classmlx_1_1core_1_1_bitwise_binary.html#aa10be55f05bc1868bf4b375dc475f965',1,'mlx::core::BitwiseBinary::vmap()'],['../classmlx_1_1core_1_1_broadcast.html#aee4c71c2588ad01eb57e10f346cd666f',1,'mlx::core::Broadcast::vmap()'],['../classmlx_1_1core_1_1_ceil.html#ae86819990b43bdb0c2b3a25719b3a7a4',1,'mlx::core::Ceil::vmap()'],['../classmlx_1_1core_1_1_compiled.html#a732e7548f53977b4513bb7f30a04c30d',1,'mlx::core::Compiled::vmap()'],['../classmlx_1_1core_1_1_concatenate.html#a58c54dcf8e4b045d25edd3afc2caffc1',1,'mlx::core::Concatenate::vmap()'],['../classmlx_1_1core_1_1_conjugate.html#a2c7632c8ae0ca07777e23a0a79344e60',1,'mlx::core::Conjugate::vmap()'],['../classmlx_1_1core_1_1_contiguous.html#a563221e90b15aa90bfae23d29c10e4ec',1,'mlx::core::Contiguous::vmap()'],['../classmlx_1_1core_1_1_copy.html#a669b10253c15b769d90058d1ad7d0e61',1,'mlx::core::Copy::vmap()'],['../classmlx_1_1core_1_1_cos.html#aec9460daf0131156734013d03b230cd6',1,'mlx::core::Cos::vmap()'],['../classmlx_1_1core_1_1_cosh.html#a1ab2386e7d96219b6e4a525f7dac0406',1,'mlx::core::Cosh::vmap()'],['../classmlx_1_1core_1_1_custom_transforms.html#a906a2ff30d9c5281fbf1fa927e4c021b',1,'mlx::core::CustomTransforms::vmap()'],['../classmlx_1_1core_1_1_divide.html#a83e7da52831165b3a026e97b63770242',1,'mlx::core::Divide::vmap()'],['../classmlx_1_1core_1_1_div_mod.html#ae709e0fdd83994bd1d156e0d0e6a7942',1,'mlx::core::DivMod::vmap()'],['../classmlx_1_1core_1_1_select.html#a84e80361c8cf02536b4b98098793550f',1,'mlx::core::Select::vmap()'],['../classmlx_1_1core_1_1_remainder.html#a79867e1099a2e3c2d3e87407b2ab6e3d',1,'mlx::core::Remainder::vmap()'],['../classmlx_1_1core_1_1_equal.html#aea9cc3c88924ac824d72c39c2e83b0ca',1,'mlx::core::Equal::vmap()'],['../classmlx_1_1core_1_1_erf.html#abe554f553356654a3e800ba368108aaa',1,'mlx::core::Erf::vmap()'],['../classmlx_1_1core_1_1_erf_inv.html#ad5d
7634e8568af8cc4a54a558a48d0e9',1,'mlx::core::ErfInv::vmap()'],['../classmlx_1_1core_1_1_exp.html#a0fcd579fe148b4c3dbc72e514b81bb37',1,'mlx::core::Exp::vmap()'],['../classmlx_1_1core_1_1_expm1.html#aa4caa848b2ea97e71ee3dd33de039296',1,'mlx::core::Expm1::vmap()'],['../classmlx_1_1core_1_1_f_f_t.html#ac32d6cc9b67289124f855ea68a61ede1',1,'mlx::core::FFT::vmap()'],['../classmlx_1_1core_1_1_floor.html#aea4dc79a65774990e775ad49519a5d10',1,'mlx::core::Floor::vmap()'],['../classmlx_1_1core_1_1_full.html#afc57ab6bd9ebdbbf042af54a59785d95',1,'mlx::core::Full::vmap()'],['../classmlx_1_1core_1_1_gather.html#abab0c4c204e66489825ce80d2194a275',1,'mlx::core::Gather::vmap()'],['../classmlx_1_1core_1_1_greater.html#a6d8267411fc4951de781f9e8e6c53aa0',1,'mlx::core::Greater::vmap()'],['../classmlx_1_1core_1_1_greater_equal.html#ab0e1be93eb01b0ce7fa83e953f5e3e1d',1,'mlx::core::GreaterEqual::vmap()'],['../classmlx_1_1core_1_1_hadamard.html#a9f1a172e6246859e813002abe9b8f99c',1,'mlx::core::Hadamard::vmap()'],['../classmlx_1_1core_1_1_imag.html#ace9906672bd88df0573653883d58ecb3',1,'mlx::core::Imag::vmap()'],['../classmlx_1_1core_1_1_less.html#a5fee5956cf087d8405359121aa62ba7e',1,'mlx::core::Less::vmap()'],['../classmlx_1_1core_1_1_less_equal.html#a3d5df21db184f2b7620cda9da1684480',1,'mlx::core::LessEqual::vmap()'],['../classmlx_1_1core_1_1_log.html#a007ddbcf911093231f607a8b9ed5cd49',1,'mlx::core::Log::vmap()'],['../classmlx_1_1core_1_1_log1p.html#a7122576f95ce479926bbbbc690891f71',1,'mlx::core::Log1p::vmap()'],['../classmlx_1_1core_1_1_logical_not.html#a5308a271619ee74df561b0aaf525915d',1,'mlx::core::LogicalNot::vmap()'],['../classmlx_1_1core_1_1_logical_and.html#aacc5f6f53ffc327b7771485e3da2a4e5',1,'mlx::core::LogicalAnd::vmap()'],['../classmlx_1_1core_1_1_logical_or.html#a6e2e77e6aaf47872b2e96b151c32daf3',1,'mlx::core::LogicalOr::vmap()'],['../classmlx_1_1core_1_1_log_add_exp.html#a82190aa1421a9734b6e9480debffac78',1,'mlx::core::LogAddExp::vmap()'],['../classmlx_1_1core_1_1_matmul.html#a3a
1c6e70bac300240760fe41a58340c2',1,'mlx::core::Matmul::vmap()'],['../classmlx_1_1core_1_1_maximum.html#ab664918e0d71cfec1318a9879e78c5d3',1,'mlx::core::Maximum::vmap()'],['../classmlx_1_1core_1_1_minimum.html#adab0f31acf68075a0be908d8eb882980',1,'mlx::core::Minimum::vmap()'],['../classmlx_1_1core_1_1_multiply.html#ae7e82c8fc8cbaf4e00c27eb54fac7dbf',1,'mlx::core::Multiply::vmap()'],['../classmlx_1_1core_1_1_negative.html#a1f8a6079e272f1a0599f88a1a8419cf0',1,'mlx::core::Negative::vmap()'],['../classmlx_1_1core_1_1_not_equal.html#ab8b57932f03c8eee664bf89adeaa43b5',1,'mlx::core::NotEqual::vmap()'],['../classmlx_1_1core_1_1_number_of_elements.html#a977d83eae845b8bd8c0b98b48cb1c6c2',1,'mlx::core::NumberOfElements::vmap()'],['../classmlx_1_1core_1_1_pad.html#a85658812a0f3275ba3eb74b7c75686cf',1,'mlx::core::Pad::vmap()'],['../classmlx_1_1core_1_1_partition.html#aa0cc55e4d4d2cb5d129d32832321df2c',1,'mlx::core::Partition::vmap()'],['../classmlx_1_1core_1_1_power.html#a5e22749592413a9adbdc877b03b87c8f',1,'mlx::core::Power::vmap()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a3434394140177b285f971c9ffe7e8763',1,'mlx::core::QuantizedMatmul::vmap()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a13ce5e138ebddb8780a034452f68892f',1,'mlx::core::GatherQMM::vmap()'],['../classmlx_1_1core_1_1_random_bits.html#a0dc12f053c6492f934bc18031412c415',1,'mlx::core::RandomBits::vmap()'],['../classmlx_1_1core_1_1_real.html#a07fbbefb6a1bc1ebd3985b24c36693b6',1,'mlx::core::Real::vmap()'],['../classmlx_1_1core_1_1_reshape.html#ae239dd3c6cab147e4af572dc58204f9d',1,'mlx::core::Reshape::vmap()'],['../classmlx_1_1core_1_1_reduce.html#abab1b5aa01ccad44f213f510c3596b38',1,'mlx::core::Reduce::vmap()'],['../classmlx_1_1core_1_1_round.html#a6fad8799a7982e1ccbe05be7cc38a7fd',1,'mlx::core::Round::vmap()'],['../classmlx_1_1core_1_1_scan.html#a297c7cc89c9bf9d186ebdebb634c7804',1,'mlx::core::Scan::vmap()'],['../classmlx_1_1core_1_1_scatter.html#a696c38b373a7a7c71bc112bd1117e322',1,'mlx::core::Scatter::vmap()
'],['../classmlx_1_1core_1_1_sigmoid.html#a12712c23037e38192cbccd2d4b14cc85',1,'mlx::core::Sigmoid::vmap()'],['../classmlx_1_1core_1_1_sign.html#aa7296045907015b4e0ae8a93e5e6e295',1,'mlx::core::Sign::vmap()'],['../classmlx_1_1core_1_1_sin.html#a45533996f3d72d9dd97d4c61cd684fba',1,'mlx::core::Sin::vmap()'],['../classmlx_1_1core_1_1_sinh.html#ae171df22bc34c32e31b8135dc4caa788',1,'mlx::core::Sinh::vmap()'],['../classmlx_1_1core_1_1_slice.html#ae33583b0db22fcfeae34dfe1c0e3eaa2',1,'mlx::core::Slice::vmap()'],['../classmlx_1_1core_1_1_slice_update.html#adbf1c76de6ab2f986758530d351d6fa3',1,'mlx::core::SliceUpdate::vmap()'],['../classmlx_1_1core_1_1_softmax.html#ad22d3dcc71054d3dba529cf2dc981e19',1,'mlx::core::Softmax::vmap()'],['../classmlx_1_1core_1_1_sort.html#abfabb9e625cc0cb9335c7454ed27505c',1,'mlx::core::Sort::vmap()'],['../classmlx_1_1core_1_1_split.html#ab7c40e02a842e83bdb4698608472c7a6',1,'mlx::core::Split::vmap()'],['../classmlx_1_1core_1_1_square.html#a55bf43f878d4741c57a08d5fef472ea5',1,'mlx::core::Square::vmap()'],['../classmlx_1_1core_1_1_sqrt.html#a9d30e306ce08980c27d98c898577017e',1,'mlx::core::Sqrt::vmap()'],['../classmlx_1_1core_1_1_stop_gradient.html#aca680c8befef81da414c4375b11b16b0',1,'mlx::core::StopGradient::vmap()'],['../classmlx_1_1core_1_1_subtract.html#aa98f960e621a767c8a03624fd292f098',1,'mlx::core::Subtract::vmap()'],['../classmlx_1_1core_1_1_tan.html#ae2f67ca2adc83b10009cf28498bf58b7',1,'mlx::core::Tan::vmap()'],['../classmlx_1_1core_1_1_tanh.html#a32df3564c1ecb858c1ba9f855376762f',1,'mlx::core::Tanh::vmap()'],['../classmlx_1_1core_1_1_uniform.html#ad795037d5b1820e98f4268f166609926',1,'mlx::core::Uniform::vmap()'],['../classmlx_1_1core_1_1_view.html#a2230d3e5f434fb2b888de50b529ac121',1,'mlx::core::View::vmap()'],['../classmlx_1_1core_1_1_transpose.html#a5ef848b69def9a246665b67e6e3ffdfe',1,'mlx::core::Transpose::vmap()'],['../classmlx_1_1core_1_1_s_v_d.html#a0366c958f6cdac8d1d9e1a4eda53fae8',1,'mlx::core::SVD::vmap()'],['../classmlx_1_1core_1_1
_inverse.html#a98419b9f0b8a6c9185fe012d523552c2',1,'mlx::core::Inverse::vmap()'],['../classmlx_1_1core_1_1_cholesky.html#ab5c3f6199ec3b399c91243a05d116aa5',1,'mlx::core::Cholesky::vmap()'],['../classmlx_1_1core_1_1_eigh.html#ab2f2ea5326e2f6045f9b7250692c240f',1,'mlx::core::Eigh::vmap()'],['../namespacemlx_1_1core.html#ac3caec2fa65375ed4c3bf1206177b84c',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;)&gt; &amp;fun, int in_axis=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a8481a3bb4c12c2b7dc6ba576c2be3d0d',1,'mlx::core::vmap(const std::function&lt; array(const array &amp;, const array &amp;)&gt; &amp;fun, int in_axis_a=0, int in_axis_b=0, int out_axis=0)'],['../namespacemlx_1_1core.html#a95a7757e8d18fced38acfc6a3e8d686a',1,'mlx::core::vmap(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;in_axes={}, const std::vector&lt; int &gt; &amp;out_axes={})']]],
   ['vmap_5freplace_9',['vmap_replace',['../namespacemlx_1_1core_1_1detail.html#a31a5582530faea230eb8acafc0f7e154',1,'mlx::core::detail']]],
   ['vmap_5ftrace_10',['vmap_trace',['../namespacemlx_1_1core_1_1detail.html#a5ba794afe1a557e0505887cfb481c515',1,'mlx::core::detail']]]
 ];
diff --git a/docs/build/html/search/functions_17.js b/docs/build/html/search/functions_17.js
index 7c63226fb..67970a5d0 100644
--- a/docs/build/html/search/functions_17.js
+++ b/docs/build/html/search/functions_17.js
@@ -1,11 +1,12 @@
 var searchData=
 [
   ['wait_0',['wait',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#af503189cc9247047fbdfc3ebf1daacc1',1,'pocketfft::detail::threading::latch::wait()'],['../classmlx_1_1core_1_1array.html#a648592006f1c92287734ba2428eaa45e',1,'mlx::core::array::wait()'],['../classmlx_1_1core_1_1_event.html#a634afd918e6ed847f354531ba9f48252',1,'mlx::core::Event::wait()']]],
-  ['wait_5ffor_5fone_1',['wait_for_one',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a01c574bb388f10d67aaaaa541894d807',1,'mlx::core::scheduler::Scheduler::wait_for_one()'],['../namespacemlx_1_1core_1_1scheduler.html#a8cc4d5fd1f5ce722b377ead1863a2291',1,'mlx::core::scheduler::wait_for_one()']]],
-  ['where_2',['where',['../group__ops.html#ga8a2056f8c9bb30914c40bcf509386491',1,'mlx::core']]],
-  ['write_3',['write',['../struct_read_writer.html#ac2ea71e41740ddc863890e3e8e6f09d0',1,'ReadWriter::write()'],['../classmlx_1_1core_1_1io_1_1_writer.html#ad9515b7f007338674de1e124cf77e125',1,'mlx::core::io::Writer::write()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#abca32838c9886f734d93430c34c07d7f',1,'mlx::core::io::FileWriter::write()'],['../struct_read_writer.html#a7a3d1396b0f83aa7506207bd6e7336bf',1,'ReadWriter::write() const'],['../struct_read_writer.html#ae1f0d3555b74998cc2d2288bce72a1f4',1,'ReadWriter::write() const']]],
-  ['write_5fpadded_4',['write_padded',['../struct_read_writer.html#a95367307acace2aa88226cf8956d2d88',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#abaf2a6ad4c88bd9f65fe1db1f73a8d87',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#a420453a56e77d6b3891ed4b5f178af9c',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const']]],
-  ['write_5fsafe_5',['write_safe',['../scan_8h.html#ae86aef08e5ebc8790031eb51eefa754c',1,'scan.h']]],
-  ['write_5fstrided_6',['write_strided',['../struct_read_writer.html#a77a4d7eac217305e22a3c25b3756ef67',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a12e7f43cd9de2d9990054184c0a32839',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a959ccaa08f2999c50cea063b01e492e4',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a5592b24dad5ad030a1e4769b0a278f35',1,'ReadWriter::write_strided(int stride, int overall_n)']]],
-  ['write_5funsafe_7',['write_unsafe',['../scan_8h.html#a8010e7bdf7a72cbd35ce7cd7ecb08e32',1,'scan.h']]]
+  ['wait_5ffor_5ffence_1',['wait_for_fence',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088',1,'mlx::core::metal::CommandEncoder']]],
+  ['wait_5ffor_5fone_2',['wait_for_one',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a01c574bb388f10d67aaaaa541894d807',1,'mlx::core::scheduler::Scheduler::wait_for_one()'],['../namespacemlx_1_1core_1_1scheduler.html#a8cc4d5fd1f5ce722b377ead1863a2291',1,'mlx::core::scheduler::wait_for_one()']]],
+  ['where_3',['where',['../group__ops.html#ga8a2056f8c9bb30914c40bcf509386491',1,'mlx::core']]],
+  ['write_4',['write',['../struct_read_writer.html#ac2ea71e41740ddc863890e3e8e6f09d0',1,'ReadWriter::write()'],['../classmlx_1_1core_1_1io_1_1_writer.html#ad9515b7f007338674de1e124cf77e125',1,'mlx::core::io::Writer::write()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#abca32838c9886f734d93430c34c07d7f',1,'mlx::core::io::FileWriter::write()'],['../struct_read_writer.html#a7a3d1396b0f83aa7506207bd6e7336bf',1,'ReadWriter::write() const'],['../struct_read_writer.html#ae1f0d3555b74998cc2d2288bce72a1f4',1,'ReadWriter::write() const']]],
+  ['write_5fpadded_5',['write_padded',['../struct_read_writer.html#a95367307acace2aa88226cf8956d2d88',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#abaf2a6ad4c88bd9f65fe1db1f73a8d87',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#a420453a56e77d6b3891ed4b5f178af9c',1,'ReadWriter::write_padded(int length, const device float2 *w_k) const']]],
+  ['write_5fsafe_6',['write_safe',['../scan_8h.html#ae86aef08e5ebc8790031eb51eefa754c',1,'scan.h']]],
+  ['write_5fstrided_7',['write_strided',['../struct_read_writer.html#a77a4d7eac217305e22a3c25b3756ef67',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a12e7f43cd9de2d9990054184c0a32839',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a959ccaa08f2999c50cea063b01e492e4',1,'ReadWriter::write_strided(int stride, int overall_n)'],['../struct_read_writer.html#a5592b24dad5ad030a1e4769b0a278f35',1,'ReadWriter::write_strided(int stride, int overall_n)']]],
+  ['write_5funsafe_8',['write_unsafe',['../scan_8h.html#a8010e7bdf7a72cbd35ce7cd7ecb08e32',1,'scan.h']]]
 ];
diff --git a/docs/build/html/search/functions_2.js b/docs/build/html/search/functions_2.js
index 10a541d60..afd8977e5 100644
--- a/docs/build/html/search/functions_2.js
+++ b/docs/build/html/search/functions_2.js
@@ -2,48 +2,51 @@ var searchData=
 [
   ['begin_0',['begin',['../classmlx_1_1core_1_1array.html#a76b258b169d7d73419ebbf85340fb914',1,'mlx::core::array']]],
   ['bernoulli_1',['bernoulli',['../namespacemlx_1_1core_1_1random.html#acb3f278fea2c4f06dea947d3bac2e9b7',1,'mlx::core::random::bernoulli(const array &amp;p, const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#aaa49f6c2af5496822fa09435e54275cb',1,'mlx::core::random::bernoulli(const array &amp;p, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#aa9e360f9cb7bd23221352ed9e31d83c2',1,'mlx::core::random::bernoulli(T p, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a37fcba120a1d246176db5256d3201cd4',1,'mlx::core::random::bernoulli(T p, const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#ad7eb4467e2f9d5f74a5607b29a935b6e',1,'mlx::core::random::bernoulli(const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
-  ['bfloat_5fbits_5fto_5ffloat_2',['bfloat_bits_to_float',['../backend_2metal_2kernels_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1',1,'bf16.h']]],
-  ['binary_3',['binary',['../namespacemlx_1_1core_1_1metal.html#a269d591ec02e2f7c0f7a718fbfa37f73',1,'mlx::core::metal']]],
-  ['binary_5fg_4',['binary_g',['../metal_2kernels_2binary_8h.html#a1f3f5d6bfbf3914f365790dd1434c10b',1,'binary_g(device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a6297badf47dece518bb4e67f02cffea8',1,'binary_g(device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fg_5fnd1_5',['binary_g_nd1',['../metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, device U *d, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary_two.h']]],
-  ['binary_5fg_5fnd2_6',['binary_g_nd2',['../metal_2kernels_2binary_8h.html#a8cd5989852ec704c6fd132ae28f4fc14',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a03f7c15a1607576755abb65c542ae347',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fg_5fnd3_7',['binary_g_nd3',['../metal_2kernels_2binary_8h.html#ac4979e60b993f7ffb602bcb91cd68bc9',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a43e5943460996c43060d1f3aa1309ba6',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fop_5fgpu_8',['binary_op_gpu',['../namespacemlx_1_1core.html#ad884f4a36308b5b4f8a5d990d2e086df',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a094876ea5a2a2445ab64efc8222da202',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
-  ['binary_5fop_5fgpu_5finplace_9',['binary_op_gpu_inplace',['../namespacemlx_1_1core.html#a8616c0b7b0fc118a75400bc86404c367',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a7e6af6624e322e7ad60a3873a66e18a3',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
-  ['binary_5fops_10',['binary_ops',['../namespacemlx_1_1core_1_1metal.html#a8db7f9cc781d4bfb08423a401665f322',1,'mlx::core::metal']]],
-  ['binary_5fss_11',['binary_ss',['../metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5',1,'binary_ss(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#af8a791ac7ca88d32cd8f4e9ac0f9ab4f',1,'binary_ss(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fsv_12',['binary_sv',['../metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141',1,'binary_sv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c',1,'binary_sv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fsv2_13',['binary_sv2',['../metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589',1,'binary_sv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891',1,'binary_sv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['binary_5ftwo_14',['binary_two',['../namespacemlx_1_1core_1_1metal.html#aed047eec38b030ec5f29b9da54abf8cb',1,'mlx::core::metal']]],
-  ['binary_5fvs_15',['binary_vs',['../metal_2kernels_2binary_8h.html#a649851d133358dd5832a73b1061b3313',1,'binary_vs(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12dbda74fa460812177ccb9aeee6e1ca',1,'binary_vs(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fvs2_16',['binary_vs2',['../metal_2kernels_2binary_8h.html#a48bd82eb10f9c623ce7d28daec4fa512',1,'binary_vs2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a273d2f31691f2c64623c2a97eab344be',1,'binary_vs2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['binary_5fvv_17',['binary_vv',['../metal_2kernels_2binary_8h.html#add6a9aeee3cb0ba909574f27fa9ecd5b',1,'binary_vv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab4324f594c007a6895540b77ad5d89d9',1,'binary_vv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
-  ['binary_5fvv2_18',['binary_vv2',['../metal_2kernels_2binary_8h.html#a19dbbf8fea68b64bdd25dc8d36865171',1,'binary_vv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12e80730e43dfaa4c79ce8d5f99edc50',1,'binary_vv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
-  ['bits_19',['bits',['../namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, int width, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a548ffed4ba3107b89885ff850ffce5f4',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
-  ['bits_5fto_5fbfloat_20',['bits_to_bfloat',['../struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca',1,'_MLX_BFloat16']]],
-  ['bitwise_5fand_21',['bitwise_and',['../group__ops.html#ga752fd2707dabb05d0308ba3d55346ada',1,'mlx::core']]],
-  ['bitwise_5for_22',['bitwise_or',['../group__ops.html#ga8af4f22c08c11c4ffab7e3d45e0f3cd6',1,'mlx::core']]],
-  ['bitwise_5fxor_23',['bitwise_xor',['../group__ops.html#ga3188638fba3a60e264baf69956a1e08b',1,'mlx::core']]],
-  ['bitwisebinary_24',['BitwiseBinary',['../classmlx_1_1core_1_1_bitwise_binary.html#a0d8b3a94951621ffcdebc6fda748a172',1,'mlx::core::BitwiseBinary']]],
-  ['block_5fmasked_5fgemm_25',['block_masked_gemm',['../steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device out_mask_t *out_mask, const device op_mask_t *lhs_mask, const device op_mask_t *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h'],['../steel__gemm__masked_8h.html#a477932e2ae9d49366f7ede6db63f9cac',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device bool *out_mask, const device bool *lhs_mask, const device bool *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h']]],
-  ['block_5fmasked_5fmm_26',['block_masked_mm',['../group__ops.html#ga6b76c8ea46b19e6866af155fa5910be6',1,'mlx::core']]],
-  ['block_5fsort_27',['block_sort',['../struct_kernel_merge_sort.html#a56b644ec66f7fb5c01b280f124304be9',1,'KernelMergeSort::block_sort()'],['../struct_kernel_multi_block_merge_sort.html#a322ed2eac315a561e0fd90af2fd577eb',1,'KernelMultiBlockMergeSort::block_sort()'],['../sort_8h.html#a93f14092416169c4449141043ac45ffd',1,'block_sort(const device T *inp, device U *out, const constant int &amp;size_sorted_axis, const constant int &amp;in_stride_sorted_axis, const constant int &amp;out_stride_sorted_axis, const constant int &amp;in_stride_segment_axis, const constant int &amp;out_stride_segment_axis, uint3 tid, uint3 lid):&#160;sort.h']]],
-  ['block_5fsort_5fnc_28',['block_sort_nc',['../sort_8h.html#a4ee3de195a6f9c33aa91ac52461808ad',1,'sort.h']]],
-  ['blockloader_29',['BlockLoader',['../structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335',1,'mlx::steel::BlockLoader']]],
-  ['blockmaskedmm_30',['BlockMaskedMM',['../classmlx_1_1core_1_1_block_masked_m_m.html#ad26509deb5306d0c5eb72477e9a57477',1,'mlx::core::BlockMaskedMM']]],
-  ['blockmma_31',['BlockMMA',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8',1,'mlx::steel::BlockMMA']]],
-  ['bluestein_5ffft_32',['bluestein_fft',['../backend_2metal_2kernels_2fft_8h.html#a0abc609e9756475800e996775a96a87e',1,'fft.h']]],
-  ['broadcast_33',['Broadcast',['../classmlx_1_1core_1_1_broadcast.html#accbab8433c93e281608a268d11afaefb',1,'mlx::core::Broadcast']]],
-  ['broadcast_5farrays_34',['broadcast_arrays',['../group__ops.html#gab783890428b596f715dc7dd2057eae99',1,'mlx::core']]],
-  ['broadcast_5fshapes_35',['broadcast_shapes',['../namespacemlx_1_1core.html#a075e07def338cd9d815182d0e6a656c0',1,'mlx::core']]],
-  ['broadcast_5fto_36',['broadcast_to',['../group__ops.html#gad256e86cc1a6e6b3832e392baa90318d',1,'mlx::core']]],
-  ['bs_5fqmm_5fn_37',['bs_qmm_n',['../quantized_8h.html#a1a66b061c46383952a0f067c3848971f',1,'quantized.h']]],
-  ['bs_5fqmm_5ft_38',['bs_qmm_t',['../quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84',1,'quantized.h']]],
-  ['bs_5fqmv_39',['bs_qmv',['../quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed',1,'quantized.h']]],
-  ['bs_5fqmv_5ffast_40',['bs_qmv_fast',['../quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7',1,'quantized.h']]],
-  ['bs_5fqvm_41',['bs_qvm',['../quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494',1,'quantized.h']]],
-  ['buffer_42',['Buffer',['../classmlx_1_1core_1_1allocator_1_1_buffer.html#ac4fc2cc6aa1368cfb74aff329d9a1300',1,'mlx::core::allocator::Buffer']]],
-  ['buffer_43',['buffer',['../classmlx_1_1core_1_1array.html#ab3daf04c27c4593d9d73c397b8484a08',1,'mlx::core::array::buffer()'],['../classmlx_1_1core_1_1array.html#a634466ce661485394f2fdc3bd6796bcd',1,'mlx::core::array::buffer() const']]],
-  ['buffer_5fsize_44',['buffer_size',['../classmlx_1_1core_1_1array.html#a914577c63755b2e862d2da68bbf8e3dd',1,'mlx::core::array']]],
-  ['build_5flib_5fname_45',['build_lib_name',['../namespacemlx_1_1core.html#a3ef23f334cb9f68a2c50524bc67c913b',1,'mlx::core']]]
+  ['bfloat16_5fto_5fuint16_2',['bfloat16_to_uint16',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088',1,'bfloat16_to_uint16(const bfloat16_t x):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a1420e191fa60d707dce327d0938e3088',1,'bfloat16_to_uint16(const bfloat16_t x):&#160;bf16.h']]],
+  ['bfloat_5fbits_5fto_5ffloat_3',['bfloat_bits_to_float',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3b33ae338dc4f223d0f3c748de07bad1',1,'bf16.h']]],
+  ['bfs_5fmax_5fwidth_4',['bfs_max_width',['../namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2',1,'mlx::core::env']]],
+  ['binary_5',['binary',['../namespacemlx_1_1core_1_1metal.html#a269d591ec02e2f7c0f7a718fbfa37f73',1,'mlx::core::metal']]],
+  ['binary_5fg_6',['binary_g',['../metal_2kernels_2binary_8h.html#ab1b49438a70f6c707c18afd5bce12bb3',1,'binary_g(device const T *a, device const T *b, device U *c, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#aaf6edb734cea627bca4f6540dc338fbd',1,'binary_g(device const T *a, device const T *b, device U *c, device U *d, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const int &amp;ndim, uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fg_5fnd1_7',['binary_g_nd1',['../metal_2kernels_2binary_8h.html#a6808bfb006cb5473da087a2758d0d867',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ad1fad37c168192b212a4294f4cf78133',1,'binary_g_nd1(device const T *a, device const T *b, device U *c, device U *d, constant const size_t &amp;a_stride, constant const size_t &amp;b_stride, uint index):&#160;binary_two.h']]],
+  ['binary_5fg_5fnd2_8',['binary_g_nd2',['../metal_2kernels_2binary_8h.html#a6cefcfee68bd62f3a6924df0cd53dd49',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a97b5613aff654d32c49225209a19bb95',1,'binary_g_nd2(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[2], constant const size_t b_strides[2], uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fg_5fnd3_9',['binary_g_nd3',['../metal_2kernels_2binary_8h.html#abb15de8250f9a259de80618c6de46dfa',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#aae07014f8dffa3649a5c7f4671e1268e',1,'binary_g_nd3(device const T *a, device const T *b, device U *c, device U *d, constant const size_t a_strides[3], constant const size_t b_strides[3], uint3 index, uint3 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fop_5fgpu_10',['binary_op_gpu',['../namespacemlx_1_1core.html#ad884f4a36308b5b4f8a5d990d2e086df',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a094876ea5a2a2445ab64efc8222da202',1,'mlx::core::binary_op_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
+  ['binary_5fop_5fgpu_5finplace_11',['binary_op_gpu_inplace',['../namespacemlx_1_1core.html#a8616c0b7b0fc118a75400bc86404c367',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs, const std::string &amp;op, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a7e6af6624e322e7ad60a3873a66e18a3',1,'mlx::core::binary_op_gpu_inplace(const std::vector&lt; array &gt; &amp;inputs, array &amp;out, const std::string &amp;op, const Stream &amp;s)']]],
+  ['binary_5fops_12',['binary_ops',['../namespacemlx_1_1core_1_1metal.html#a8db7f9cc781d4bfb08423a401665f322',1,'mlx::core::metal']]],
+  ['binary_5fss_13',['binary_ss',['../metal_2kernels_2binary_8h.html#a242b8b29a852c255467e50628c6dccf5',1,'binary_ss(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#af8a791ac7ca88d32cd8f4e9ac0f9ab4f',1,'binary_ss(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fsv_14',['binary_sv',['../metal_2kernels_2binary_8h.html#a4116c35f2e4632366d1611d5a95ba141',1,'binary_sv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab18c6ecf5065275c93701efd095c916c',1,'binary_sv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fsv2_15',['binary_sv2',['../metal_2kernels_2binary_8h.html#aa8c48b1b21d8f5a181f5443de2346589',1,'binary_sv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a08822ff98ea6f61a98b49a9e9a38b891',1,'binary_sv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['binary_5ftwo_16',['binary_two',['../namespacemlx_1_1core_1_1metal.html#aed047eec38b030ec5f29b9da54abf8cb',1,'mlx::core::metal']]],
+  ['binary_5fvs_17',['binary_vs',['../metal_2kernels_2binary_8h.html#a649851d133358dd5832a73b1061b3313',1,'binary_vs(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12dbda74fa460812177ccb9aeee6e1ca',1,'binary_vs(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fvs2_18',['binary_vs2',['../metal_2kernels_2binary_8h.html#a48bd82eb10f9c623ce7d28daec4fa512',1,'binary_vs2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a273d2f31691f2c64623c2a97eab344be',1,'binary_vs2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['binary_5fvv_19',['binary_vv',['../metal_2kernels_2binary_8h.html#add6a9aeee3cb0ba909574f27fa9ecd5b',1,'binary_vv(device const T *a, device const T *b, device U *c, uint index):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#ab4324f594c007a6895540b77ad5d89d9',1,'binary_vv(device const T *a, device const T *b, device U *c, device U *d, uint index):&#160;binary_two.h']]],
+  ['binary_5fvv2_20',['binary_vv2',['../metal_2kernels_2binary_8h.html#a19dbbf8fea68b64bdd25dc8d36865171',1,'binary_vv2(device const T *a, device const T *b, device U *c, uint2 index, uint2 grid_dim):&#160;binary.h'],['../metal_2kernels_2binary__two_8h.html#a12e80730e43dfaa4c79ce8d5f99edc50',1,'binary_vv2(device const T *a, device const T *b, device U *c, device U *d, uint2 index, uint2 grid_dim):&#160;binary_two.h']]],
+  ['bits_21',['bits',['../namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, int width, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1random.html#a548ffed4ba3107b89885ff850ffce5f4',1,'mlx::core::random::bits(const std::vector&lt; int &gt; &amp;shape, const std::optional&lt; array &gt; &amp;key=std::nullopt, StreamOrDevice s={})']]],
+  ['bits_5fto_5fbfloat_22',['bits_to_bfloat',['../struct___m_l_x___b_float16.html#a91ccb774773b65f8d4c1aea3f1c6e1ca',1,'_MLX_BFloat16']]],
+  ['bitwise_5fand_23',['bitwise_and',['../group__ops.html#ga752fd2707dabb05d0308ba3d55346ada',1,'mlx::core']]],
+  ['bitwise_5for_24',['bitwise_or',['../group__ops.html#ga8af4f22c08c11c4ffab7e3d45e0f3cd6',1,'mlx::core']]],
+  ['bitwise_5fxor_25',['bitwise_xor',['../group__ops.html#ga3188638fba3a60e264baf69956a1e08b',1,'mlx::core']]],
+  ['bitwisebinary_26',['BitwiseBinary',['../classmlx_1_1core_1_1_bitwise_binary.html#a0d8b3a94951621ffcdebc6fda748a172',1,'mlx::core::BitwiseBinary']]],
+  ['block_5fmasked_5fgemm_27',['block_masked_gemm',['../steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device out_mask_t *out_mask, const device op_mask_t *lhs_mask, const device op_mask_t *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h'],['../steel__gemm__masked_8h.html#a477932e2ae9d49366f7ede6db63f9cac',1,'block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device bool *out_mask, const device bool *lhs_mask, const device bool *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid):&#160;steel_gemm_masked.h']]],
+  ['block_5fmasked_5fmm_28',['block_masked_mm',['../group__ops.html#ga6b76c8ea46b19e6866af155fa5910be6',1,'mlx::core']]],
+  ['block_5fsort_29',['block_sort',['../struct_kernel_merge_sort.html#a56b644ec66f7fb5c01b280f124304be9',1,'KernelMergeSort::block_sort()'],['../struct_kernel_multi_block_merge_sort.html#a322ed2eac315a561e0fd90af2fd577eb',1,'KernelMultiBlockMergeSort::block_sort()'],['../sort_8h.html#a93f14092416169c4449141043ac45ffd',1,'block_sort(const device T *inp, device U *out, const constant int &amp;size_sorted_axis, const constant int &amp;in_stride_sorted_axis, const constant int &amp;out_stride_sorted_axis, const constant int &amp;in_stride_segment_axis, const constant int &amp;out_stride_segment_axis, uint3 tid, uint3 lid):&#160;sort.h']]],
+  ['block_5fsort_5fnc_30',['block_sort_nc',['../sort_8h.html#a4ee3de195a6f9c33aa91ac52461808ad',1,'sort.h']]],
+  ['blockloader_31',['BlockLoader',['../structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335',1,'mlx::steel::BlockLoader::BlockLoader(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)'],['../structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335',1,'mlx::steel::BlockLoader::BlockLoader(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)']]],
+  ['blockloadert_32',['BlockLoaderT',['../structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2',1,'mlx::steel::BlockLoaderT']]],
+  ['blockmaskedmm_33',['BlockMaskedMM',['../classmlx_1_1core_1_1_block_masked_m_m.html#ad26509deb5306d0c5eb72477e9a57477',1,'mlx::core::BlockMaskedMM']]],
+  ['blockmma_34',['BlockMMA',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8',1,'mlx::steel::BlockMMA::BlockMMA(ushort simd_group_id, ushort simd_lane_id)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8',1,'mlx::steel::BlockMMA::BlockMMA(ushort simd_group_id, ushort simd_lane_id)']]],
+  ['bluestein_5ffft_35',['bluestein_fft',['../backend_2metal_2kernels_2fft_8h.html#a0abc609e9756475800e996775a96a87e',1,'fft.h']]],
+  ['broadcast_36',['Broadcast',['../classmlx_1_1core_1_1_broadcast.html#accbab8433c93e281608a268d11afaefb',1,'mlx::core::Broadcast']]],
+  ['broadcast_5farrays_37',['broadcast_arrays',['../group__ops.html#gab783890428b596f715dc7dd2057eae99',1,'mlx::core']]],
+  ['broadcast_5fshapes_38',['broadcast_shapes',['../namespacemlx_1_1core.html#a075e07def338cd9d815182d0e6a656c0',1,'mlx::core']]],
+  ['broadcast_5fto_39',['broadcast_to',['../group__ops.html#gad256e86cc1a6e6b3832e392baa90318d',1,'mlx::core']]],
+  ['bs_5fqmm_5fn_40',['bs_qmm_n',['../quantized_8h.html#a1a66b061c46383952a0f067c3848971f',1,'quantized.h']]],
+  ['bs_5fqmm_5ft_41',['bs_qmm_t',['../quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84',1,'quantized.h']]],
+  ['bs_5fqmv_42',['bs_qmv',['../quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed',1,'quantized.h']]],
+  ['bs_5fqmv_5ffast_43',['bs_qmv_fast',['../quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7',1,'quantized.h']]],
+  ['bs_5fqvm_44',['bs_qvm',['../quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494',1,'quantized.h']]],
+  ['buffer_45',['Buffer',['../classmlx_1_1core_1_1allocator_1_1_buffer.html#ac4fc2cc6aa1368cfb74aff329d9a1300',1,'mlx::core::allocator::Buffer']]],
+  ['buffer_46',['buffer',['../classmlx_1_1core_1_1array.html#ab3daf04c27c4593d9d73c397b8484a08',1,'mlx::core::array::buffer()'],['../classmlx_1_1core_1_1array.html#a634466ce661485394f2fdc3bd6796bcd',1,'mlx::core::array::buffer() const']]],
+  ['buffer_5fsize_47',['buffer_size',['../classmlx_1_1core_1_1array.html#a914577c63755b2e862d2da68bbf8e3dd',1,'mlx::core::array']]],
+  ['build_5flib_5fname_48',['build_lib_name',['../namespacemlx_1_1core.html#a3ef23f334cb9f68a2c50524bc67c913b',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/functions_3.js b/docs/build/html/search/functions_3.js
index 2e834b06f..b74b4a074 100644
--- a/docs/build/html/search/functions_3.js
+++ b/docs/build/html/search/functions_3.js
@@ -13,15 +13,15 @@ var searchData=
   ['cholesky_10',['Cholesky',['../classmlx_1_1core_1_1_cholesky.html#a6ae2e30b85f99f4f0d7f14c7949818ab',1,'mlx::core::Cholesky']]],
   ['cholesky_11',['cholesky',['../namespacemlx_1_1core_1_1linalg.html#a46c8a4f806f0a97a4323e91189aa512b',1,'mlx::core::linalg']]],
   ['cholesky_5finv_12',['cholesky_inv',['../namespacemlx_1_1core_1_1linalg.html#aef0fe4894c5cf98792d59859c6d20511',1,'mlx::core::linalg']]],
-  ['clear_13',['clear',['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7',1,'mlx::steel::MMATile']]],
+  ['clear_13',['clear',['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7',1,'mlx::steel::MMATile::clear()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7',1,'mlx::steel::MMATile::clear()']]],
   ['clear_5fcache_14',['clear_cache',['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a447c1eb38c00d2e8e521675297f4a9b1',1,'mlx::core::metal::MetalAllocator::clear_cache()'],['../namespacemlx_1_1core_1_1metal.html#a22b3384ebd17f2fca198f81b9f1b6dc3',1,'mlx::core::metal::clear_cache()']]],
   ['clip_15',['clip',['../group__ops.html#ga157cd7c23f9b306fee2e1eb2b9bf1dd8',1,'mlx::core']]],
   ['cmplx_16',['cmplx',['../structpocketfft_1_1detail_1_1cmplx.html#a5b1ce506f1023f5254025ac81b831a2c',1,'pocketfft::detail::cmplx::cmplx()'],['../structpocketfft_1_1detail_1_1cmplx.html#a05491b4f1f22ca0bc49012f6a1c1710a',1,'pocketfft::detail::cmplx::cmplx(T r_, T i_)']]],
   ['cndarr_17',['cndarr',['../classpocketfft_1_1detail_1_1cndarr.html#abf73f1b4ddcfb27d7f85cfa441607129',1,'pocketfft::detail::cndarr']]],
-  ['col_5freduce_5f2pass_18',['col_reduce_2pass',['../reduce__col_8h.html#a0e92fc74eeaa8ee2ceb83bafc6eb1d7d',1,'reduce_col.h']]],
-  ['col_5freduce_5flongcolumn_19',['col_reduce_longcolumn',['../reduce__col_8h.html#a5b4f4c4c247ad341ff8d31dcbbbce0eb',1,'reduce_col.h']]],
-  ['col_5freduce_5flooped_20',['col_reduce_looped',['../reduce__col_8h.html#a11bfc6112ae2386ac03f5ea7b7d93385',1,'reduce_col.h']]],
-  ['col_5freduce_5fsmall_21',['col_reduce_small',['../reduce__col_8h.html#a7c378443a2b6f4d9210db8a21a9ac4f5',1,'reduce_col.h']]],
+  ['col_5freduce_5f2pass_18',['col_reduce_2pass',['../reduce__col_8h.html#a9a7be400d810700b47fc1a998032ce29',1,'reduce_col.h']]],
+  ['col_5freduce_5flongcolumn_19',['col_reduce_longcolumn',['../reduce__col_8h.html#aa3287cd98e97123b67b5d3920d984ca2',1,'reduce_col.h']]],
+  ['col_5freduce_5flooped_20',['col_reduce_looped',['../reduce__col_8h.html#ae8f9354e1c595142d05b33fe13988f02',1,'reduce_col.h']]],
+  ['col_5freduce_5fsmall_21',['col_reduce_small',['../reduce__col_8h.html#a82cd031d8014c02e61dc9a817ea6d4ec',1,'reduce_col.h']]],
   ['collapse_5fcontiguous_5fdims_22',['collapse_contiguous_dims',['../namespacemlx_1_1core.html#a38fe6ec5220d13d96c7dad7556d2b613',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; std::vector&lt; int64_t &gt; &gt; &amp;strides, int64_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#af2895f9b0083efd8221275eb8cadccbe',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; std::vector&lt; size_t &gt; &gt; &amp;strides, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a90e2b6edc0fe82230cb93f5ea39febb4',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; array &gt; &amp;xs, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#ac813412cce77fc1340dcfefc6e099276',1,'mlx::core::collapse_contiguous_dims(Arrays &amp;&amp;... xs)'],['../namespacemlx_1_1core.html#aab3cc7f3808934ae0727b920eba231bd',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; int64_t &gt; &amp;strides, int64_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a1e0cbcf109d32794ffc8efc7302ba9b0',1,'mlx::core::collapse_contiguous_dims(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; size_t &gt; &amp;strides, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())'],['../namespacemlx_1_1core.html#a4ee50bfb240512d0c0ce151dfe2c74ef',1,'mlx::core::collapse_contiguous_dims(const array &amp;a, size_t size_cap=std::numeric_limits&lt; int32_t &gt;::max())']]],
   ['commandencoder_23',['CommandEncoder',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a2334774486f447213ee997e55c2e52a3',1,'mlx::core::metal::CommandEncoder::CommandEncoder(MTL::CommandBuffer *cbuf)'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ac68ca977b5bde5434284ce7979647f14',1,'mlx::core::metal::CommandEncoder::CommandEncoder(const CommandEncoder &amp;)=delete']]],
   ['commit_5fcommand_5fbuffer_24',['commit_command_buffer',['../classmlx_1_1core_1_1metal_1_1_device.html#a95248f1387824067fd4fed23ace5ac0c',1,'mlx::core::metal::Device']]],
@@ -39,66 +39,68 @@ var searchData=
   ['complex_5fmul_5fconj_36',['complex_mul_conj',['../radix_8h.html#a0e2dfd3d1dda09f47ccc64eec35629f3',1,'radix.h']]],
   ['compute_5fstrided_5findices_37',['compute_strided_indices',['../struct_read_writer.html#a7c903fbb8b85a856ba5564d7df537cdf',1,'ReadWriter']]],
   ['concatenate_38',['Concatenate',['../classmlx_1_1core_1_1_concatenate.html#acff07853de2d31faeec7c4ca40ce0888',1,'mlx::core::Concatenate']]],
-  ['concatenate_39',['concatenate',['../group__ops.html#gabdc36fa65697d0361c8d67495de77129',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#gaa95c34ca3a8877f2c50cb60e7fa312b8',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
+  ['concatenate_39',['concatenate',['../namespacemlx_1_1core.html#a76a2e310857f60f5ea6f1388d45b964d',1,'mlx::core::concatenate(std::string &amp;acc, T first)'],['../namespacemlx_1_1core.html#aaf51544472fa87fa974686eacdd2a4a6',1,'mlx::core::concatenate(std::string &amp;acc, T first, Args... args)'],['../group__ops.html#gabdc36fa65697d0361c8d67495de77129',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, int axis, StreamOrDevice s={})'],['../group__ops.html#gaa95c34ca3a8877f2c50cb60e7fa312b8',1,'mlx::core::concatenate(const std::vector&lt; array &gt; &amp;arrays, StreamOrDevice s={})']]],
   ['concatenate_5fgpu_40',['concatenate_gpu',['../namespacemlx_1_1core.html#a050299d0d366ca5c9d09d1004dcc3e7d',1,'mlx::core']]],
   ['concurrentcontext_41',['ConcurrentContext',['../structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html#aee044d7729739c96e845823f9ecc5174',1,'mlx::core::metal::CommandEncoder::ConcurrentContext']]],
   ['conj_42',['conj',['../namespacepocketfft_1_1detail.html#a66d79051d502046a9b9f103e744dbad3',1,'pocketfft::detail']]],
   ['conjugate_43',['Conjugate',['../classmlx_1_1core_1_1_conjugate.html#a627f9e6a8729fb3ffb3ca3228d007c87',1,'mlx::core::Conjugate']]],
   ['conjugate_44',['conjugate',['../group__ops.html#ga5b596906bf8cdc8d97ed6ddc9aeb4c23',1,'mlx::core']]],
-  ['contiguous_5fscan_45',['contiguous_scan',['../scan_8h.html#a60d279b9add7d56639bb209408f09d79',1,'scan.h']]],
-  ['contiguousiterator_46',['ContiguousIterator',['../structmlx_1_1core_1_1_contiguous_iterator.html#a68794af4a442d3d8ac4647817af8e1f6',1,'mlx::core::ContiguousIterator::ContiguousIterator()'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a6cb378408b6f546eeb6ade1a4faafe3c',1,'mlx::core::ContiguousIterator::ContiguousIterator(const array &amp;a)'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a16bdacb53f65b7284068cd49d4cba292',1,'mlx::core::ContiguousIterator::ContiguousIterator(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides, int dims)']]],
-  ['conv_47',['conv',['../namespacemlx_1_1core_1_1metal.html#ab1704e853394c725668c06752ebb5c24',1,'mlx::core::metal']]],
-  ['conv1d_48',['conv1d',['../group__ops.html#ga30d47e08093c03a3676f235f9f559411',1,'mlx::core']]],
-  ['conv2d_49',['conv2d',['../group__ops.html#ga73b02833229678786e7f302d458d5a83',1,'mlx::core']]],
-  ['conv2dinputblockloadergeneral_50',['Conv2DInputBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1d83af561a483432bf8dcb42e734b23b',1,'mlx::steel::Conv2DInputBlockLoaderGeneral']]],
-  ['conv2dinputblockloaderlargefilter_51',['Conv2DInputBlockLoaderLargeFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8755116a535539744e4947bc69f9c50f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter']]],
-  ['conv2dinputblockloadersmallchannels_52',['Conv2DInputBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ab9fd3fdeab94470dde3326f1dd5c455a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels']]],
-  ['conv2dinputblockloadersmallfilter_53',['Conv2DInputBlockLoaderSmallFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0a2cbf57c51cd928722e3f06aafcf933',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
-  ['conv2dweightblockloader_54',['Conv2DWeightBlockLoader',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a9a7dca3512b64cffb6eac305d795831c',1,'mlx::steel::Conv2DWeightBlockLoader']]],
-  ['conv2dweightblockloadergeneral_55',['Conv2DWeightBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#ad0550fabbdc9297559381a5b488e9af1',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral']]],
-  ['conv2dweightblockloadersmallchannels_56',['Conv2DWeightBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae1806ea1c19713819dee83a38ab35fa6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels']]],
-  ['conv3d_57',['conv3d',['../group__ops.html#ga6e9907d2f14dc4803e4306b3dbc4b3ca',1,'mlx::core']]],
-  ['conv_5fgeneral_58',['conv_general',['../group__ops.html#ga2236e5dfc7e52e28abf6c21675d0a51e',1,'mlx::core::conv_general(array input, array weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding_lo={}, std::vector&lt; int &gt; padding_hi={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})'],['../group__ops.html#gab59f89942cd1efaadffe9e8762e3c99d',1,'mlx::core::conv_general(const array &amp;input, const array &amp;weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})']]],
-  ['conv_5ftranspose1d_59',['conv_transpose1d',['../group__ops.html#gaa30bf1adcd78d1c2595d07b215731714',1,'mlx::core']]],
-  ['conv_5ftranspose2d_60',['conv_transpose2d',['../group__ops.html#gaebb59971cb9bc45005dc1d398e4f0a3d',1,'mlx::core']]],
-  ['conv_5ftranspose3d_61',['conv_transpose3d',['../group__ops.html#ga8db814da631d9cd32a8d6563bf4ac530',1,'mlx::core']]],
-  ['convolution_62',['Convolution',['../classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef',1,'mlx::core::Convolution']]],
-  ['copy_63',['Copy',['../classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584',1,'mlx::core::Copy']]],
-  ['copy_64',['copy',['../namespacemlx_1_1core.html#a479648542a2bea151b947b18f0e79dd2',1,'mlx::core::copy()'],['../namespacemlx_1_1core_1_1metal.html#aa215e631e2680f04a591b88d91571719',1,'mlx::core::metal::copy()'],['../group__ops.html#gae306e93af12f774bd80bad6c231b09d6',1,'mlx::core::copy()']]],
-  ['copy_5fg_65',['copy_g',['../metal_2kernels_2copy_8h.html#a778ce2dbfbaa23b24bd5efbe68448c36',1,'copy.h']]],
-  ['copy_5fg_5fnd1_66',['copy_g_nd1',['../metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77',1,'copy.h']]],
-  ['copy_5fg_5fnd2_67',['copy_g_nd2',['../metal_2kernels_2copy_8h.html#aee678c7c31119f3e609685589f37490c',1,'copy.h']]],
-  ['copy_5fg_5fnd3_68',['copy_g_nd3',['../metal_2kernels_2copy_8h.html#a821f8f3f3891159a295c66fc25aed1ff',1,'copy.h']]],
-  ['copy_5fgg_69',['copy_gg',['../metal_2kernels_2copy_8h.html#a1e39c2683eeaf05955e7619fbd34aea5',1,'copy.h']]],
-  ['copy_5fgg_5fnd1_70',['copy_gg_nd1',['../metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1',1,'copy.h']]],
-  ['copy_5fgg_5fnd2_71',['copy_gg_nd2',['../metal_2kernels_2copy_8h.html#a3e2d3cc7f34f56170409b6735f51a950',1,'copy.h']]],
-  ['copy_5fgg_5fnd3_72',['copy_gg_nd3',['../metal_2kernels_2copy_8h.html#a59f43b5bffed936d7559ceb06a10aabd',1,'copy.h']]],
-  ['copy_5fgpu_73',['copy_gpu',['../namespacemlx_1_1core.html#addaa46a13ac2deb1d9ce621338320e0e',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a6a6f4e46c8fc44fdc74c50ace02bcf38',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype)']]],
-  ['copy_5fgpu_5finplace_74',['copy_gpu_inplace',['../namespacemlx_1_1core.html#a69e30f5d30a6d72ac0ffe4886f24b7ba',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a8e1ccb0ed9387b0a789311d9f8964803',1,'mlx::core::copy_gpu_inplace(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#ae55b801b09ccf55cba96278163a9b1ef',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int64_t &gt; &amp;istride, int64_t ioffset, CopyType ctype, const Stream &amp;s)']]],
-  ['copy_5fhartley_75',['copy_hartley',['../namespacepocketfft_1_1detail.html#abac3fcc8ce83800d228774f64c28d4c3',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#ae7b44d2773d9d06a9787aff01d66b3ed',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
-  ['copy_5finplace_76',['copy_inplace',['../namespacemlx_1_1core.html#a98495894a796b2cc6d022e7a03432c64',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, CopyType ctype)'],['../namespacemlx_1_1core.html#aad636e2d0b2f882cadd1b438f4daa9ed',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype)']]],
-  ['copy_5finput_77',['copy_input',['../namespacepocketfft_1_1detail.html#aff05be3064743c1143b19318ab12ad4a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; cmplx&lt; T &gt; &gt; &amp;src, cmplx&lt; vtype_t&lt; T &gt; &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a30fc708f9d8f9cfa74194925c7863c0a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, vtype_t&lt; T &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a3387bd35f237870e42b8461769e6aec4',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, T *dst)']]],
-  ['copy_5foutput_78',['copy_output',['../namespacepocketfft_1_1detail.html#a1523a037300a8da05db210b802d9cb0e',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const cmplx&lt; vtype_t&lt; T &gt; &gt; *src, ndarr&lt; cmplx&lt; T &gt; &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a21980853aca4d92ed06e3dcffe7ef660',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a310481c334e46674710ba794ad7403c0',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
-  ['copy_5fs_79',['copy_s',['../metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea',1,'copy.h']]],
-  ['copy_5fs2_80',['copy_s2',['../metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3',1,'copy.h']]],
-  ['copy_5fshared_5fbuffer_81',['copy_shared_buffer',['../classmlx_1_1core_1_1array.html#a28df7a333d90a311c49bc4bce7a1ad6d',1,'mlx::core::array::copy_shared_buffer(const array &amp;other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a92974c656c35a972ad241f80584bbd29',1,'mlx::core::array::copy_shared_buffer(const array &amp;other)']]],
-  ['copy_5fv_82',['copy_v',['../metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659',1,'copy.h']]],
-  ['copy_5fv2_83',['copy_v2',['../metal_2kernels_2copy_8h.html#aee14a5326f53d9b30b0b38e27d180ef3',1,'copy.h']]],
-  ['cos_84',['Cos',['../classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995',1,'mlx::core::Cos']]],
-  ['cos_85',['cos',['../namespacepocketfft_1_1detail.html#a499c1e8b7d79a5272af024f46c63ff9d',1,'pocketfft::detail::cos()'],['../namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3',1,'metal::cos()'],['../namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88',1,'metal::fast::cos()'],['../namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220',1,'metal::precise::cos()'],['../group__ops.html#ga39dfdf72b556012aa35ff27a94116e74',1,'mlx::core::cos()']]],
-  ['cosh_86',['Cosh',['../classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1',1,'mlx::core::Cosh']]],
-  ['cosh_87',['cosh',['../namespacemetal.html#a8a68a88cc110830d057dbd71431b93c0',1,'metal::cosh()'],['../namespacemetal_1_1fast.html#a31544ad9de28012a4ddda86e3966a77e',1,'metal::fast::cosh()'],['../namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc',1,'metal::precise::cosh()'],['../group__ops.html#ga2181b71cda88007a3092be4795ff0715',1,'mlx::core::cosh()']]],
-  ['cospi_88',['cospi',['../namespacemetal.html#a5c2f37939ad705ddea4409d3bedb8ce1',1,'metal::cospi()'],['../namespacemetal_1_1fast.html#a9906b41f75319b384ffb570cc94d67ce',1,'metal::fast::cospi()'],['../namespacemetal_1_1precise.html#a2392b78bd196efdbbac65901c4ab20e7',1,'metal::precise::cospi()']]],
-  ['cost_5fguess_89',['cost_guess',['../structpocketfft_1_1detail_1_1util.html#ad3d874bc3fb0048df2270779a15d4bd0',1,'pocketfft::detail::util']]],
-  ['count_5fdown_90',['count_down',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#a81d6597189b40410e35f3cd653fd1342',1,'pocketfft::detail::threading::latch']]],
-  ['cross_91',['cross',['../namespacemlx_1_1core_1_1linalg.html#abcda3fbda45183c21e7f27aa0dde64e6',1,'mlx::core::linalg']]],
-  ['cummax_92',['cummax',['../group__ops.html#gaee37cac8476e8f8d666bcded5bc59143',1,'mlx::core']]],
-  ['cummin_93',['cummin',['../group__ops.html#ga19c1bf6929fe8d66b9cd408946aea6a8',1,'mlx::core']]],
-  ['cumprod_94',['cumprod',['../group__ops.html#ga0d71dfbc14ef3ed564b0c5ee26af680f',1,'mlx::core']]],
-  ['cumsum_95',['cumsum',['../group__ops.html#gaddc825a5c173e195ab0fda83ad630420',1,'mlx::core']]],
-  ['custom_96',['Custom',['../classmlx_1_1core_1_1fast_1_1_custom.html#a4186fea23f7156c38960426821fca313',1,'mlx::core::fast::Custom']]],
-  ['custom_5ffunction_97',['custom_function',['../namespacemlx_1_1core.html#a8d3ca5fbaecdb995660c24cde5aeebaf',1,'mlx::core']]],
-  ['custom_5fvjp_98',['custom_vjp',['../namespacemlx_1_1core.html#a9290596250fa308df4c69b44483bb8aa',1,'mlx::core']]],
-  ['customkernel_99',['CustomKernel',['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153',1,'mlx::core::fast::CustomKernel']]],
-  ['customtransforms_100',['CustomTransforms',['../classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488',1,'mlx::core::CustomTransforms']]]
+  ['contiguous_45',['Contiguous',['../classmlx_1_1core_1_1_contiguous.html#a3e83f414c02ae0b92a50b6f8e402e1c0',1,'mlx::core::Contiguous']]],
+  ['contiguous_46',['contiguous',['../group__ops.html#ga8ab10aa6c41416d739791164a52b25d5',1,'mlx::core']]],
+  ['contiguous_5fscan_47',['contiguous_scan',['../scan_8h.html#a60d279b9add7d56639bb209408f09d79',1,'scan.h']]],
+  ['contiguousiterator_48',['ContiguousIterator',['../structmlx_1_1core_1_1_contiguous_iterator.html#a68794af4a442d3d8ac4647817af8e1f6',1,'mlx::core::ContiguousIterator::ContiguousIterator()'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a6cb378408b6f546eeb6ade1a4faafe3c',1,'mlx::core::ContiguousIterator::ContiguousIterator(const array &amp;a)'],['../structmlx_1_1core_1_1_contiguous_iterator.html#a16bdacb53f65b7284068cd49d4cba292',1,'mlx::core::ContiguousIterator::ContiguousIterator(const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides, int dims)']]],
+  ['conv_49',['conv',['../namespacemlx_1_1core_1_1metal.html#ab1704e853394c725668c06752ebb5c24',1,'mlx::core::metal']]],
+  ['conv1d_50',['conv1d',['../group__ops.html#ga30d47e08093c03a3676f235f9f559411',1,'mlx::core']]],
+  ['conv2d_51',['conv2d',['../group__ops.html#ga73b02833229678786e7f302d458d5a83',1,'mlx::core']]],
+  ['conv2dinputblockloadergeneral_52',['Conv2DInputBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1d83af561a483432bf8dcb42e734b23b',1,'mlx::steel::Conv2DInputBlockLoaderGeneral']]],
+  ['conv2dinputblockloaderlargefilter_53',['Conv2DInputBlockLoaderLargeFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8755116a535539744e4947bc69f9c50f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter']]],
+  ['conv2dinputblockloadersmallchannels_54',['Conv2DInputBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ab9fd3fdeab94470dde3326f1dd5c455a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels']]],
+  ['conv2dinputblockloadersmallfilter_55',['Conv2DInputBlockLoaderSmallFilter',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0a2cbf57c51cd928722e3f06aafcf933',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
+  ['conv2dweightblockloader_56',['Conv2DWeightBlockLoader',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a9a7dca3512b64cffb6eac305d795831c',1,'mlx::steel::Conv2DWeightBlockLoader']]],
+  ['conv2dweightblockloadergeneral_57',['Conv2DWeightBlockLoaderGeneral',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#ad0550fabbdc9297559381a5b488e9af1',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral']]],
+  ['conv2dweightblockloadersmallchannels_58',['Conv2DWeightBlockLoaderSmallChannels',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae1806ea1c19713819dee83a38ab35fa6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels']]],
+  ['conv3d_59',['conv3d',['../group__ops.html#ga6e9907d2f14dc4803e4306b3dbc4b3ca',1,'mlx::core']]],
+  ['conv_5fgeneral_60',['conv_general',['../group__ops.html#ga2236e5dfc7e52e28abf6c21675d0a51e',1,'mlx::core::conv_general(array input, array weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding_lo={}, std::vector&lt; int &gt; padding_hi={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})'],['../group__ops.html#gab59f89942cd1efaadffe9e8762e3c99d',1,'mlx::core::conv_general(const array &amp;input, const array &amp;weight, std::vector&lt; int &gt; stride={}, std::vector&lt; int &gt; padding={}, std::vector&lt; int &gt; kernel_dilation={}, std::vector&lt; int &gt; input_dilation={}, int groups=1, bool flip=false, StreamOrDevice s={})']]],
+  ['conv_5ftranspose1d_61',['conv_transpose1d',['../group__ops.html#gaa30bf1adcd78d1c2595d07b215731714',1,'mlx::core']]],
+  ['conv_5ftranspose2d_62',['conv_transpose2d',['../group__ops.html#gaebb59971cb9bc45005dc1d398e4f0a3d',1,'mlx::core']]],
+  ['conv_5ftranspose3d_63',['conv_transpose3d',['../group__ops.html#ga8db814da631d9cd32a8d6563bf4ac530',1,'mlx::core']]],
+  ['convolution_64',['Convolution',['../classmlx_1_1core_1_1_convolution.html#a6f1de77b719bb13217b0d8c64cabb8ef',1,'mlx::core::Convolution']]],
+  ['copy_65',['Copy',['../classmlx_1_1core_1_1_copy.html#a6243e044af119105ffaaed7d405cd584',1,'mlx::core::Copy']]],
+  ['copy_66',['copy',['../namespacemlx_1_1core.html#a479648542a2bea151b947b18f0e79dd2',1,'mlx::core::copy()'],['../namespacemlx_1_1core_1_1metal.html#aa215e631e2680f04a591b88d91571719',1,'mlx::core::metal::copy()'],['../group__ops.html#gae306e93af12f774bd80bad6c231b09d6',1,'mlx::core::copy()']]],
+  ['copy_5fg_67',['copy_g',['../metal_2kernels_2copy_8h.html#a71e4103db4689d90ef6f9d5ba93604cf',1,'copy.h']]],
+  ['copy_5fg_5fnd1_68',['copy_g_nd1',['../metal_2kernels_2copy_8h.html#aba4530a7db6a61ca36f50e4f5e58fb77',1,'copy.h']]],
+  ['copy_5fg_5fnd2_69',['copy_g_nd2',['../metal_2kernels_2copy_8h.html#a39ec5b7b8351e4332b842982a2ee6260',1,'copy.h']]],
+  ['copy_5fg_5fnd3_70',['copy_g_nd3',['../metal_2kernels_2copy_8h.html#aab82689380897ff4716b5eafd6ef3ecc',1,'copy.h']]],
+  ['copy_5fgg_71',['copy_gg',['../metal_2kernels_2copy_8h.html#ade9a9eea9b8262a854a11721fe2bb9fa',1,'copy.h']]],
+  ['copy_5fgg_5fnd1_72',['copy_gg_nd1',['../metal_2kernels_2copy_8h.html#a3278d9c999718bee3ccbe2922f501bf1',1,'copy.h']]],
+  ['copy_5fgg_5fnd2_73',['copy_gg_nd2',['../metal_2kernels_2copy_8h.html#af0b06ac3a96852a64fa4274a94b58301',1,'copy.h']]],
+  ['copy_5fgg_5fnd3_74',['copy_gg_nd3',['../metal_2kernels_2copy_8h.html#a3f3836ad0b6545ec9b9e1864224f7a13',1,'copy.h']]],
+  ['copy_5fgpu_75',['copy_gpu',['../namespacemlx_1_1core.html#addaa46a13ac2deb1d9ce621338320e0e',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a6a6f4e46c8fc44fdc74c50ace02bcf38',1,'mlx::core::copy_gpu(const array &amp;src, array &amp;out, CopyType ctype)']]],
+  ['copy_5fgpu_5finplace_76',['copy_gpu_inplace',['../namespacemlx_1_1core.html#a69e30f5d30a6d72ac0ffe4886f24b7ba',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a8e1ccb0ed9387b0a789311d9f8964803',1,'mlx::core::copy_gpu_inplace(const array &amp;src, array &amp;out, CopyType ctype, const Stream &amp;s)'],['../namespacemlx_1_1core.html#ae55b801b09ccf55cba96278163a9b1ef',1,'mlx::core::copy_gpu_inplace(const array &amp;in, array &amp;out, const std::vector&lt; int64_t &gt; &amp;istride, int64_t ioffset, CopyType ctype, const Stream &amp;s)']]],
+  ['copy_5fhartley_77',['copy_hartley',['../namespacepocketfft_1_1detail.html#abac3fcc8ce83800d228774f64c28d4c3',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#ae7b44d2773d9d06a9787aff01d66b3ed',1,'pocketfft::detail::copy_hartley(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
+  ['copy_5finplace_78',['copy_inplace',['../namespacemlx_1_1core.html#a98495894a796b2cc6d022e7a03432c64',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, CopyType ctype)'],['../namespacemlx_1_1core.html#aad636e2d0b2f882cadd1b438f4daa9ed',1,'mlx::core::copy_inplace(const array &amp;src, array &amp;dst, const std::vector&lt; int &gt; &amp;data_shape, const std::vector&lt; stride_t &gt; &amp;i_strides, const std::vector&lt; stride_t &gt; &amp;o_strides, int64_t i_offset, int64_t o_offset, CopyType ctype)']]],
+  ['copy_5finput_79',['copy_input',['../namespacepocketfft_1_1detail.html#aff05be3064743c1143b19318ab12ad4a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; cmplx&lt; T &gt; &gt; &amp;src, cmplx&lt; vtype_t&lt; T &gt; &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a30fc708f9d8f9cfa74194925c7863c0a',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, vtype_t&lt; T &gt; *dst)'],['../namespacepocketfft_1_1detail.html#a3387bd35f237870e42b8461769e6aec4',1,'pocketfft::detail::copy_input(const multi_iter&lt; vlen &gt; &amp;it, const cndarr&lt; T &gt; &amp;src, T *dst)']]],
+  ['copy_5foutput_80',['copy_output',['../namespacepocketfft_1_1detail.html#a1523a037300a8da05db210b802d9cb0e',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const cmplx&lt; vtype_t&lt; T &gt; &gt; *src, ndarr&lt; cmplx&lt; T &gt; &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a21980853aca4d92ed06e3dcffe7ef660',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const vtype_t&lt; T &gt; *src, ndarr&lt; T &gt; &amp;dst)'],['../namespacepocketfft_1_1detail.html#a310481c334e46674710ba794ad7403c0',1,'pocketfft::detail::copy_output(const multi_iter&lt; vlen &gt; &amp;it, const T *src, ndarr&lt; T &gt; &amp;dst)']]],
+  ['copy_5fs_81',['copy_s',['../metal_2kernels_2copy_8h.html#aef09f9b9475345b1bba121d037d222ea',1,'copy.h']]],
+  ['copy_5fs2_82',['copy_s2',['../metal_2kernels_2copy_8h.html#a8023e9335cc5334847a8d315042be3a3',1,'copy.h']]],
+  ['copy_5fshared_5fbuffer_83',['copy_shared_buffer',['../classmlx_1_1core_1_1array.html#a28df7a333d90a311c49bc4bce7a1ad6d',1,'mlx::core::array::copy_shared_buffer(const array &amp;other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a92974c656c35a972ad241f80584bbd29',1,'mlx::core::array::copy_shared_buffer(const array &amp;other)']]],
+  ['copy_5fv_84',['copy_v',['../metal_2kernels_2copy_8h.html#ae26a13e0c8e6c15f7b10078e65970659',1,'copy.h']]],
+  ['copy_5fv2_85',['copy_v2',['../metal_2kernels_2copy_8h.html#aee14a5326f53d9b30b0b38e27d180ef3',1,'copy.h']]],
+  ['cos_86',['Cos',['../classmlx_1_1core_1_1_cos.html#a2acb9fcf0901462189c476756fd99995',1,'mlx::core::Cos']]],
+  ['cos_87',['cos',['../namespacepocketfft_1_1detail.html#a499c1e8b7d79a5272af024f46c63ff9d',1,'pocketfft::detail::cos()'],['../namespacemetal.html#a2fa4778a6fe2fa43253ea724e5a608a3',1,'metal::cos()'],['../namespacemetal_1_1fast.html#a75b6bb32fa3870eda46a7bfc9f481f88',1,'metal::fast::cos()'],['../namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220',1,'metal::precise::cos()'],['../group__ops.html#ga39dfdf72b556012aa35ff27a94116e74',1,'mlx::core::cos()']]],
+  ['cosh_88',['Cosh',['../classmlx_1_1core_1_1_cosh.html#a44e8ac2e09a55ec32e9dc6641eedc8f1',1,'mlx::core::Cosh']]],
+  ['cosh_89',['cosh',['../namespacemetal.html#a8a68a88cc110830d057dbd71431b93c0',1,'metal::cosh()'],['../namespacemetal_1_1fast.html#a31544ad9de28012a4ddda86e3966a77e',1,'metal::fast::cosh()'],['../namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc',1,'metal::precise::cosh()'],['../group__ops.html#ga2181b71cda88007a3092be4795ff0715',1,'mlx::core::cosh()']]],
+  ['cospi_90',['cospi',['../namespacemetal.html#a5c2f37939ad705ddea4409d3bedb8ce1',1,'metal::cospi()'],['../namespacemetal_1_1fast.html#a9906b41f75319b384ffb570cc94d67ce',1,'metal::fast::cospi()'],['../namespacemetal_1_1precise.html#a2392b78bd196efdbbac65901c4ab20e7',1,'metal::precise::cospi()']]],
+  ['cost_5fguess_91',['cost_guess',['../structpocketfft_1_1detail_1_1util.html#ad3d874bc3fb0048df2270779a15d4bd0',1,'pocketfft::detail::util']]],
+  ['count_5fdown_92',['count_down',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#a81d6597189b40410e35f3cd653fd1342',1,'pocketfft::detail::threading::latch']]],
+  ['cross_93',['cross',['../namespacemlx_1_1core_1_1linalg.html#abcda3fbda45183c21e7f27aa0dde64e6',1,'mlx::core::linalg']]],
+  ['cummax_94',['cummax',['../group__ops.html#gaee37cac8476e8f8d666bcded5bc59143',1,'mlx::core']]],
+  ['cummin_95',['cummin',['../group__ops.html#ga19c1bf6929fe8d66b9cd408946aea6a8',1,'mlx::core']]],
+  ['cumprod_96',['cumprod',['../group__ops.html#ga0d71dfbc14ef3ed564b0c5ee26af680f',1,'mlx::core']]],
+  ['cumsum_97',['cumsum',['../group__ops.html#gaddc825a5c173e195ab0fda83ad630420',1,'mlx::core']]],
+  ['custom_98',['Custom',['../classmlx_1_1core_1_1fast_1_1_custom.html#a4186fea23f7156c38960426821fca313',1,'mlx::core::fast::Custom']]],
+  ['custom_5ffunction_99',['custom_function',['../namespacemlx_1_1core.html#a8d3ca5fbaecdb995660c24cde5aeebaf',1,'mlx::core']]],
+  ['custom_5fvjp_100',['custom_vjp',['../namespacemlx_1_1core.html#a9290596250fa308df4c69b44483bb8aa',1,'mlx::core']]],
+  ['customkernel_101',['CustomKernel',['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a954893e07f0d36715b4e1e414b6f2153',1,'mlx::core::fast::CustomKernel']]],
+  ['customtransforms_102',['CustomTransforms',['../classmlx_1_1core_1_1_custom_transforms.html#ab52abadb9c6f6db83d087c7b751be488',1,'mlx::core::CustomTransforms']]]
 ];
diff --git a/docs/build/html/search/functions_4.js b/docs/build/html/search/functions_4.js
index 8254b96c0..974863786 100644
--- a/docs/build/html/search/functions_4.js
+++ b/docs/build/html/search/functions_4.js
@@ -28,8 +28,8 @@ var searchData=
   ['diag_25',['diag',['../group__ops.html#ga11af511875640e1fa88e0ca87e199344',1,'mlx::core']]],
   ['diagonal_26',['diagonal',['../group__ops.html#ga9236b085a88ead3128ed8079d009cac6',1,'mlx::core']]],
   ['disable_5fcompile_27',['disable_compile',['../namespacemlx_1_1core.html#a5f5fea955057bb3842b271b037909e66',1,'mlx::core']]],
-  ['dispatchthreadgroups_28',['dispatchThreadgroups',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e',1,'mlx::core::metal::CommandEncoder']]],
-  ['dispatchthreads_29',['dispatchThreads',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810',1,'mlx::core::metal::CommandEncoder']]],
+  ['dispatch_5fthreadgroups_28',['dispatch_threadgroups',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d',1,'mlx::core::metal::CommandEncoder']]],
+  ['dispatch_5fthreads_29',['dispatch_threads',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05',1,'mlx::core::metal::CommandEncoder']]],
   ['distprimitive_30',['DistPrimitive',['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html#a8c54166951522c2a52ef39fce8c87f8f',1,'mlx::core::distributed::DistPrimitive']]],
   ['divide_31',['Divide',['../classmlx_1_1core_1_1_divide.html#a62fc71e8998be65ff18285dbbd21eedb',1,'mlx::core::Divide']]],
   ['divide_32',['divide',['../namespacemetal.html#a2aea493fc1a874970b77ed0031e965df',1,'metal::divide()'],['../namespacemetal_1_1fast.html#ae70bc2185e4649369cf7b15f5e1d48be',1,'metal::fast::divide()'],['../namespacemetal_1_1precise.html#aec0982cdb96a08b61f51129150d82e9d',1,'metal::precise::divide()'],['../group__ops.html#ga77472dd06cfa7a30a42e4fd927bd859f',1,'mlx::core::divide()']]],
diff --git a/docs/build/html/search/functions_5.js b/docs/build/html/search/functions_5.js
index 88db21482..2552a4e74 100644
--- a/docs/build/html/search/functions_5.js
+++ b/docs/build/html/search/functions_5.js
@@ -5,14 +5,14 @@ var searchData=
   ['eigvalsh_2',['eigvalsh',['../namespacemlx_1_1core_1_1linalg.html#a00c8e24432b0773dac64b8602bd142ba',1,'mlx::core::linalg']]],
   ['einsum_3',['einsum',['../namespacemlx_1_1core.html#a2a9b98c65578dd3720b3b375c1471e58',1,'mlx::core']]],
   ['einsum_5fpath_4',['einsum_path',['../namespacemlx_1_1core.html#ab14ec41f17675691c1fdebb8990b6695',1,'mlx::core']]],
-  ['elem_5fto_5floc_5',['elem_to_loc',['../namespacemlx_1_1core.html#a77657cb50fd9392f7f4c64e43843c2b3',1,'mlx::core::elem_to_loc(int elem, const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides)'],['../namespacemlx_1_1core.html#ad7e4f40eb351b554bbfabb6d7d600d06',1,'mlx::core::elem_to_loc(int elem, const array &amp;a)'],['../backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1',1,'elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#aa6b041005351293e68e19b5abf1286cd',1,'elem_to_loc(stride_t elem, constant const int *shape, constant const stride_t *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a37e00d94751710e81c9632bca2f91e51',1,'elem_to_loc(uint3 elem, constant const int *shape, constant const stride_t *strides, int ndim):&#160;utils.h']]],
-  ['elem_5fto_5floc_5f1_6',['elem_to_loc_1',['../backend_2metal_2kernels_2utils_8h.html#a196a07022b812b241d4c06192c0fa83d',1,'utils.h']]],
-  ['elem_5fto_5floc_5f2_7',['elem_to_loc_2',['../backend_2metal_2kernels_2utils_8h.html#ad6c45cacca97899cd362df49c06fea79',1,'utils.h']]],
-  ['elem_5fto_5floc_5f2_5fnd_8',['elem_to_loc_2_nd',['../backend_2metal_2kernels_2utils_8h.html#a01c9309978a6c12f79b6e4108728a953',1,'utils.h']]],
-  ['elem_5fto_5floc_5f3_9',['elem_to_loc_3',['../backend_2metal_2kernels_2utils_8h.html#a2c34ed54714c69e6e1b44344f9e6e330',1,'utils.h']]],
-  ['elem_5fto_5floc_5f3_5fnd_10',['elem_to_loc_3_nd',['../backend_2metal_2kernels_2utils_8h.html#a66940b1cc3d64651d24634bc696d528b',1,'utils.h']]],
+  ['elem_5fto_5floc_5',['elem_to_loc',['../namespacemlx_1_1core.html#a77657cb50fd9392f7f4c64e43843c2b3',1,'mlx::core::elem_to_loc(int elem, const std::vector&lt; int &gt; &amp;shape, const std::vector&lt; StrideT &gt; &amp;strides)'],['../namespacemlx_1_1core.html#ad7e4f40eb351b554bbfabb6d7d600d06',1,'mlx::core::elem_to_loc(int elem, const array &amp;a)'],['../backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5',1,'elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#a4b53fb0679f67f9063deba94753d4185',1,'elem_to_loc(StrideT elem, constant const int *shape, constant const StrideT *strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2utils_8h.html#aec82f4bf0e22b8d1b89ad654ad8d8753',1,'elem_to_loc(uint3 elem, constant const int *shape, constant const StrideT *strides, int ndim):&#160;utils.h']]],
+  ['elem_5fto_5floc_5f1_6',['elem_to_loc_1',['../backend_2metal_2kernels_2utils_8h.html#ac612d0ae30b8257198339debe04916a3',1,'utils.h']]],
+  ['elem_5fto_5floc_5f2_7',['elem_to_loc_2',['../backend_2metal_2kernels_2utils_8h.html#a43f33efc000962d6de881a3aab7458de',1,'utils.h']]],
+  ['elem_5fto_5floc_5f2_5fnd_8',['elem_to_loc_2_nd',['../backend_2metal_2kernels_2utils_8h.html#a66a2d7eec0262b12db16cd6c781ccf9a',1,'utils.h']]],
+  ['elem_5fto_5floc_5f3_9',['elem_to_loc_3',['../backend_2metal_2kernels_2utils_8h.html#a650f8ea8cf9f9519da9e301aad0308dc',1,'utils.h']]],
+  ['elem_5fto_5floc_5f3_5fnd_10',['elem_to_loc_3_nd',['../backend_2metal_2kernels_2utils_8h.html#a65d87b425e1f8ca19df97c15049f8733',1,'utils.h']]],
   ['elem_5fto_5floc_5fbroadcast_11',['elem_to_loc_broadcast',['../backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f',1,'elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim):&#160;utils.h'],['../backend_2metal_2kernels_2steel_2utils_8h.html#a42bd57d203a40d3d7d429f2333590a3c',1,'elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, constant const size_t *c_strides, int ndim):&#160;utils.h']]],
-  ['elems_12',['elems',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc',1,'mlx::steel::MMATile::elems()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1',1,'mlx::steel::MMATile::elems() const']]],
+  ['elems_12',['elems',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc',1,'mlx::steel::MMATile::elems()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1',1,'mlx::steel::MMATile::elems() const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc',1,'mlx::steel::MMATile::elems()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1',1,'mlx::steel::MMATile::elems() const']]],
   ['empty_13',['empty',['../classpocketfft_1_1detail_1_1threading_1_1concurrent__queue.html#a1269e5da40c3f5145c895cee3641879a',1,'pocketfft::detail::threading::concurrent_queue']]],
   ['enable_5fcompile_14',['enable_compile',['../namespacemlx_1_1core.html#a1983a2466bff3bae4d23cf34bd0946c9',1,'mlx::core']]],
   ['end_15',['end',['../classmlx_1_1core_1_1array.html#a5daf64552fb450825c9b382f3a5fa2d4',1,'mlx::core::array']]],
@@ -27,8 +27,8 @@ var searchData=
   ['erfinv_24',['ErfInv',['../classmlx_1_1core_1_1_erf_inv.html#a5d0279247b67da4592311559f04e1478',1,'mlx::core::ErfInv']]],
   ['erfinv_25',['erfinv',['../erf_8h.html#a1846e0d683c7aff826bb32addcc3b885',1,'erfinv():&#160;erf.h'],['../group__ops.html#ga76fb9062c64264e34d2e07013390557c',1,'mlx::core::erfinv()']]],
   ['eval_26',['eval',['../classmlx_1_1core_1_1array.html#a2820c45188071a22175e9fa42e10a49a',1,'mlx::core::array::eval()'],['../namespacemlx_1_1core.html#a7d6e097d8effed52f4713672e471f299',1,'mlx::core::eval(std::vector&lt; array &gt; outputs)'],['../namespacemlx_1_1core.html#adb14f689c9f75f7901edb196c2bfb971',1,'mlx::core::eval(Arrays &amp;&amp;... outputs)']]],
-  ['eval_5fcpu_27',['eval_cpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e',1,'mlx::core::distributed::AllReduce::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5',1,'mlx::core::distributed::AllGather::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051',1,'mlx::core::distributed::Send::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a',1,'mlx::core::distributed::Recv::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f',1,'mlx::core::fast::RMSNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439',1,'mlx::core::fast::RMSNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05',1,'mlx::core::fast::LayerNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b',1,'mlx::core::fast::LayerNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e',1,'mlx::core::fast::RoPE::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328',1,'mlx::core::fast::ScaledDotProductAttention::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd',1,'mlx::core::fast::AffineQuantize::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad',1,'mlx::core::fast::CustomKernel::eval_cpu()'],['../classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575',1,'mlx::core::Primitive::eval_cpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#aa0ed6e32c36200a3ff9bc592c9b300db',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60',1,'mlx::core::Abs::eval_cpu()'],['../classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f',1,'mlx::core::Add::eval_cpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c',1,'mlx::core::AddMM::eval_cpu()'],['../classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1',1,'mlx::core::Arange::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006',1,'mlx::core::ArcCos::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9',1,'mlx::core::ArcCosh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4',1,'mlx::core::ArcSin::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066',1,'mlx::core::ArcSinh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3',1,'mlx::core::ArcTan::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c',1,'mlx::core::ArcTan2::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd',1,'mlx::core::ArcTanh::eval_cpu()'],['../classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828',1,'mlx::core::ArgPartition::eval_cpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287',1,'mlx::core::ArgReduce::eval_cpu()'],['../classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa',1,'mlx::core::ArgSort::eval_cpu()'],['../classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d',1,'mlx::core::AsType::eval_cpu()'],['../classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193',1,'mlx::core::AsStrided::eval_cpu()'],['
../classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283',1,'mlx::core::BitwiseBinary::eval_cpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2',1,'mlx::core::BlockMaskedMM::eval_cpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730',1,'mlx::core::GatherMM::eval_cpu()'],['../classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780',1,'mlx::core::Broadcast::eval_cpu()'],['../classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035',1,'mlx::core::Ceil::eval_cpu()'],['../classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151',1,'mlx::core::Compiled::eval_cpu()'],['../classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258',1,'mlx::core::Concatenate::eval_cpu()'],['../classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61',1,'mlx::core::Conjugate::eval_cpu()'],['../classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b',1,'mlx::core::Convolution::eval_cpu()'],['../classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c',1,'mlx::core::Copy::eval_cpu()'],['../classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152',1,'mlx::core::Cos::eval_cpu()'],['../classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d',1,'mlx::core::Cosh::eval_cpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184',1,'mlx::core::CustomTransforms::eval_cpu()'],['../classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e',1,'mlx::core::Depends::eval_cpu()'],['../classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49',1,'mlx::core::Divide::eval_cpu()'],['../classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3',1,'mlx::core::DivMod::eval_cpu()'],['../classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2',1,'mlx::core::Select::eval_cpu()'],['../classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc',1,'mlx::co
re::Remainder::eval_cpu()'],['../classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454',1,'mlx::core::Equal::eval_cpu()'],['../classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6',1,'mlx::core::Erf::eval_cpu()'],['../classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e',1,'mlx::core::ErfInv::eval_cpu()'],['../classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c',1,'mlx::core::Exp::eval_cpu()'],['../classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a',1,'mlx::core::Expm1::eval_cpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635',1,'mlx::core::FFT::eval_cpu()'],['../classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7',1,'mlx::core::Floor::eval_cpu()'],['../classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c',1,'mlx::core::Full::eval_cpu()'],['../classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290',1,'mlx::core::Gather::eval_cpu()'],['../classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae',1,'mlx::core::Greater::eval_cpu()'],['../classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075',1,'mlx::core::GreaterEqual::eval_cpu()'],['../classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d',1,'mlx::core::Hadamard::eval_cpu()'],['../classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829',1,'mlx::core::Imag::eval_cpu()'],['../classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef',1,'mlx::core::Less::eval_cpu()'],['../classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16',1,'mlx::core::LessEqual::eval_cpu()'],['../classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a',1,'mlx::core::Load::eval_cpu()'],['../classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f',1,'mlx::core::Log::eval_cpu()'],['../classmlx_1_1core_1_1_log1p.html#a8192e5438de99c4cda056987935cba23',1,'mlx::core::Log1p::eval_cpu()'],['../classmlx_1_1core_1_1_logical_not.html#acf3
f7b3b20ca69533536e0e0a05725b3',1,'mlx::core::LogicalNot::eval_cpu()'],['../classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3',1,'mlx::core::LogicalAnd::eval_cpu()'],['../classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62',1,'mlx::core::LogicalOr::eval_cpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0',1,'mlx::core::LogAddExp::eval_cpu()'],['../classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc',1,'mlx::core::Matmul::eval_cpu()'],['../classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf',1,'mlx::core::Maximum::eval_cpu()'],['../classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e',1,'mlx::core::Minimum::eval_cpu()'],['../classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34',1,'mlx::core::Multiply::eval_cpu()'],['../classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b',1,'mlx::core::Negative::eval_cpu()'],['../classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047',1,'mlx::core::NotEqual::eval_cpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f',1,'mlx::core::NumberOfElements::eval_cpu()'],['../classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb',1,'mlx::core::Pad::eval_cpu()'],['../classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8',1,'mlx::core::Partition::eval_cpu()'],['../classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206',1,'mlx::core::Power::eval_cpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3',1,'mlx::core::QuantizedMatmul::eval_cpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c',1,'mlx::core::GatherQMM::eval_cpu()'],['../classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2',1,'mlx::core::RandomBits::eval_cpu()'],['../classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934',1,'mlx::core::Real::eval_cpu()'],
['../classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f',1,'mlx::core::Reshape::eval_cpu()'],['../classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa',1,'mlx::core::Reduce::eval_cpu()'],['../classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007',1,'mlx::core::Round::eval_cpu()'],['../classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b',1,'mlx::core::Scan::eval_cpu()'],['../classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97',1,'mlx::core::Scatter::eval_cpu()'],['../classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255',1,'mlx::core::Sigmoid::eval_cpu()'],['../classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97',1,'mlx::core::Sign::eval_cpu()'],['../classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5',1,'mlx::core::Sin::eval_cpu()'],['../classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd',1,'mlx::core::Sinh::eval_cpu()'],['../classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2',1,'mlx::core::Slice::eval_cpu()'],['../classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b',1,'mlx::core::SliceUpdate::eval_cpu()'],['../classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79',1,'mlx::core::Softmax::eval_cpu()'],['../classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd',1,'mlx::core::Sort::eval_cpu()'],['../classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4',1,'mlx::core::Split::eval_cpu()'],['../classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59',1,'mlx::core::Square::eval_cpu()'],['../classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5',1,'mlx::core::Sqrt::eval_cpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2',1,'mlx::core::StopGradient::eval_cpu()'],['../classmlx_1_1core_1_1_subtract.html#a47574258b6c95f8ad260c114d6d36a12',1,'mlx::core::Subtract::eval_cpu()'],['../classmlx_1_1core_1_1_tan.html#a9c9a731158fa60ee
f30067fe0da9f3e9',1,'mlx::core::Tan::eval_cpu()'],['../classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5',1,'mlx::core::Tanh::eval_cpu()'],['../classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f',1,'mlx::core::Uniform::eval_cpu()'],['../classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497',1,'mlx::core::View::eval_cpu()'],['../classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8',1,'mlx::core::Transpose::eval_cpu()'],['../classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2',1,'mlx::core::QRF::eval_cpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6',1,'mlx::core::SVD::eval_cpu()'],['../classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81',1,'mlx::core::Inverse::eval_cpu()'],['../classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5',1,'mlx::core::Cholesky::eval_cpu()'],['../classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be',1,'mlx::core::Eigh::eval_cpu()']]],
-  ['eval_5fgpu_28',['eval_gpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20',1,'mlx::core::distributed::AllReduce::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a',1,'mlx::core::distributed::AllGather::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d',1,'mlx::core::distributed::Send::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e',1,'mlx::core::distributed::Recv::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca',1,'mlx::core::fast::RMSNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560',1,'mlx::core::fast::RMSNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528',1,'mlx::core::fast::LayerNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3',1,'mlx::core::fast::LayerNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2',1,'mlx::core::fast::RoPE::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ad51666e69f670e286293aff96eb435a9',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;out)'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628',1,'mlx::core::fast::AffineQuantize::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db',1,'mlx::core::fast::CustomKernel::eval_gpu()'],['../classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2',1,'mlx::core::Primitive::eval_gpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#a971fe9ad47f6569118879ce1d0f41447',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514',1,'mlx::core::Abs::eval_gpu()'],['../classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d',1,'mlx::core::Add::eval_gpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9',1,'mlx::core::AddMM::eval_gpu()'],['../classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031',1,'mlx::core::Arange::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c',1,'mlx::core::ArcCos::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc',1,'mlx::core::ArcCosh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3',1,'mlx::core::ArcSin::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac',1,'mlx::core::ArcSinh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254',1,'mlx::core::ArcTan::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50',1,'mlx::core::ArcTan2::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d',1,'mlx::core::ArcTanh::eval_gpu()'],
['../classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc',1,'mlx::core::ArgPartition::eval_gpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29',1,'mlx::core::ArgReduce::eval_gpu()'],['../classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709',1,'mlx::core::ArgSort::eval_gpu()'],['../classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b',1,'mlx::core::AsType::eval_gpu()'],['../classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed',1,'mlx::core::AsStrided::eval_gpu()'],['../classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd',1,'mlx::core::BitwiseBinary::eval_gpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9',1,'mlx::core::BlockMaskedMM::eval_gpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1',1,'mlx::core::GatherMM::eval_gpu()'],['../classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe',1,'mlx::core::Broadcast::eval_gpu()'],['../classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887',1,'mlx::core::Ceil::eval_gpu()'],['../classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5',1,'mlx::core::Compiled::eval_gpu()'],['../classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474',1,'mlx::core::Concatenate::eval_gpu()'],['../classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de',1,'mlx::core::Conjugate::eval_gpu()'],['../classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2',1,'mlx::core::Convolution::eval_gpu()'],['../classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1',1,'mlx::core::Copy::eval_gpu()'],['../classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060',1,'mlx::core::Cos::eval_gpu()'],['../classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f32559',1,'mlx::core::Cosh::eval_gpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877
f6667',1,'mlx::core::CustomTransforms::eval_gpu()'],['../classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28',1,'mlx::core::Depends::eval_gpu()'],['../classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7',1,'mlx::core::Divide::eval_gpu()'],['../classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc',1,'mlx::core::DivMod::eval_gpu()'],['../classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b',1,'mlx::core::Select::eval_gpu()'],['../classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161',1,'mlx::core::Remainder::eval_gpu()'],['../classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c',1,'mlx::core::Equal::eval_gpu()'],['../classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008',1,'mlx::core::Erf::eval_gpu()'],['../classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db',1,'mlx::core::ErfInv::eval_gpu()'],['../classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822',1,'mlx::core::Exp::eval_gpu()'],['../classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f',1,'mlx::core::Expm1::eval_gpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd',1,'mlx::core::FFT::eval_gpu()'],['../classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65',1,'mlx::core::Floor::eval_gpu()'],['../classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872',1,'mlx::core::Full::eval_gpu()'],['../classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8',1,'mlx::core::Gather::eval_gpu()'],['../classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878',1,'mlx::core::Greater::eval_gpu()'],['../classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24',1,'mlx::core::GreaterEqual::eval_gpu()'],['../classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733',1,'mlx::core::Hadamard::eval_gpu()'],['../classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6',1,'mlx::core::Imag::eval_gpu()'],['../
classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917',1,'mlx::core::Less::eval_gpu()'],['../classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac',1,'mlx::core::LessEqual::eval_gpu()'],['../classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d',1,'mlx::core::Load::eval_gpu()'],['../classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390',1,'mlx::core::Log::eval_gpu()'],['../classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431',1,'mlx::core::Log1p::eval_gpu()'],['../classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a',1,'mlx::core::LogicalNot::eval_gpu()'],['../classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f',1,'mlx::core::LogicalAnd::eval_gpu()'],['../classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a',1,'mlx::core::LogicalOr::eval_gpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a',1,'mlx::core::LogAddExp::eval_gpu()'],['../classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7',1,'mlx::core::Matmul::eval_gpu()'],['../classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7',1,'mlx::core::Maximum::eval_gpu()'],['../classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba',1,'mlx::core::Minimum::eval_gpu()'],['../classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0',1,'mlx::core::Multiply::eval_gpu()'],['../classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b',1,'mlx::core::Negative::eval_gpu()'],['../classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2',1,'mlx::core::NotEqual::eval_gpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5',1,'mlx::core::NumberOfElements::eval_gpu()'],['../classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153',1,'mlx::core::Pad::eval_gpu()'],['../classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef',1,'mlx::core::Partition::eval_gpu()'
],['../classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11',1,'mlx::core::Power::eval_gpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3',1,'mlx::core::QuantizedMatmul::eval_gpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887',1,'mlx::core::GatherQMM::eval_gpu()'],['../classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a',1,'mlx::core::RandomBits::eval_gpu()'],['../classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2',1,'mlx::core::Real::eval_gpu()'],['../classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059',1,'mlx::core::Reshape::eval_gpu()'],['../classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f',1,'mlx::core::Reduce::eval_gpu()'],['../classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec',1,'mlx::core::Round::eval_gpu()'],['../classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde',1,'mlx::core::Scan::eval_gpu()'],['../classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678',1,'mlx::core::Scatter::eval_gpu()'],['../classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca',1,'mlx::core::Sigmoid::eval_gpu()'],['../classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b',1,'mlx::core::Sign::eval_gpu()'],['../classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e',1,'mlx::core::Sin::eval_gpu()'],['../classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75',1,'mlx::core::Sinh::eval_gpu()'],['../classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a',1,'mlx::core::Slice::eval_gpu()'],['../classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b',1,'mlx::core::SliceUpdate::eval_gpu()'],['../classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af',1,'mlx::core::Softmax::eval_gpu()'],['../classmlx_1_1core_1_1_sort.html#a4141c48f0e8670c728663f3722675382',1,'mlx::core::Sort::eval_gpu()'],['../classmlx_1_1core_1_1_spli
t.html#a78ddda89c4daee73c74cfbc1e44656df',1,'mlx::core::Split::eval_gpu()'],['../classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045',1,'mlx::core::Square::eval_gpu()'],['../classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501',1,'mlx::core::Sqrt::eval_gpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89',1,'mlx::core::StopGradient::eval_gpu()'],['../classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c',1,'mlx::core::Subtract::eval_gpu()'],['../classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f',1,'mlx::core::Tan::eval_gpu()'],['../classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761',1,'mlx::core::Tanh::eval_gpu()'],['../classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0',1,'mlx::core::Uniform::eval_gpu()'],['../classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075',1,'mlx::core::View::eval_gpu()'],['../classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e',1,'mlx::core::Transpose::eval_gpu()'],['../classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9',1,'mlx::core::QRF::eval_gpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83',1,'mlx::core::SVD::eval_gpu()'],['../classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2',1,'mlx::core::Inverse::eval_gpu()'],['../classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795',1,'mlx::core::Cholesky::eval_gpu()'],['../classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2',1,'mlx::core::Eigh::eval_gpu()']]],
+  ['eval_5fcpu_27',['eval_cpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#acdc1965ad64ee9ee6328fe150a97902e',1,'mlx::core::distributed::AllReduce::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#ab721fe0072fffbddbc3c4334dd033ba5',1,'mlx::core::distributed::AllGather::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#af2620837bfc1b97217d006ed6e374051',1,'mlx::core::distributed::Send::eval_cpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a3be84b08122a939edd6062d26261358a',1,'mlx::core::distributed::Recv::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#a7da6e0cfd630958d9633b2e2bd97a54f',1,'mlx::core::fast::RMSNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#adfc1d52bc266466ab29ee45fd8fab439',1,'mlx::core::fast::RMSNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a5d7a4c1c9ee84e327d1c371733108c05',1,'mlx::core::fast::LayerNorm::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a0d8c4c6e7462befc38f7e08244fa1c2b',1,'mlx::core::fast::LayerNormVJP::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a05a7d595c6b9dadf7ddfd6e3fd402f0e',1,'mlx::core::fast::RoPE::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ae20851e002f7fcb6d4f97817596f6328',1,'mlx::core::fast::ScaledDotProductAttention::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a3b5d628628d245b38911118d4a0ff9fd',1,'mlx::core::fast::AffineQuantize::eval_cpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a4ad1b7a9919753c759093f3e21a15bad',1,'mlx::core::fast::CustomKernel::eval_cpu()'],['../classmlx_1_1core_1_1_primitive.html#a1596dc50b910538eae14878e98f07575',1,'mlx::core::Primitive::eval_cpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a7e8f6f5d6ae0a33f6abc0f5a46e0b132',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#aa0ed6e32c36200a3ff9bc592c9b300db',1,'mlx::core::UnaryPrimitive::eval_cpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0d3e697496ef8e842d21195cb3c14e60',1,'mlx::core::Abs::eval_cpu()'],['../classmlx_1_1core_1_1_add.html#a5bacfc51dfa2a5a931bad2dd7bdc7a5f',1,'mlx::core::Add::eval_cpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a15694e3bf2ed5c193237b2b9ca00867c',1,'mlx::core::AddMM::eval_cpu()'],['../classmlx_1_1core_1_1_arange.html#aba44432491cbd599bf72712f5f4267a1',1,'mlx::core::Arange::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a58dcba9e706cb12bab062bb7fa5fa006',1,'mlx::core::ArcCos::eval_cpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#a0f6d989bcbbc38f15ef17a136879a9c9',1,'mlx::core::ArcCosh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sin.html#ab3542492c14021329788de8f2a9be1e4',1,'mlx::core::ArcSin::eval_cpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a52574b24d8d16839c58673f51f8ac066',1,'mlx::core::ArcSinh::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a1211bc31241227528f04435239ddb9a3',1,'mlx::core::ArcTan::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a13094e6b702769928ca0da468f5ce45c',1,'mlx::core::ArcTan2::eval_cpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a5af9224e1f1ffec412b0baa0af7e1ecd',1,'mlx::core::ArcTanh::eval_cpu()'],['../classmlx_1_1core_1_1_arg_partition.html#a896f75c5325798ac3f9093f6a4581828',1,'mlx::core::ArgPartition::eval_cpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#ad8d48725623ede1ff654fa13eccf2287',1,'mlx::core::ArgReduce::eval_cpu()'],['../classmlx_1_1core_1_1_arg_sort.html#a022079683774bfeb531b3a002cff16fa',1,'mlx::core::ArgSort::eval_cpu()'],['../classmlx_1_1core_1_1_as_type.html#aa89dbf4d73b00c6a44cffd04d5bb228d',1,'mlx::core::AsType::eval_cpu()'],['../classmlx_1_1core_1_1_as_strided.html#acdd4705e4503ff0b124215c4676b4193',1,'mlx::core::AsStrided::eval_cpu()'],['
../classmlx_1_1core_1_1_bitwise_binary.html#a2194bf585213bda1b2966aa02d2fe283',1,'mlx::core::BitwiseBinary::eval_cpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aa85da478cdc6d4a97be06e5d4abee1f2',1,'mlx::core::BlockMaskedMM::eval_cpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#a62352074a480df0e1f879b0bae425730',1,'mlx::core::GatherMM::eval_cpu()'],['../classmlx_1_1core_1_1_broadcast.html#a53d48d9778e2d4c24a124cd767900780',1,'mlx::core::Broadcast::eval_cpu()'],['../classmlx_1_1core_1_1_ceil.html#a9791801fff3f8b79944e15ac2a45a035',1,'mlx::core::Ceil::eval_cpu()'],['../classmlx_1_1core_1_1_compiled.html#ac45b1d0fedd85feefbff7ce7e168b151',1,'mlx::core::Compiled::eval_cpu()'],['../classmlx_1_1core_1_1_concatenate.html#a609e76bede7fc5581ec84ddcb727a258',1,'mlx::core::Concatenate::eval_cpu()'],['../classmlx_1_1core_1_1_conjugate.html#ae39643e2178f442ffba05139f8609d61',1,'mlx::core::Conjugate::eval_cpu()'],['../classmlx_1_1core_1_1_contiguous.html#a742de24e6c0310cd85a606dec0cd8336',1,'mlx::core::Contiguous::eval_cpu()'],['../classmlx_1_1core_1_1_convolution.html#ac74256068da01730629109fa4fa8432b',1,'mlx::core::Convolution::eval_cpu()'],['../classmlx_1_1core_1_1_copy.html#af4a0ebec423e84ffe8083a5e9ed0d70c',1,'mlx::core::Copy::eval_cpu()'],['../classmlx_1_1core_1_1_cos.html#a061fc446268fe56237ae6b20ccf78152',1,'mlx::core::Cos::eval_cpu()'],['../classmlx_1_1core_1_1_cosh.html#ae8702df7e8f0e20cbeccb2a548961d3d',1,'mlx::core::Cosh::eval_cpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#adba1c40c77a2138df6b5f75483f62184',1,'mlx::core::CustomTransforms::eval_cpu()'],['../classmlx_1_1core_1_1_depends.html#a0c7ea6db97337591fa53c6e6bde41e5e',1,'mlx::core::Depends::eval_cpu()'],['../classmlx_1_1core_1_1_divide.html#a823443c2a8e8b81bbcaeee6ddbcdbf49',1,'mlx::core::Divide::eval_cpu()'],['../classmlx_1_1core_1_1_div_mod.html#ae350b7b93ad128e3133ee14f247193b3',1,'mlx::core::DivMod::eval_cpu()'],['../classmlx_1_1core_1_1_select.html#aa51aa36e0adbd69e0d23d7c7adf88de2',1,'ml
x::core::Select::eval_cpu()'],['../classmlx_1_1core_1_1_remainder.html#ac6c6c86a0bf02e6e529eb87f6e617ccc',1,'mlx::core::Remainder::eval_cpu()'],['../classmlx_1_1core_1_1_equal.html#aabb8aa61fa581defddcdca1274b1b454',1,'mlx::core::Equal::eval_cpu()'],['../classmlx_1_1core_1_1_erf.html#a84ea16e43d5b7f83bbc2d5ece78a3fb6',1,'mlx::core::Erf::eval_cpu()'],['../classmlx_1_1core_1_1_erf_inv.html#af579627402af3249565134884701d39e',1,'mlx::core::ErfInv::eval_cpu()'],['../classmlx_1_1core_1_1_exp.html#a47934c5a5023bc7ae7ae89bff45ebb2c',1,'mlx::core::Exp::eval_cpu()'],['../classmlx_1_1core_1_1_expm1.html#ab9c8b7aa50fe4592d55f8957baac647a',1,'mlx::core::Expm1::eval_cpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a6bc262a0c2b5d4fe655e3e2e0ff28635',1,'mlx::core::FFT::eval_cpu()'],['../classmlx_1_1core_1_1_floor.html#a1a7dc5f571b7b73e7ef3cbdc1dd1fcf7',1,'mlx::core::Floor::eval_cpu()'],['../classmlx_1_1core_1_1_full.html#a3dccd3756599d7fd018b2af0093b082c',1,'mlx::core::Full::eval_cpu()'],['../classmlx_1_1core_1_1_gather.html#a9ed5587f0d04b59a2b9186c0aac21290',1,'mlx::core::Gather::eval_cpu()'],['../classmlx_1_1core_1_1_greater.html#abe1c03f311d0e0b610f3392a6566f2ae',1,'mlx::core::Greater::eval_cpu()'],['../classmlx_1_1core_1_1_greater_equal.html#a15469125b9bea89b64bfeac01590c075',1,'mlx::core::GreaterEqual::eval_cpu()'],['../classmlx_1_1core_1_1_hadamard.html#ab27d6a9df42b3aab41ace3073a4c880d',1,'mlx::core::Hadamard::eval_cpu()'],['../classmlx_1_1core_1_1_imag.html#a17d1f1f9f8528668fcdf39b636720829',1,'mlx::core::Imag::eval_cpu()'],['../classmlx_1_1core_1_1_less.html#a32624124ffece066f496b3299056bcef',1,'mlx::core::Less::eval_cpu()'],['../classmlx_1_1core_1_1_less_equal.html#a55d1352b0e97841a92503bc57c19ed16',1,'mlx::core::LessEqual::eval_cpu()'],['../classmlx_1_1core_1_1_load.html#ada026ac30566f3109d8182e35d307c0a',1,'mlx::core::Load::eval_cpu()'],['../classmlx_1_1core_1_1_log.html#aadc7bb4cb24f3ecbbb9ed54a699ab74f',1,'mlx::core::Log::eval_cpu()'],['../classmlx_1_1core_1_1_log1p.html#
a8192e5438de99c4cda056987935cba23',1,'mlx::core::Log1p::eval_cpu()'],['../classmlx_1_1core_1_1_logical_not.html#acf3f7b3b20ca69533536e0e0a05725b3',1,'mlx::core::LogicalNot::eval_cpu()'],['../classmlx_1_1core_1_1_logical_and.html#adbe1c1785af1a8b827289d22b0d170b3',1,'mlx::core::LogicalAnd::eval_cpu()'],['../classmlx_1_1core_1_1_logical_or.html#a13cd4cbf26589287e85aeaaca42d7f62',1,'mlx::core::LogicalOr::eval_cpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#abef17fb590b1a8d356f2a580e45d41f0',1,'mlx::core::LogAddExp::eval_cpu()'],['../classmlx_1_1core_1_1_matmul.html#a357a7f57a2a220a91977f810a69413fc',1,'mlx::core::Matmul::eval_cpu()'],['../classmlx_1_1core_1_1_maximum.html#a62b38fbe5f96db58c2b60165ac4eadcf',1,'mlx::core::Maximum::eval_cpu()'],['../classmlx_1_1core_1_1_minimum.html#a6b93f493ee87089943a8085fe59dfc6e',1,'mlx::core::Minimum::eval_cpu()'],['../classmlx_1_1core_1_1_multiply.html#a624fce06c047cdc4dfdbdcaaddb25f34',1,'mlx::core::Multiply::eval_cpu()'],['../classmlx_1_1core_1_1_negative.html#af43553dc418c8ebe75fa9cdcba103c3b',1,'mlx::core::Negative::eval_cpu()'],['../classmlx_1_1core_1_1_not_equal.html#a8f95f8b5873850b875b1641df8196047',1,'mlx::core::NotEqual::eval_cpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#acc328321cf5300874ee884367cbede3f',1,'mlx::core::NumberOfElements::eval_cpu()'],['../classmlx_1_1core_1_1_pad.html#aaf82dd163cd536fbf97304f8b29080cb',1,'mlx::core::Pad::eval_cpu()'],['../classmlx_1_1core_1_1_partition.html#a784596ab567f9f3cb4fe1a69466523d8',1,'mlx::core::Partition::eval_cpu()'],['../classmlx_1_1core_1_1_power.html#a6783da16fb6ff393aaa57737f1973206',1,'mlx::core::Power::eval_cpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ab3dfa73b74d8f4f2e9ab4f0eb016b0e3',1,'mlx::core::QuantizedMatmul::eval_cpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a89aae98bfbdd6563df44ef7d70f0bf8c',1,'mlx::core::GatherQMM::eval_cpu()'],['../classmlx_1_1core_1_1_random_bits.html#a5752d051cd16cf5f8d4754c0a656f0d2',1,'mlx::core::RandomBits::e
val_cpu()'],['../classmlx_1_1core_1_1_real.html#a365d046caac91b521f0f5a5518037934',1,'mlx::core::Real::eval_cpu()'],['../classmlx_1_1core_1_1_reshape.html#a658de2c5f710991b48e14b2bd19b229f',1,'mlx::core::Reshape::eval_cpu()'],['../classmlx_1_1core_1_1_reduce.html#aeb8a58b560c0a09ae3a695df7829acfa',1,'mlx::core::Reduce::eval_cpu()'],['../classmlx_1_1core_1_1_round.html#ad066b0944b437f64ab546025efa00007',1,'mlx::core::Round::eval_cpu()'],['../classmlx_1_1core_1_1_scan.html#a15676d9fd066e935782a923fba3e940b',1,'mlx::core::Scan::eval_cpu()'],['../classmlx_1_1core_1_1_scatter.html#a7623f590f8b77167b5ebb4f14bc9dc97',1,'mlx::core::Scatter::eval_cpu()'],['../classmlx_1_1core_1_1_sigmoid.html#aa930ce05734cca529ebcb8d0ca8e1255',1,'mlx::core::Sigmoid::eval_cpu()'],['../classmlx_1_1core_1_1_sign.html#a7498ec993b66879be30c5d9762c45a97',1,'mlx::core::Sign::eval_cpu()'],['../classmlx_1_1core_1_1_sin.html#ab34f9cebc2aed55a0b6ab4c991f02eb5',1,'mlx::core::Sin::eval_cpu()'],['../classmlx_1_1core_1_1_sinh.html#ab6d5f6f40d177f6435f6a51c71b939dd',1,'mlx::core::Sinh::eval_cpu()'],['../classmlx_1_1core_1_1_slice.html#a4b13503f5b2f5c6a90d394b020f9b3f2',1,'mlx::core::Slice::eval_cpu()'],['../classmlx_1_1core_1_1_slice_update.html#ad82ca0e3ab88a0e086431050deea831b',1,'mlx::core::SliceUpdate::eval_cpu()'],['../classmlx_1_1core_1_1_softmax.html#ac9ebc2eab1683b682e689ed8f4622b79',1,'mlx::core::Softmax::eval_cpu()'],['../classmlx_1_1core_1_1_sort.html#a459769a0241b2620e55bedaba19827cd',1,'mlx::core::Sort::eval_cpu()'],['../classmlx_1_1core_1_1_split.html#aff2889cb9074f0fda53edf8fa40b1fd4',1,'mlx::core::Split::eval_cpu()'],['../classmlx_1_1core_1_1_square.html#a1f4d327a705950616da63b83c2829e59',1,'mlx::core::Square::eval_cpu()'],['../classmlx_1_1core_1_1_sqrt.html#a5a64ecc4eef1e30a2963435dca7cefd5',1,'mlx::core::Sqrt::eval_cpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a56207714d374b08f60e4d9cdbc7340b2',1,'mlx::core::StopGradient::eval_cpu()'],['../classmlx_1_1core_1_1_subtract.html#a4757425
8b6c95f8ad260c114d6d36a12',1,'mlx::core::Subtract::eval_cpu()'],['../classmlx_1_1core_1_1_tan.html#a9c9a731158fa60eef30067fe0da9f3e9',1,'mlx::core::Tan::eval_cpu()'],['../classmlx_1_1core_1_1_tanh.html#af7ed4345f622da069e5b0284067923f5',1,'mlx::core::Tanh::eval_cpu()'],['../classmlx_1_1core_1_1_uniform.html#a037a2c96b79b70a64f2b637c9f1a432f',1,'mlx::core::Uniform::eval_cpu()'],['../classmlx_1_1core_1_1_view.html#a0ad6deb11914a242f10e8039fcb02497',1,'mlx::core::View::eval_cpu()'],['../classmlx_1_1core_1_1_transpose.html#a1fbcfcca43f9ec06c63a3c14708c30f8',1,'mlx::core::Transpose::eval_cpu()'],['../classmlx_1_1core_1_1_q_r_f.html#a48493887395d65a27f04de1804d277d2',1,'mlx::core::QRF::eval_cpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a637f5c39fa8b10722c04a066f6c1ada6',1,'mlx::core::SVD::eval_cpu()'],['../classmlx_1_1core_1_1_inverse.html#aeb1d8dc9bc4052a616023f65b3c7bb81',1,'mlx::core::Inverse::eval_cpu()'],['../classmlx_1_1core_1_1_cholesky.html#a4bdec36c1cc99aadf9a4a39d4c57bea5',1,'mlx::core::Cholesky::eval_cpu()'],['../classmlx_1_1core_1_1_eigh.html#a894b32e17229394f6a43b4a0655fd8be',1,'mlx::core::Eigh::eval_cpu()']]],
+  ['eval_5fgpu_28',['eval_gpu',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#a52df7155f56b8450581b2fd2747cad20',1,'mlx::core::distributed::AllReduce::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a4251ce0f2db2045226b66210b828af7a',1,'mlx::core::distributed::AllGather::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_send.html#a0c8dbd2a912be91be04ec701e29fba3d',1,'mlx::core::distributed::Send::eval_gpu()'],['../classmlx_1_1core_1_1distributed_1_1_recv.html#a932e39624bc3d234a7489c3decc4749e',1,'mlx::core::distributed::Recv::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm.html#ae7955e8d43c097eecae264e804b4d8ca',1,'mlx::core::fast::RMSNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_r_m_s_norm_v_j_p.html#a48efb8fa84c4ba6cc9fb560ebbe01560',1,'mlx::core::fast::RMSNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm.html#a77abda7f47bffa2c037a5d60cccc1528',1,'mlx::core::fast::LayerNorm::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_layer_norm_v_j_p.html#a954a003a4a27c8c4c60a5a14142a9cc3',1,'mlx::core::fast::LayerNormVJP::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_ro_p_e.html#a913b6b00fc518b25ac3947e4e15790f2',1,'mlx::core::fast::RoPE::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#a505f38ba93a3499895f5312e0112e73d',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#ad51666e69f670e286293aff96eb435a9',1,'mlx::core::fast::ScaledDotProductAttention::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array 
&amp;out)'],['../classmlx_1_1core_1_1fast_1_1_affine_quantize.html#a63812b2abaf26ad7e7fa4c9e82db1628',1,'mlx::core::fast::AffineQuantize::eval_gpu()'],['../classmlx_1_1core_1_1fast_1_1_custom_kernel.html#a2ed2a16b23053f8195068386a99fd6db',1,'mlx::core::fast::CustomKernel::eval_gpu()'],['../classmlx_1_1core_1_1_primitive.html#ad217376dcf5eff691d731566faec2ba2',1,'mlx::core::Primitive::eval_gpu()'],['../classmlx_1_1core_1_1_unary_primitive.html#a6b7f80abaf038d53ec6ffbb0dfac6adb',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, array &amp;output)=0'],['../classmlx_1_1core_1_1_unary_primitive.html#a971fe9ad47f6569118879ce1d0f41447',1,'mlx::core::UnaryPrimitive::eval_gpu(const std::vector&lt; array &gt; &amp;inputs, std::vector&lt; array &gt; &amp;outputs) override'],['../classmlx_1_1core_1_1_abs.html#a0a976e636dd8505b473fbdddf949f514',1,'mlx::core::Abs::eval_gpu()'],['../classmlx_1_1core_1_1_add.html#aa0aacbc1e26b95a2f040f62aa4f69c3d',1,'mlx::core::Add::eval_gpu()'],['../classmlx_1_1core_1_1_add_m_m.html#a5f933be14baebc32a0be0f9a69148aa9',1,'mlx::core::AddMM::eval_gpu()'],['../classmlx_1_1core_1_1_arange.html#a7a2e9787c6c3a78b4a6df91206974031',1,'mlx::core::Arange::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cos.html#a46f72d4af89b0a0f5f203783fb44589c',1,'mlx::core::ArcCos::eval_gpu()'],['../classmlx_1_1core_1_1_arc_cosh.html#aa6a2587485a0e015ac2d5211d7d045fc',1,'mlx::core::ArcCosh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sin.html#a7fa4ae7a85bc8bed97ea258ae30762f3',1,'mlx::core::ArcSin::eval_gpu()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79f648a86de4c10386a1ce3b5e38e8ac',1,'mlx::core::ArcSinh::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan.html#a77866feb27028865d844070447c9a254',1,'mlx::core::ArcTan::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tan2.html#a76d3f0c29e0ff4642b8d39dac90d3f50',1,'mlx::core::ArcTan2::eval_gpu()'],['../classmlx_1_1core_1_1_arc_tanh.html#a10566b9d3b2c7d090895b46d9040bc1d',1,'mlx::core::ArcTanh::eval_gpu()'],
['../classmlx_1_1core_1_1_arg_partition.html#a9a60995eaf85f63c877e86b23cbc15fc',1,'mlx::core::ArgPartition::eval_gpu()'],['../classmlx_1_1core_1_1_arg_reduce.html#aafa982ce2abc0cd9e81e43aa2c823d29',1,'mlx::core::ArgReduce::eval_gpu()'],['../classmlx_1_1core_1_1_arg_sort.html#abc2d730850ec4ee8d7968b7417911709',1,'mlx::core::ArgSort::eval_gpu()'],['../classmlx_1_1core_1_1_as_type.html#a5b111b9d74c60d27b4a7ebaa49f96e0b',1,'mlx::core::AsType::eval_gpu()'],['../classmlx_1_1core_1_1_as_strided.html#ab6771a208323994927ca162ba7bb10ed',1,'mlx::core::AsStrided::eval_gpu()'],['../classmlx_1_1core_1_1_bitwise_binary.html#ac831a29fc46701b00bbe63ee33832afd',1,'mlx::core::BitwiseBinary::eval_gpu()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#ab372b6df4de00a33795a052a23bb1df9',1,'mlx::core::BlockMaskedMM::eval_gpu()'],['../classmlx_1_1core_1_1_gather_m_m.html#ad754c35f460a055cc383ad93a5f72da1',1,'mlx::core::GatherMM::eval_gpu()'],['../classmlx_1_1core_1_1_broadcast.html#ab9bd9dbcedcefc9b29c84911b5ce69fe',1,'mlx::core::Broadcast::eval_gpu()'],['../classmlx_1_1core_1_1_ceil.html#abe178e0058e44b6618be414215e96887',1,'mlx::core::Ceil::eval_gpu()'],['../classmlx_1_1core_1_1_compiled.html#aa3d5ff0f2b3554ad48fbbf2a0f3336d5',1,'mlx::core::Compiled::eval_gpu()'],['../classmlx_1_1core_1_1_concatenate.html#a309a1c50e97f9925866433ee2841c474',1,'mlx::core::Concatenate::eval_gpu()'],['../classmlx_1_1core_1_1_conjugate.html#aff0a802166e3724db88ab5d3feb2d3de',1,'mlx::core::Conjugate::eval_gpu()'],['../classmlx_1_1core_1_1_contiguous.html#a519cd16fd0c55b371ea7625fbb37c70f',1,'mlx::core::Contiguous::eval_gpu()'],['../classmlx_1_1core_1_1_convolution.html#a30b64109eeb1778f002b99447dff9dd2',1,'mlx::core::Convolution::eval_gpu()'],['../classmlx_1_1core_1_1_copy.html#a1eda7b2ea771a168f67421f0d384b3a1',1,'mlx::core::Copy::eval_gpu()'],['../classmlx_1_1core_1_1_cos.html#a5ef41aafad595f6cdd8c535e36e12060',1,'mlx::core::Cos::eval_gpu()'],['../classmlx_1_1core_1_1_cosh.html#a23f71b43792934c3ec0ebe9b74f3
2559',1,'mlx::core::Cosh::eval_gpu()'],['../classmlx_1_1core_1_1_custom_transforms.html#a7b3538681acbb20af3ed37b0877f6667',1,'mlx::core::CustomTransforms::eval_gpu()'],['../classmlx_1_1core_1_1_depends.html#ae5057f65e69490ad0add8eeda2b75e28',1,'mlx::core::Depends::eval_gpu()'],['../classmlx_1_1core_1_1_divide.html#abffda0ce37221ddc28dc9eea794f6bc7',1,'mlx::core::Divide::eval_gpu()'],['../classmlx_1_1core_1_1_div_mod.html#a003117c9ecf3c06a27248f72a76348dc',1,'mlx::core::DivMod::eval_gpu()'],['../classmlx_1_1core_1_1_select.html#a2a82b6cba4c386b2b87f225a4b08ea9b',1,'mlx::core::Select::eval_gpu()'],['../classmlx_1_1core_1_1_remainder.html#a7919ea9b84e42522d51bf0d5a396e161',1,'mlx::core::Remainder::eval_gpu()'],['../classmlx_1_1core_1_1_equal.html#ac3757001fec42ceb5ece2954df42161c',1,'mlx::core::Equal::eval_gpu()'],['../classmlx_1_1core_1_1_erf.html#ad8551be664d767dccc3c0d8cc1eca008',1,'mlx::core::Erf::eval_gpu()'],['../classmlx_1_1core_1_1_erf_inv.html#a4a2413d0634db1f3dae1806ddfa632db',1,'mlx::core::ErfInv::eval_gpu()'],['../classmlx_1_1core_1_1_exp.html#a7d63695a97a14760fd33b5d4e6590822',1,'mlx::core::Exp::eval_gpu()'],['../classmlx_1_1core_1_1_expm1.html#a82930071f4b77d883b300f77966aff5f',1,'mlx::core::Expm1::eval_gpu()'],['../classmlx_1_1core_1_1_f_f_t.html#a1c21b26d1e9ad7c4da78ae845721b2dd',1,'mlx::core::FFT::eval_gpu()'],['../classmlx_1_1core_1_1_floor.html#aaa29c83538099eb8f951c95a41f2eb65',1,'mlx::core::Floor::eval_gpu()'],['../classmlx_1_1core_1_1_full.html#aa54f99bb4cba12a551392dea56003872',1,'mlx::core::Full::eval_gpu()'],['../classmlx_1_1core_1_1_gather.html#aec48ee529cb2449915a7b27a3c4361e8',1,'mlx::core::Gather::eval_gpu()'],['../classmlx_1_1core_1_1_greater.html#ae8957cccf4c924d941f57a1bb751c878',1,'mlx::core::Greater::eval_gpu()'],['../classmlx_1_1core_1_1_greater_equal.html#ac246263b4548126c3d4ab7e392575d24',1,'mlx::core::GreaterEqual::eval_gpu()'],['../classmlx_1_1core_1_1_hadamard.html#a2470feb690f5463138490763c38b5733',1,'mlx::core::Hadamard::eval_g
pu()'],['../classmlx_1_1core_1_1_imag.html#a247a4d059b0a99678c6be8c15e42c1e6',1,'mlx::core::Imag::eval_gpu()'],['../classmlx_1_1core_1_1_less.html#a353335ce06ddbe8498d86d129c835917',1,'mlx::core::Less::eval_gpu()'],['../classmlx_1_1core_1_1_less_equal.html#acf035a82b11e6f63742143ea540fedac',1,'mlx::core::LessEqual::eval_gpu()'],['../classmlx_1_1core_1_1_load.html#a06933e887ea94a4d01d81195c5e07a3d',1,'mlx::core::Load::eval_gpu()'],['../classmlx_1_1core_1_1_log.html#aaaa49e9455f3a197bc319646b5ca6390',1,'mlx::core::Log::eval_gpu()'],['../classmlx_1_1core_1_1_log1p.html#a1b97decae7338d46874e736c95fa7431',1,'mlx::core::Log1p::eval_gpu()'],['../classmlx_1_1core_1_1_logical_not.html#a1d0d2bc93f935eca6c85ef7bf67f2d6a',1,'mlx::core::LogicalNot::eval_gpu()'],['../classmlx_1_1core_1_1_logical_and.html#a132b2eedaa3978de5a5350da3c2ca40f',1,'mlx::core::LogicalAnd::eval_gpu()'],['../classmlx_1_1core_1_1_logical_or.html#a3be1da328f0f8620de2e4fc1d22a077a',1,'mlx::core::LogicalOr::eval_gpu()'],['../classmlx_1_1core_1_1_log_add_exp.html#acace355b62ec00df649f9f99e8f2eb7a',1,'mlx::core::LogAddExp::eval_gpu()'],['../classmlx_1_1core_1_1_matmul.html#a8707a4e9b75c769e8f1dbca15c6a1ae7',1,'mlx::core::Matmul::eval_gpu()'],['../classmlx_1_1core_1_1_maximum.html#ade0f721b10a6b3a12bdadd34c48f72a7',1,'mlx::core::Maximum::eval_gpu()'],['../classmlx_1_1core_1_1_minimum.html#aadc68afa0afbe2103f19d161f5e0a2ba',1,'mlx::core::Minimum::eval_gpu()'],['../classmlx_1_1core_1_1_multiply.html#a634fcb4e981d8d3f4d94252caf25bee0',1,'mlx::core::Multiply::eval_gpu()'],['../classmlx_1_1core_1_1_negative.html#a97f1b316eace0c6d9e576d766940c75b',1,'mlx::core::Negative::eval_gpu()'],['../classmlx_1_1core_1_1_not_equal.html#a61179747e34e203150e9c660dfddb5f2',1,'mlx::core::NotEqual::eval_gpu()'],['../classmlx_1_1core_1_1_number_of_elements.html#a2c98c42915fb2bfe12f5c99ea553eff5',1,'mlx::core::NumberOfElements::eval_gpu()'],['../classmlx_1_1core_1_1_pad.html#aefd4d3a5bd8b6b35b266c9e558ada153',1,'mlx::core::Pad::eval_gpu(
)'],['../classmlx_1_1core_1_1_partition.html#a8eca1be21ae9ccfda46e6f3e85f506ef',1,'mlx::core::Partition::eval_gpu()'],['../classmlx_1_1core_1_1_power.html#a80577d4c0853c24027777c90a1ec7e11',1,'mlx::core::Power::eval_gpu()'],['../classmlx_1_1core_1_1_quantized_matmul.html#a2812ad007d695ed1aaf9cf706fb9c4b3',1,'mlx::core::QuantizedMatmul::eval_gpu()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a86eb048afc95646b2e96ec5493e3d887',1,'mlx::core::GatherQMM::eval_gpu()'],['../classmlx_1_1core_1_1_random_bits.html#a578756866665358577418e4cdd94aa3a',1,'mlx::core::RandomBits::eval_gpu()'],['../classmlx_1_1core_1_1_real.html#a1e209e88a43bdd1eea43ad0b03f9a7f2',1,'mlx::core::Real::eval_gpu()'],['../classmlx_1_1core_1_1_reshape.html#aa1e85f28471875750c47351520b56059',1,'mlx::core::Reshape::eval_gpu()'],['../classmlx_1_1core_1_1_reduce.html#ae9caaf42edadfe73ea208d98f526890f',1,'mlx::core::Reduce::eval_gpu()'],['../classmlx_1_1core_1_1_round.html#af7fe5ff8f3db166c203b4be4b07f13ec',1,'mlx::core::Round::eval_gpu()'],['../classmlx_1_1core_1_1_scan.html#aef22c6fc2b2cb2a907cd8965c7413dde',1,'mlx::core::Scan::eval_gpu()'],['../classmlx_1_1core_1_1_scatter.html#ab304345db3d8cfeea15e27461ae2e678',1,'mlx::core::Scatter::eval_gpu()'],['../classmlx_1_1core_1_1_sigmoid.html#a7a6bd0222d51d7f25f2719a91ccdfeca',1,'mlx::core::Sigmoid::eval_gpu()'],['../classmlx_1_1core_1_1_sign.html#afa2b48b99a194106006b44af69ffda8b',1,'mlx::core::Sign::eval_gpu()'],['../classmlx_1_1core_1_1_sin.html#a6b59f1156cf8bdad8d45acd1d825cb5e',1,'mlx::core::Sin::eval_gpu()'],['../classmlx_1_1core_1_1_sinh.html#a5a1af2399f166d5b228b5e83a1837c75',1,'mlx::core::Sinh::eval_gpu()'],['../classmlx_1_1core_1_1_slice.html#aa53c21ff06a7c659e889af6b97d10a4a',1,'mlx::core::Slice::eval_gpu()'],['../classmlx_1_1core_1_1_slice_update.html#aac1a1d122e5697be057d63552141032b',1,'mlx::core::SliceUpdate::eval_gpu()'],['../classmlx_1_1core_1_1_softmax.html#a35dac69ddcc7e2ec0e1a76fe93db85af',1,'mlx::core::Softmax::eval_gpu()'],['../classmlx_1_1c
ore_1_1_sort.html#a4141c48f0e8670c728663f3722675382',1,'mlx::core::Sort::eval_gpu()'],['../classmlx_1_1core_1_1_split.html#a78ddda89c4daee73c74cfbc1e44656df',1,'mlx::core::Split::eval_gpu()'],['../classmlx_1_1core_1_1_square.html#a0ea2a78a5bb52daa4103263bf2f98045',1,'mlx::core::Square::eval_gpu()'],['../classmlx_1_1core_1_1_sqrt.html#a6d205e679a593d1ba20206c5c47ba501',1,'mlx::core::Sqrt::eval_gpu()'],['../classmlx_1_1core_1_1_stop_gradient.html#a907b96f0a1ce608e211d87ccf2b9ca89',1,'mlx::core::StopGradient::eval_gpu()'],['../classmlx_1_1core_1_1_subtract.html#a69021b23daf061764d97fabbc0f4f06c',1,'mlx::core::Subtract::eval_gpu()'],['../classmlx_1_1core_1_1_tan.html#aca7dbb4836507005a2032ac957a04d3f',1,'mlx::core::Tan::eval_gpu()'],['../classmlx_1_1core_1_1_tanh.html#a48df896599ae93dbce84a5c0f50cf761',1,'mlx::core::Tanh::eval_gpu()'],['../classmlx_1_1core_1_1_uniform.html#a5f88cbf2495f24f87cefd99aaaebe4d0',1,'mlx::core::Uniform::eval_gpu()'],['../classmlx_1_1core_1_1_view.html#add6e12ff1e476fe1db7718b14f21b075',1,'mlx::core::View::eval_gpu()'],['../classmlx_1_1core_1_1_transpose.html#a38d25739c08aa594a6775015a1d7d92e',1,'mlx::core::Transpose::eval_gpu()'],['../classmlx_1_1core_1_1_q_r_f.html#ae5fa3482192f4713605cd07e7fc1c6c9',1,'mlx::core::QRF::eval_gpu()'],['../classmlx_1_1core_1_1_s_v_d.html#a7067b2207f826a25549d571856b94e83',1,'mlx::core::SVD::eval_gpu()'],['../classmlx_1_1core_1_1_inverse.html#a086fbbc947ad232e01686ad063a78ed2',1,'mlx::core::Inverse::eval_gpu()'],['../classmlx_1_1core_1_1_cholesky.html#a8c918594bf129888044ef37fcae56795',1,'mlx::core::Cholesky::eval_gpu()'],['../classmlx_1_1core_1_1_eigh.html#a67775b41c0a15e356f08d51d9736baa2',1,'mlx::core::Eigh::eval_gpu()']]],
   ['event_29',['Event',['../classmlx_1_1core_1_1_event.html#a833506419b2110ad1abd89b2dd238b4d',1,'mlx::core::Event::Event()=default'],['../classmlx_1_1core_1_1_event.html#a13e4835f2ffb2cc22e29148a448ea184',1,'mlx::core::Event::Event(const Stream &amp;steam)']]],
   ['event_30',['event',['../classmlx_1_1core_1_1array.html#a0a8e4d6e67e739a712876bb36f88f9bf',1,'mlx::core::array']]],
   ['exec_31',['exec',['../classpocketfft_1_1detail_1_1cfftp.html#a95211024bf007d27e700835db556fbd2',1,'pocketfft::detail::cfftp::exec()'],['../classpocketfft_1_1detail_1_1rfftp.html#a073972f42bdd3617693be7be2cb5e0ac',1,'pocketfft::detail::rfftp::exec()'],['../classpocketfft_1_1detail_1_1fftblue.html#a5fb03413a3d1a653842875adcf87ae8c',1,'pocketfft::detail::fftblue::exec()'],['../classpocketfft_1_1detail_1_1pocketfft__c.html#a436afd63e8e130f97aff103ae964a45d',1,'pocketfft::detail::pocketfft_c::exec()'],['../classpocketfft_1_1detail_1_1pocketfft__r.html#a2815bc8aa04fa986834b02e502f98b33',1,'pocketfft::detail::pocketfft_r::exec()'],['../classpocketfft_1_1detail_1_1_t__dct1.html#a7736111ff9d220f983e41a6fecd5f058',1,'pocketfft::detail::T_dct1::exec()'],['../classpocketfft_1_1detail_1_1_t__dst1.html#a598a9511004263eb3610053d7efc9e26',1,'pocketfft::detail::T_dst1::exec()'],['../classpocketfft_1_1detail_1_1_t__dcst23.html#a2a45b7b4612904c2be69c01f6d5029ac',1,'pocketfft::detail::T_dcst23::exec()'],['../classpocketfft_1_1detail_1_1_t__dcst4.html#af794ebf21009d5f918681188081df708',1,'pocketfft::detail::T_dcst4::exec()']]],
diff --git a/docs/build/html/search/functions_6.js b/docs/build/html/search/functions_6.js
index fd72ead01..2cfc4370e 100644
--- a/docs/build/html/search/functions_6.js
+++ b/docs/build/html/search/functions_6.js
@@ -15,7 +15,7 @@ var searchData=
   ['fill_5fgpu_12',['fill_gpu',['../namespacemlx_1_1core.html#ae789dbda2a0f4e21aa0984f6a5dc986c',1,'mlx::core']]],
   ['flags_13',['flags',['../classmlx_1_1core_1_1array.html#a0a20a6065ae71b64c1e3aa22a45fd8a1',1,'mlx::core::array']]],
   ['flatten_14',['flatten',['../group__ops.html#ga50aa98754b412bb57c083f6e3e95061f',1,'mlx::core::flatten(const array &amp;a, int start_axis, int end_axis=-1, StreamOrDevice s={})'],['../group__ops.html#gaa6adbc9c86f0ab27d8810a02e9e719fd',1,'mlx::core::flatten(const array &amp;a, StreamOrDevice s={})']]],
-  ['float_5fto_5fbfloat_5fbits_15',['float_to_bfloat_bits',['../backend_2metal_2kernels_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1',1,'bf16.h']]],
+  ['float_5fto_5fbfloat_5fbits_15',['float_to_bfloat_bits',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31ce5e8e860295fa236e0d4b0befeae1',1,'bf16.h']]],
   ['floor_16',['Floor',['../classmlx_1_1core_1_1_floor.html#ada4e979b784b732696313d7094e91340',1,'mlx::core::Floor']]],
   ['floor_17',['floor',['../namespacemetal.html#a020790f30c28a9982c4a83deaa258277',1,'metal::floor()'],['../namespacemetal_1_1fast.html#ac012ce1701c2339914f15cce9f2c632f',1,'metal::fast::floor()'],['../namespacemetal_1_1precise.html#a66e02b028e3cecfe7c80773460dc7925',1,'metal::precise::floor()'],['../group__ops.html#ga8d656904aa2690b60955ae745aecfc30',1,'mlx::core::floor(const array &amp;a, StreamOrDevice s={})']]],
   ['floor_5fdivide_18',['floor_divide',['../group__ops.html#ga05b4c6054d028107869511f927da01cd',1,'mlx::core']]],
@@ -28,7 +28,7 @@ var searchData=
   ['fmod_25',['fmod',['../namespacemetal.html#a2ff952d4d596a7969b2a3035fc2fda58',1,'metal::fmod()'],['../namespacemetal_1_1fast.html#adbec09f18a89f773d7e368ef04a69526',1,'metal::fast::fmod()'],['../namespacemetal_1_1precise.html#aa99937178a1fc8158054e328eeeae648',1,'metal::precise::fmod()']]],
   ['four_5fstep_5ffft_26',['four_step_fft',['../backend_2metal_2kernels_2fft_8h.html#a6558a8205ee4c3e4767bafa93f7606de',1,'fft.h']]],
   ['fract_27',['fract',['../namespacemetal.html#a6b1c15d251aeaacb1f4338a5e152ae78',1,'metal::fract()'],['../namespacemetal_1_1fast.html#aa8bb448827503e485eb649eb3edb2d4c',1,'metal::fast::fract()'],['../namespacemetal_1_1precise.html#a0f21c19332a90df1a8ff507a813b5757',1,'metal::precise::fract()']]],
-  ['frag_5fat_28',['frag_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4',1,'mlx::steel::MMATile::frag_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485',1,'mlx::steel::MMATile::frag_at(const short i, const short j) const']]],
+  ['frag_5fat_28',['frag_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4',1,'mlx::steel::MMATile::frag_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485',1,'mlx::steel::MMATile::frag_at(const short i, const short j) const'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4',1,'mlx::steel::MMATile::frag_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485',1,'mlx::steel::MMATile::frag_at(const short i, const short j) const']]],
   ['free_29',['free',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#ae963d551be646ae0e13df2c16f2beefb',1,'mlx::core::allocator::Allocator::free()'],['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#a84b50d1a3cbffa12c1a6cf0ed8c71079',1,'mlx::core::allocator::CommonAllocator::free()'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a109a0a37fb0b3be381a62dc3b1a54bf0',1,'mlx::core::metal::MetalAllocator::free()'],['../namespacemlx_1_1core_1_1allocator.html#a77f0a1215be242db6485612bcb273af5',1,'mlx::core::allocator::free()']]],
   ['frexp_30',['frexp',['../namespacemetal.html#ac89d4ef524d21a301da6c37dbd95ff9f',1,'metal::frexp()'],['../namespacemetal_1_1fast.html#a23902df22aeaa859ef673a36381387c2',1,'metal::fast::frexp()'],['../namespacemetal_1_1precise.html#a0fbb1624c308b97380f894f92fd858b4',1,'metal::precise::frexp()']]],
   ['full_31',['Full',['../classmlx_1_1core_1_1_full.html#aafcb86a2e41353853ec48c717e0c54d6',1,'mlx::core::Full']]],
diff --git a/docs/build/html/search/functions_7.js b/docs/build/html/search/functions_7.js
index 91a1bdfb7..ad5526ce0 100644
--- a/docs/build/html/search/functions_7.js
+++ b/docs/build/html/search/functions_7.js
@@ -2,13 +2,13 @@ var searchData=
 [
   ['gather_0',['Gather',['../classmlx_1_1core_1_1_gather.html#a5b5f47ceff1d43477c87be5116f261d0',1,'mlx::core::Gather']]],
   ['gather_1',['gather',['../namespacemlx_1_1core_1_1metal.html#a545de371fefba1feec2e70b7e9f4187c',1,'mlx::core::metal::gather()'],['../group__ops.html#gab6e7f655a9ff15350ca5379692f9d444',1,'mlx::core::gather(const array &amp;a, const std::vector&lt; array &gt; &amp;indices, const std::vector&lt; int &gt; &amp;axes, const std::vector&lt; int &gt; &amp;slice_sizes, StreamOrDevice s={})'],['../group__ops.html#gadb4337ca5d4f88fe9e7c083bc478158b',1,'mlx::core::gather(const array &amp;a, const array &amp;indices, int axis, const std::vector&lt; int &gt; &amp;slice_sizes, StreamOrDevice s={})']]],
-  ['gather_5fimpl_2',['gather_impl',['../gather_8h.html#abdec470e1af0109563ddae3e85e6526c',1,'gather.h']]],
+  ['gather_5fimpl_2',['gather_impl',['../gather_8h.html#a767d7c5be6f2f649101f581449af5599',1,'gather.h']]],
   ['gather_5fmm_3',['gather_mm',['../group__ops.html#ga8d50480266d258cac40ff51bcb0fc6a7',1,'mlx::core']]],
   ['gather_5fqmm_4',['gather_qmm',['../group__ops.html#ga368a0dc0e5dfb76922e7aa55a95f12f0',1,'mlx::core']]],
   ['gathermm_5',['GatherMM',['../classmlx_1_1core_1_1_gather_m_m.html#afd9bbc08138181b80e2fb86536ff3f2a',1,'mlx::core::GatherMM']]],
   ['gatherqmm_6',['GatherQMM',['../classmlx_1_1core_1_1_gather_q_m_m.html#a60ed2ade7f10dd9c9314913a810f9360',1,'mlx::core::GatherQMM']]],
   ['gemm_7',['gemm',['../namespacemlx_1_1core_1_1metal.html#ac46fd23516a61fc56d997910e4144281',1,'mlx::core::metal::gemm()'],['../steel__gemm__fused_8h.html#aa40dd40b9a0bbf20c8911032ed0c3e6d',1,'gemm():&#160;steel_gemm_fused.h']]],
-  ['gemm_5floop_8',['gemm_loop',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780',1,'mlx::steel::GEMMKernel']]],
+  ['gemm_5floop_8',['gemm_loop',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780',1,'mlx::steel::GEMMKernel::gemm_loop(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})'],['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780',1,'mlx::steel::GEMMKernel::gemm_loop(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})']]],
   ['gemm_5fsplitk_9',['gemm_splitk',['../steel__gemm__splitk_8h.html#a3be6e095a0a026d3ecf57a3e67f76188',1,'steel_gemm_splitk.h']]],
   ['gemm_5fsplitk_5faccum_10',['gemm_splitk_accum',['../steel__gemm__splitk_8h.html#abeb921bf1dc7941125188ddd390b0907',1,'steel_gemm_splitk.h']]],
   ['gemm_5fsplitk_5faccum_5faxpby_11',['gemm_splitk_accum_axpby',['../steel__gemm__splitk_8h.html#acc33fdfaaf3eb3a0629b3d52c7043dc1',1,'steel_gemm_splitk.h']]],
@@ -29,7 +29,7 @@ var searchData=
   ['get_5fcommand_5fbuffer_26',['get_command_buffer',['../classmlx_1_1core_1_1metal_1_1_device.html#a5fe3970fbe92ccc55fce4241ffbe5210',1,'mlx::core::metal::Device']]],
   ['get_5fcommand_5fbuffer_5fops_27',['get_command_buffer_ops',['../classmlx_1_1core_1_1metal_1_1_device.html#a064e1cb6a16de7a0619f6447622350f8',1,'mlx::core::metal::Device']]],
   ['get_5fcommand_5fencoder_28',['get_command_encoder',['../classmlx_1_1core_1_1metal_1_1_device.html#affa682ef612def4890f5152f81ffb7e6',1,'mlx::core::metal::Device']]],
-  ['get_5fcoord_29',['get_coord',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
+  ['get_5fcoord_29',['get_coord',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::get_coord(ushort simd_lane_id)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::get_coord(ushort simd_lane_id)']]],
   ['get_5fcopy_5fkernel_30',['get_copy_kernel',['../namespacemlx_1_1core.html#a05a220cff45f12439fde775983c6df78',1,'mlx::core']]],
   ['get_5fdefault_5fstream_31',['get_default_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a2366c7b888e433608e203752edc92282',1,'mlx::core::scheduler::Scheduler']]],
   ['get_5ffft_5fkernel_32',['get_fft_kernel',['../namespacemlx_1_1core.html#a1d4cffc3c78067b3d9a62d64f3fb686f',1,'mlx::core']]],
@@ -44,8 +44,8 @@ var searchData=
   ['get_5fpool_41',['get_pool',['../namespacepocketfft_1_1detail_1_1threading.html#a7ec2b3f99232bd0f15f7b022c59d139a',1,'pocketfft::detail::threading']]],
   ['get_5fprimitive_5fstring_42',['get_primitive_string',['../namespacemlx_1_1core.html#ad4be35b310a252edd80d9cf04f094a60',1,'mlx::core']]],
   ['get_5fquantized_5fkernel_43',['get_quantized_kernel',['../namespacemlx_1_1core.html#aa3faeae5378bfaafe3ce3432a051e43e',1,'mlx::core']]],
-  ['get_5freduce_5finit_5fkernel_44',['get_reduce_init_kernel',['../namespacemlx_1_1core.html#a3bd386cb6db09f636963ce66ceaf8647',1,'mlx::core']]],
-  ['get_5freduce_5fkernel_45',['get_reduce_kernel',['../namespacemlx_1_1core.html#a7aa91fcfe8b9caa42d60a957f11bfe6b',1,'mlx::core']]],
+  ['get_5freduce_5finit_5fkernel_44',['get_reduce_init_kernel',['../namespacemlx_1_1core.html#ae0470605dc819efeb6510183619f0299',1,'mlx::core']]],
+  ['get_5freduce_5fkernel_45',['get_reduce_kernel',['../namespacemlx_1_1core.html#a1be32ba7d67137dde7ac191dfe83ff49',1,'mlx::core']]],
   ['get_5freduction_5fplan_46',['get_reduction_plan',['../namespacemlx_1_1core.html#ac97b5a6f009ca3d99854ce9512c20dba',1,'mlx::core']]],
   ['get_5fscan_5fkernel_47',['get_scan_kernel',['../namespacemlx_1_1core.html#aeefaff208444d3fa61ecc0946fe1de5f',1,'mlx::core']]],
   ['get_5fshape_48',['get_shape',['../namespacemlx_1_1core.html#aab0d8a256957984acc1e3615c65c898e',1,'mlx::core']]],
@@ -62,16 +62,17 @@ var searchData=
   ['get_5ftwiddle_59',['get_twiddle',['../radix_8h.html#ac5cf950316b9445296ee9ecfc56a56bd',1,'radix.h']]],
   ['get_5ftype_5fstring_60',['get_type_string',['../namespacemlx_1_1core.html#af776fd91dd60594dcfebbafd17f19068',1,'mlx::core']]],
   ['get_5funary_5fkernel_61',['get_unary_kernel',['../namespacemlx_1_1core.html#afbb085188b563a54606d84f87a9bf5a6',1,'mlx::core']]],
-  ['gguf_5fload_5fquantized_62',['gguf_load_quantized',['../namespacemlx_1_1core.html#a65dd68163bdaef3631e3724327782498',1,'mlx::core']]],
-  ['good_63',['good',['../classmlx_1_1core_1_1io_1_1_reader.html#a005d0b52c1f34866f7412b7f41dabec3',1,'mlx::core::io::Reader::good()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a0b050c2c27487007e250e2e19560ffe4',1,'mlx::core::io::Writer::good()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#ac54a2c693acc3d9e6e942412148ffcc9',1,'mlx::core::io::ParallelFileReader::good()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9ec4934b26fb358d699ddce1482b2d54',1,'mlx::core::io::FileWriter::good()']]],
-  ['good_5fsize_5fcmplx_64',['good_size_cmplx',['../structpocketfft_1_1detail_1_1util.html#a758e00d242a1b7eda8f9f0c21f35c624',1,'pocketfft::detail::util']]],
-  ['good_5fsize_5freal_65',['good_size_real',['../structpocketfft_1_1detail_1_1util.html#a173da7d5929ded86fffcebcfdc5086aa',1,'pocketfft::detail::util']]],
-  ['grad_66',['grad',['../namespacemlx_1_1core.html#a3d2b2929ed4636e9e2b86e125b2e57d9',1,'mlx::core::grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;argnums)'],['../namespacemlx_1_1core.html#af482f6c64acd77c57ef5bb4b7be9726c',1,'mlx::core::grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, int argnum=0)'],['../namespacemlx_1_1core.html#a64bc619876b0f8cc81a2637ca81c99f7',1,'mlx::core::grad(const std::function&lt; array(const array &amp;)&gt; &amp;fun)']]],
-  ['greater_67',['Greater',['../classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b',1,'mlx::core::Greater']]],
-  ['greater_68',['greater',['../group__ops.html#gaf4ec7bfc1ad13b891f1f3ef1772ef04d',1,'mlx::core']]],
-  ['greater_5fequal_69',['greater_equal',['../group__ops.html#ga7153071bcfff6faad21332163fb9a430',1,'mlx::core']]],
-  ['greaterequal_70',['GreaterEqual',['../classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527',1,'mlx::core::GreaterEqual']]],
-  ['group_71',['Group',['../structmlx_1_1core_1_1distributed_1_1_group.html#a6f84accc8d6734989b2757bf6cdd0152',1,'mlx::core::distributed::Group']]],
-  ['group_72',['group',['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html#a8831cb61ac633431b78b5fb99c0ea9ff',1,'mlx::core::distributed::DistPrimitive']]],
-  ['gumbel_73',['gumbel',['../namespacemlx_1_1core_1_1random.html#aa849b765cd794306997bcbb9936d3d84',1,'mlx::core::random']]]
+  ['get_5fvar_62',['get_var',['../namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3',1,'mlx::core::env']]],
+  ['gguf_5fload_5fquantized_63',['gguf_load_quantized',['../namespacemlx_1_1core.html#a65dd68163bdaef3631e3724327782498',1,'mlx::core']]],
+  ['good_64',['good',['../classmlx_1_1core_1_1io_1_1_reader.html#a005d0b52c1f34866f7412b7f41dabec3',1,'mlx::core::io::Reader::good()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a0b050c2c27487007e250e2e19560ffe4',1,'mlx::core::io::Writer::good()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#ac54a2c693acc3d9e6e942412148ffcc9',1,'mlx::core::io::ParallelFileReader::good()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#a9ec4934b26fb358d699ddce1482b2d54',1,'mlx::core::io::FileWriter::good()']]],
+  ['good_5fsize_5fcmplx_65',['good_size_cmplx',['../structpocketfft_1_1detail_1_1util.html#a758e00d242a1b7eda8f9f0c21f35c624',1,'pocketfft::detail::util']]],
+  ['good_5fsize_5freal_66',['good_size_real',['../structpocketfft_1_1detail_1_1util.html#a173da7d5929ded86fffcebcfdc5086aa',1,'pocketfft::detail::util']]],
+  ['grad_67',['grad',['../namespacemlx_1_1core.html#a3d2b2929ed4636e9e2b86e125b2e57d9',1,'mlx::core::grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; int &gt; &amp;argnums)'],['../namespacemlx_1_1core.html#af482f6c64acd77c57ef5bb4b7be9726c',1,'mlx::core::grad(const std::function&lt; array(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, int argnum=0)'],['../namespacemlx_1_1core.html#a64bc619876b0f8cc81a2637ca81c99f7',1,'mlx::core::grad(const std::function&lt; array(const array &amp;)&gt; &amp;fun)']]],
+  ['greater_68',['Greater',['../classmlx_1_1core_1_1_greater.html#a1d5992a66c020cd97a70e8e3d8cd1a1b',1,'mlx::core::Greater']]],
+  ['greater_69',['greater',['../group__ops.html#gaf4ec7bfc1ad13b891f1f3ef1772ef04d',1,'mlx::core']]],
+  ['greater_5fequal_70',['greater_equal',['../group__ops.html#ga7153071bcfff6faad21332163fb9a430',1,'mlx::core']]],
+  ['greaterequal_71',['GreaterEqual',['../classmlx_1_1core_1_1_greater_equal.html#a19a3c49d5a9b40e17da0e56ef6908527',1,'mlx::core::GreaterEqual']]],
+  ['group_72',['Group',['../structmlx_1_1core_1_1distributed_1_1_group.html#a6f84accc8d6734989b2757bf6cdd0152',1,'mlx::core::distributed::Group']]],
+  ['group_73',['group',['../classmlx_1_1core_1_1distributed_1_1_dist_primitive.html#a8831cb61ac633431b78b5fb99c0ea9ff',1,'mlx::core::distributed::DistPrimitive']]],
+  ['gumbel_74',['gumbel',['../namespacemlx_1_1core_1_1random.html#aa849b765cd794306997bcbb9936d3d84',1,'mlx::core::random']]]
 ];
diff --git a/docs/build/html/search/functions_9.js b/docs/build/html/search/functions_9.js
index e1edfbb0f..f6f0725a4 100644
--- a/docs/build/html/search/functions_9.js
+++ b/docs/build/html/search/functions_9.js
@@ -26,7 +26,7 @@ var searchData=
   ['irfftn_23',['irfftn',['../namespacemlx_1_1core_1_1fft.html#a33f2973ea1b621e67064e46136d2960f',1,'mlx::core::fft::irfftn(const array &amp;a, const std::vector&lt; int &gt; &amp;n, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#a1c9ad11121c5879d5c04bbde2ee238c3',1,'mlx::core::fft::irfftn(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1fft.html#aaf5a7ef93b3426b94c2363a23a5a5b36',1,'mlx::core::fft::irfftn(const array &amp;a, StreamOrDevice s={})']]],
   ['is_5favailable_24',['is_available',['../classmlx_1_1core_1_1array.html#aebed1f37c19197be76105161102a8a40',1,'mlx::core::array::is_available()'],['../namespacemlx_1_1core_1_1metal.html#a0cdf2c08c7bc0927a86070adc206987f',1,'mlx::core::metal::is_available()'],['../namespacemlx_1_1core_1_1distributed.html#a95655473cd0032c06e5fe3fca85aeef3',1,'mlx::core::distributed::is_available()']]],
   ['is_5fdonatable_25',['is_donatable',['../classmlx_1_1core_1_1array.html#a4677a404b5d191af20b52649225de087',1,'mlx::core::array::is_donatable()'],['../namespacemlx_1_1core.html#af650e831ce21759da1ac103037d08d84',1,'mlx::core::is_donatable()']]],
-  ['is_5fequivalent_26',['is_equivalent',['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62',1,'mlx::core::fast::ScaledDotProductAttention::is_equivalent()'],['../classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd',1,'mlx::core::Primitive::is_equivalent()'],['../classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67',1,'mlx::core::Abs::is_equivalent()'],['../classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f',1,'mlx::core::Add::is_equivalent()'],['../classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f',1,'mlx::core::AddMM::is_equivalent()'],['../classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35',1,'mlx::core::Arange::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5',1,'mlx::core::ArcCos::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee',1,'mlx::core::ArcCosh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab',1,'mlx::core::ArcSin::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f',1,'mlx::core::ArcSinh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c',1,'mlx::core::ArcTan::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc',1,'mlx::core::ArcTan2::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2',1,'mlx::core::ArcTanh::is_equivalent()'],['../classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a',1,'mlx::core::ArgPartition::is_equivalent()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97',1,'mlx::core::ArgReduce::is_equivalent()'],['../classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845',1,'mlx::core::ArgSort::is_equivalent()'],['../classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e
3a8fb38af',1,'mlx::core::AsType::is_equivalent()'],['../classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094',1,'mlx::core::AsStrided::is_equivalent()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8',1,'mlx::core::BitwiseBinary::is_equivalent()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160',1,'mlx::core::BlockMaskedMM::is_equivalent()'],['../classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b',1,'mlx::core::GatherMM::is_equivalent()'],['../classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616',1,'mlx::core::Broadcast::is_equivalent()'],['../classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52',1,'mlx::core::Ceil::is_equivalent()'],['../classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10',1,'mlx::core::Compiled::is_equivalent()'],['../classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2',1,'mlx::core::Concatenate::is_equivalent()'],['../classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e',1,'mlx::core::Conjugate::is_equivalent()'],['../classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de',1,'mlx::core::Convolution::is_equivalent()'],['../classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da',1,'mlx::core::Copy::is_equivalent()'],['../classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417',1,'mlx::core::Cos::is_equivalent()'],['../classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9',1,'mlx::core::Cosh::is_equivalent()'],['../classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650',1,'mlx::core::Divide::is_equivalent()'],['../classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a',1,'mlx::core::DivMod::is_equivalent()'],['../classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874c8',1,'mlx::core::Select::is_equivalent()'],['../classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814',1
,'mlx::core::Remainder::is_equivalent()'],['../classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02',1,'mlx::core::Equal::is_equivalent()'],['../classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82',1,'mlx::core::Erf::is_equivalent()'],['../classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832',1,'mlx::core::ErfInv::is_equivalent()'],['../classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357',1,'mlx::core::Exp::is_equivalent()'],['../classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06',1,'mlx::core::FFT::is_equivalent()'],['../classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94',1,'mlx::core::Floor::is_equivalent()'],['../classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792',1,'mlx::core::Full::is_equivalent()'],['../classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa',1,'mlx::core::Gather::is_equivalent()'],['../classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1',1,'mlx::core::Greater::is_equivalent()'],['../classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc',1,'mlx::core::GreaterEqual::is_equivalent()'],['../classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8',1,'mlx::core::Hadamard::is_equivalent()'],['../classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5',1,'mlx::core::Imag::is_equivalent()'],['../classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63',1,'mlx::core::Less::is_equivalent()'],['../classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af',1,'mlx::core::LessEqual::is_equivalent()'],['../classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8',1,'mlx::core::Log::is_equivalent()'],['../classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99',1,'mlx::core::LogicalNot::is_equivalent()'],['../classmlx_1_1core_1_1_logical_and.html#a9572c35f72e0db2f7f86bbf42438a6be',1,'mlx::core::LogicalAnd::is_equivalent()'],['../classmlx_1_1core_1_1_logi
cal_or.html#a9c8b10a5cf5c69fdc2362390197e4e71',1,'mlx::core::LogicalOr::is_equivalent()'],['../classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4',1,'mlx::core::LogAddExp::is_equivalent()'],['../classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630',1,'mlx::core::Matmul::is_equivalent()'],['../classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46',1,'mlx::core::Maximum::is_equivalent()'],['../classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4',1,'mlx::core::Minimum::is_equivalent()'],['../classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2',1,'mlx::core::Multiply::is_equivalent()'],['../classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823',1,'mlx::core::Negative::is_equivalent()'],['../classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d',1,'mlx::core::NotEqual::is_equivalent()'],['../classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f',1,'mlx::core::NumberOfElements::is_equivalent()'],['../classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b',1,'mlx::core::Pad::is_equivalent()'],['../classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8',1,'mlx::core::Partition::is_equivalent()'],['../classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68',1,'mlx::core::Power::is_equivalent()'],['../classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1',1,'mlx::core::QuantizedMatmul::is_equivalent()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11',1,'mlx::core::GatherQMM::is_equivalent()'],['../classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6',1,'mlx::core::RandomBits::is_equivalent()'],['../classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239',1,'mlx::core::Real::is_equivalent()'],['../classmlx_1_1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3',1,'mlx::core::Reshape::is_equivalent()'],['../classmlx_1_1core_
1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e',1,'mlx::core::Reduce::is_equivalent()'],['../classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927',1,'mlx::core::Round::is_equivalent()'],['../classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6',1,'mlx::core::Scan::is_equivalent()'],['../classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f',1,'mlx::core::Scatter::is_equivalent()'],['../classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e',1,'mlx::core::Sigmoid::is_equivalent()'],['../classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb',1,'mlx::core::Sign::is_equivalent()'],['../classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a',1,'mlx::core::Sin::is_equivalent()'],['../classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d',1,'mlx::core::Sinh::is_equivalent()'],['../classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0',1,'mlx::core::Slice::is_equivalent()'],['../classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119',1,'mlx::core::SliceUpdate::is_equivalent()'],['../classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728',1,'mlx::core::Softmax::is_equivalent()'],['../classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511',1,'mlx::core::Sort::is_equivalent()'],['../classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345',1,'mlx::core::Split::is_equivalent()'],['../classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2',1,'mlx::core::Square::is_equivalent()'],['../classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46',1,'mlx::core::Sqrt::is_equivalent()'],['../classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3',1,'mlx::core::StopGradient::is_equivalent()'],['../classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b',1,'mlx::core::Subtract::is_equivalent()'],['../classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4',1,'mlx::core::Tan::is_equival
ent()'],['../classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda',1,'mlx::core::Tanh::is_equivalent()'],['../classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b',1,'mlx::core::Uniform::is_equivalent()'],['../classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64',1,'mlx::core::View::is_equivalent()'],['../classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab',1,'mlx::core::Transpose::is_equivalent()'],['../classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381',1,'mlx::core::Eigh::is_equivalent()']]],
+  ['is_5fequivalent_26',['is_equivalent',['../classmlx_1_1core_1_1fast_1_1_scaled_dot_product_attention.html#af08b1294f3f93505a96fdfa85b1edd62',1,'mlx::core::fast::ScaledDotProductAttention::is_equivalent()'],['../classmlx_1_1core_1_1_primitive.html#a6140a502af4c2bbbc776ab26e9afebcd',1,'mlx::core::Primitive::is_equivalent()'],['../classmlx_1_1core_1_1_abs.html#ab6f0ec56bc7c048382297e12dabadc67',1,'mlx::core::Abs::is_equivalent()'],['../classmlx_1_1core_1_1_add.html#aba0a35410c3aac53d0f7a0c283d9ee3f',1,'mlx::core::Add::is_equivalent()'],['../classmlx_1_1core_1_1_add_m_m.html#a6e37c6882dba995a63fb6d8dfb01754f',1,'mlx::core::AddMM::is_equivalent()'],['../classmlx_1_1core_1_1_arange.html#a7b6a45cf9c4b109d4e0373f3fe576c35',1,'mlx::core::Arange::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cos.html#a39557461e3235801886675a9b7d25bf5',1,'mlx::core::ArcCos::is_equivalent()'],['../classmlx_1_1core_1_1_arc_cosh.html#a6928e827b9ac2e86e7d5b02b78150eee',1,'mlx::core::ArcCosh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sin.html#a13b5e39eeccaf32d94b8eb85b3b753ab',1,'mlx::core::ArcSin::is_equivalent()'],['../classmlx_1_1core_1_1_arc_sinh.html#a63c7a765c7906242dc3371deec094f0f',1,'mlx::core::ArcSinh::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan.html#a0e5b5fc7218143ecd0a8666d9137c34c',1,'mlx::core::ArcTan::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tan2.html#aeaee58cd803d3ebf0b76574a409682cc',1,'mlx::core::ArcTan2::is_equivalent()'],['../classmlx_1_1core_1_1_arc_tanh.html#ac8ecdd640043dab0461d49d7650679a2',1,'mlx::core::ArcTanh::is_equivalent()'],['../classmlx_1_1core_1_1_arg_partition.html#ad87509ce70b51fb75dfb9c3a05a5b31a',1,'mlx::core::ArgPartition::is_equivalent()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03b81a670dcb1e39bf7279e4d4583b97',1,'mlx::core::ArgReduce::is_equivalent()'],['../classmlx_1_1core_1_1_arg_sort.html#a048cd09c557d29d1111726f97010a845',1,'mlx::core::ArgSort::is_equivalent()'],['../classmlx_1_1core_1_1_as_type.html#a8e6c8b2428ab15c4fb43f2e
3a8fb38af',1,'mlx::core::AsType::is_equivalent()'],['../classmlx_1_1core_1_1_as_strided.html#a1738c6aa0a3a3eb68530f0d5b436e094',1,'mlx::core::AsStrided::is_equivalent()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a8cd6b916b4838a6c329cf4df8530c3b8',1,'mlx::core::BitwiseBinary::is_equivalent()'],['../classmlx_1_1core_1_1_block_masked_m_m.html#aef1c303955f9b8f445296372cf181160',1,'mlx::core::BlockMaskedMM::is_equivalent()'],['../classmlx_1_1core_1_1_gather_m_m.html#a163f17f6ce2c002f22e81b302777342b',1,'mlx::core::GatherMM::is_equivalent()'],['../classmlx_1_1core_1_1_broadcast.html#a0e27692b0090ec451954649a36042616',1,'mlx::core::Broadcast::is_equivalent()'],['../classmlx_1_1core_1_1_ceil.html#aacd90acb56eb0649c1cef807aa21df52',1,'mlx::core::Ceil::is_equivalent()'],['../classmlx_1_1core_1_1_compiled.html#a63e5016458887813b4a59dee5a0a3f10',1,'mlx::core::Compiled::is_equivalent()'],['../classmlx_1_1core_1_1_concatenate.html#aaf8a72a0c30114460caf519580cc35d2',1,'mlx::core::Concatenate::is_equivalent()'],['../classmlx_1_1core_1_1_conjugate.html#af42f00a790c6bc5572bd8fe9e5b36c5e',1,'mlx::core::Conjugate::is_equivalent()'],['../classmlx_1_1core_1_1_contiguous.html#aa5d273a461fc6e64f3c9a67c24cb3372',1,'mlx::core::Contiguous::is_equivalent()'],['../classmlx_1_1core_1_1_convolution.html#afb87708a5e3aab2e9e663daa9d8863de',1,'mlx::core::Convolution::is_equivalent()'],['../classmlx_1_1core_1_1_copy.html#afcfa39465015f638e294aa954ea0f3da',1,'mlx::core::Copy::is_equivalent()'],['../classmlx_1_1core_1_1_cos.html#ab611ca38c987915659f7ffcce0370417',1,'mlx::core::Cos::is_equivalent()'],['../classmlx_1_1core_1_1_cosh.html#ae0bacccaf501f5349db0c13cca776ff9',1,'mlx::core::Cosh::is_equivalent()'],['../classmlx_1_1core_1_1_divide.html#a3dda091f05c4164c29bb8129e9712650',1,'mlx::core::Divide::is_equivalent()'],['../classmlx_1_1core_1_1_div_mod.html#af5fcf8ec8515d46844cbeeab6dafb38a',1,'mlx::core::DivMod::is_equivalent()'],['../classmlx_1_1core_1_1_select.html#afc3c333fac7f902c98839921ef2874
c8',1,'mlx::core::Select::is_equivalent()'],['../classmlx_1_1core_1_1_remainder.html#a802039faaa2ed7b763ec3d7debcce814',1,'mlx::core::Remainder::is_equivalent()'],['../classmlx_1_1core_1_1_equal.html#a58c1c5003e43f47dc0788c1851deaa02',1,'mlx::core::Equal::is_equivalent()'],['../classmlx_1_1core_1_1_erf.html#abe99dfbc2954c3a7d5dec56ab165ee82',1,'mlx::core::Erf::is_equivalent()'],['../classmlx_1_1core_1_1_erf_inv.html#aaac9e3b454ba564f9c6e804ab6562832',1,'mlx::core::ErfInv::is_equivalent()'],['../classmlx_1_1core_1_1_exp.html#ac6e44bffe7a643ab4ca51e74c7328357',1,'mlx::core::Exp::is_equivalent()'],['../classmlx_1_1core_1_1_f_f_t.html#a0ede3bc8b6d77d560c0a750b68fddc06',1,'mlx::core::FFT::is_equivalent()'],['../classmlx_1_1core_1_1_floor.html#a24b64feb026c4fcd02fc481cffdb1c94',1,'mlx::core::Floor::is_equivalent()'],['../classmlx_1_1core_1_1_full.html#afafcbcae1e28597fe8f7fde289105792',1,'mlx::core::Full::is_equivalent()'],['../classmlx_1_1core_1_1_gather.html#a23ff1406dbf0c770e75ad47440b467aa',1,'mlx::core::Gather::is_equivalent()'],['../classmlx_1_1core_1_1_greater.html#a6877a6888614a618dc64296763ccabb1',1,'mlx::core::Greater::is_equivalent()'],['../classmlx_1_1core_1_1_greater_equal.html#a3daef8596b963026b602019bc56fc5fc',1,'mlx::core::GreaterEqual::is_equivalent()'],['../classmlx_1_1core_1_1_hadamard.html#a8a528d8d69a7343bdfd704a3e74230b8',1,'mlx::core::Hadamard::is_equivalent()'],['../classmlx_1_1core_1_1_imag.html#a51c15ae82855edebba2ba779516465f5',1,'mlx::core::Imag::is_equivalent()'],['../classmlx_1_1core_1_1_less.html#a7d6ed6353a0dcefebd008026dbd3cd63',1,'mlx::core::Less::is_equivalent()'],['../classmlx_1_1core_1_1_less_equal.html#a76ee1438cf4bd109eae4e0b3472b26af',1,'mlx::core::LessEqual::is_equivalent()'],['../classmlx_1_1core_1_1_log.html#a2fc58ea4ca744db493b947d1136d05f8',1,'mlx::core::Log::is_equivalent()'],['../classmlx_1_1core_1_1_logical_not.html#aba53675da351cd9b71a73d475b4bbe99',1,'mlx::core::LogicalNot::is_equivalent()'],['../classmlx_1_1core_1_1_logic
al_and.html#a9572c35f72e0db2f7f86bbf42438a6be',1,'mlx::core::LogicalAnd::is_equivalent()'],['../classmlx_1_1core_1_1_logical_or.html#a9c8b10a5cf5c69fdc2362390197e4e71',1,'mlx::core::LogicalOr::is_equivalent()'],['../classmlx_1_1core_1_1_log_add_exp.html#a3cf9a202c05aff39919d713d6e2b32e4',1,'mlx::core::LogAddExp::is_equivalent()'],['../classmlx_1_1core_1_1_matmul.html#aab372b59eae0840fc4f75ef5719a2630',1,'mlx::core::Matmul::is_equivalent()'],['../classmlx_1_1core_1_1_maximum.html#a21fe93fbd7799682f481260aee8bdb46',1,'mlx::core::Maximum::is_equivalent()'],['../classmlx_1_1core_1_1_minimum.html#a56c54ee3293cc2cd84462b9ec7ac36b4',1,'mlx::core::Minimum::is_equivalent()'],['../classmlx_1_1core_1_1_multiply.html#ae288159fa2d6d35087a85aca8eafa9b2',1,'mlx::core::Multiply::is_equivalent()'],['../classmlx_1_1core_1_1_negative.html#ac2a4d8159c548639d6289980c8975823',1,'mlx::core::Negative::is_equivalent()'],['../classmlx_1_1core_1_1_not_equal.html#ac12fd6b3e2f2e7e4e622b59badf2c73d',1,'mlx::core::NotEqual::is_equivalent()'],['../classmlx_1_1core_1_1_number_of_elements.html#ad6a32565ccc64499e368e15bba0b438f',1,'mlx::core::NumberOfElements::is_equivalent()'],['../classmlx_1_1core_1_1_pad.html#aad7c3bfecafe435d6a8e807de4c7ea9b',1,'mlx::core::Pad::is_equivalent()'],['../classmlx_1_1core_1_1_partition.html#aabdf6ef4f2159b2bfe93e0e87d4772f8',1,'mlx::core::Partition::is_equivalent()'],['../classmlx_1_1core_1_1_power.html#a76b4ec9d1ff07f06189e414480453d68',1,'mlx::core::Power::is_equivalent()'],['../classmlx_1_1core_1_1_quantized_matmul.html#af28b36e3f40ea41785387800326cc8e1',1,'mlx::core::QuantizedMatmul::is_equivalent()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#a6a7da6bcf657fcdb157c45bf35fdec11',1,'mlx::core::GatherQMM::is_equivalent()'],['../classmlx_1_1core_1_1_random_bits.html#a72ec915debf5823e7c0463045b2894e6',1,'mlx::core::RandomBits::is_equivalent()'],['../classmlx_1_1core_1_1_real.html#a6d9bed396862a9e9d6abfbdcd8d8d239',1,'mlx::core::Real::is_equivalent()'],['../classmlx_1_
1core_1_1_reshape.html#abd07c53af476777a04307e0423784cf3',1,'mlx::core::Reshape::is_equivalent()'],['../classmlx_1_1core_1_1_reduce.html#abe8f3327d617d0dd7438f066497ae08e',1,'mlx::core::Reduce::is_equivalent()'],['../classmlx_1_1core_1_1_round.html#aeb3d8607bbba7345a3142d4cbd4e6927',1,'mlx::core::Round::is_equivalent()'],['../classmlx_1_1core_1_1_scan.html#a54445a4d677ca4fe2a58d08eb5223ac6',1,'mlx::core::Scan::is_equivalent()'],['../classmlx_1_1core_1_1_scatter.html#a0208172562abdc90472e6eb5f84c987f',1,'mlx::core::Scatter::is_equivalent()'],['../classmlx_1_1core_1_1_sigmoid.html#a04814ba1b0edf8299d5ca1bcb8749d8e',1,'mlx::core::Sigmoid::is_equivalent()'],['../classmlx_1_1core_1_1_sign.html#a8c0934acbcc4b146e5aacd35a8c445bb',1,'mlx::core::Sign::is_equivalent()'],['../classmlx_1_1core_1_1_sin.html#af00b0e5516f884996ce7a97e6c1e3e6a',1,'mlx::core::Sin::is_equivalent()'],['../classmlx_1_1core_1_1_sinh.html#adcb1878996fd4902cd550042dd6ad70d',1,'mlx::core::Sinh::is_equivalent()'],['../classmlx_1_1core_1_1_slice.html#a43202c3b8966ae1db9ab82072e4918b0',1,'mlx::core::Slice::is_equivalent()'],['../classmlx_1_1core_1_1_slice_update.html#a60f588acced42391e6e5615ae8d16119',1,'mlx::core::SliceUpdate::is_equivalent()'],['../classmlx_1_1core_1_1_softmax.html#a9215ed7bd36bc11276c58dfb9808d728',1,'mlx::core::Softmax::is_equivalent()'],['../classmlx_1_1core_1_1_sort.html#ae48f07cf641d54234fc4fb6529a33511',1,'mlx::core::Sort::is_equivalent()'],['../classmlx_1_1core_1_1_split.html#af25a0cc259573b9dce60d285eee18345',1,'mlx::core::Split::is_equivalent()'],['../classmlx_1_1core_1_1_square.html#a6abc881d44071019aa15481e5ea75ab2',1,'mlx::core::Square::is_equivalent()'],['../classmlx_1_1core_1_1_sqrt.html#ab871c2b8ab4a27a3f782a005d0e87c46',1,'mlx::core::Sqrt::is_equivalent()'],['../classmlx_1_1core_1_1_stop_gradient.html#a327539298b21d800d26482b94fce41b3',1,'mlx::core::StopGradient::is_equivalent()'],['../classmlx_1_1core_1_1_subtract.html#af1c05e1e3f703ba916d54f8ccbbd102b',1,'mlx::core::Subtra
ct::is_equivalent()'],['../classmlx_1_1core_1_1_tan.html#afdf46288e7f60ea7f878688347dff7e4',1,'mlx::core::Tan::is_equivalent()'],['../classmlx_1_1core_1_1_tanh.html#a0692a1de2373b86eb394252ed4fecfda',1,'mlx::core::Tanh::is_equivalent()'],['../classmlx_1_1core_1_1_uniform.html#abb6048807a7c5b2e35a77e06a17f801b',1,'mlx::core::Uniform::is_equivalent()'],['../classmlx_1_1core_1_1_view.html#a7cb8403a96a47cb258caac4e3b850f64',1,'mlx::core::View::is_equivalent()'],['../classmlx_1_1core_1_1_transpose.html#a799ec3c3fa9f1b9e6177c755252a3eab',1,'mlx::core::Transpose::is_equivalent()'],['../classmlx_1_1core_1_1_eigh.html#a09414e3fe88a952408d164d6dd0af381',1,'mlx::core::Eigh::is_equivalent()']]],
   ['is_5fopen_27',['is_open',['../classmlx_1_1core_1_1io_1_1_reader.html#a780f504058bd9c80cb3d105046a9f985',1,'mlx::core::io::Reader::is_open()'],['../classmlx_1_1core_1_1io_1_1_writer.html#a85aa36bdb0dbfb8c5b6cfd955b03417a',1,'mlx::core::io::Writer::is_open()'],['../classmlx_1_1core_1_1io_1_1_parallel_file_reader.html#a653009adbcbce8248bc666df502fdbde',1,'mlx::core::io::ParallelFileReader::is_open()'],['../classmlx_1_1core_1_1io_1_1_file_writer.html#ad5d2ee671a81700cb1658c41309d6676',1,'mlx::core::io::FileWriter::is_open()']]],
   ['is_5fpower_5fof_5f2_28',['is_power_of_2',['../namespacemlx_1_1core.html#adacbc4526e8964b267a8ec3eb1bc1a32',1,'mlx::core']]],
   ['is_5fready_29',['is_ready',['../classpocketfft_1_1detail_1_1threading_1_1latch.html#ab41ecc5adb6187aa2682ca190fd920f3',1,'pocketfft::detail::threading::latch']]],
diff --git a/docs/build/html/search/functions_a.js b/docs/build/html/search/functions_a.js
index e37825181..b44679d9d 100644
--- a/docs/build/html/search/functions_a.js
+++ b/docs/build/html/search/functions_a.js
@@ -1,4 +1,4 @@
 var searchData=
 [
-  ['jvp_0',['jvp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80',1,'mlx::core::distributed::AllReduce::jvp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913',1,'mlx::core::distributed::AllGather::jvp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584',1,'mlx::core::fast::Custom::jvp()'],['../classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2',1,'mlx::core::Primitive::jvp()'],['../classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11',1,'mlx::core::Abs::jvp()'],['../classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7',1,'mlx::core::Add::jvp()'],['../classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9',1,'mlx::core::ArcCos::jvp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7',1,'mlx::core::ArcCosh::jvp()'],['../classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4',1,'mlx::core::ArcSin::jvp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4',1,'mlx::core::ArcSinh::jvp()'],['../classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760',1,'mlx::core::ArcTan::jvp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738',1,'mlx::core::ArcTan2::jvp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a',1,'mlx::core::ArcTanh::jvp()'],['../classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595',1,'mlx::core::ArgPartition::jvp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa',1,'mlx::core::ArgReduce::jvp()'],['../classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0',1,'mlx::core::AsType::jvp()'],['../classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53',1,'mlx::core::AsStrided::jvp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d',1,'mlx::core::BitwiseBinary:
:jvp()'],['../classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece',1,'mlx::core::Broadcast::jvp()'],['../classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066',1,'mlx::core::Ceil::jvp()'],['../classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205',1,'mlx::core::Compiled::jvp()'],['../classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1',1,'mlx::core::Concatenate::jvp()'],['../classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc',1,'mlx::core::Copy::jvp()'],['../classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1',1,'mlx::core::Cos::jvp()'],['../classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863',1,'mlx::core::Cosh::jvp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720',1,'mlx::core::CustomTransforms::jvp()'],['../classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c',1,'mlx::core::Divide::jvp()'],['../classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9',1,'mlx::core::DivMod::jvp()'],['../classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6',1,'mlx::core::Select::jvp()'],['../classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79',1,'mlx::core::Remainder::jvp()'],['../classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f',1,'mlx::core::Equal::jvp()'],['../classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe',1,'mlx::core::Erf::jvp()'],['../classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be',1,'mlx::core::ErfInv::jvp()'],['../classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59',1,'mlx::core::Exp::jvp()'],['../classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1',1,'mlx::core::Expm1::jvp()'],['../classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6',1,'mlx::core::FFT::jvp()'],['../classmlx_1_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af',1,'mlx::core::Floor::jvp()'],['../classmlx_1_1core_1_1
_full.html#a281a865d0664596ac8d05ea8e7f26407',1,'mlx::core::Full::jvp()'],['../classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d',1,'mlx::core::Gather::jvp()'],['../classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1',1,'mlx::core::Greater::jvp()'],['../classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20',1,'mlx::core::GreaterEqual::jvp()'],['../classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a',1,'mlx::core::Hadamard::jvp()'],['../classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a',1,'mlx::core::Imag::jvp()'],['../classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce',1,'mlx::core::Less::jvp()'],['../classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f',1,'mlx::core::LessEqual::jvp()'],['../classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832',1,'mlx::core::Log::jvp()'],['../classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2',1,'mlx::core::Log1p::jvp()'],['../classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c',1,'mlx::core::LogicalNot::jvp()'],['../classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434',1,'mlx::core::LogicalAnd::jvp()'],['../classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4',1,'mlx::core::LogicalOr::jvp()'],['../classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329',1,'mlx::core::LogAddExp::jvp()'],['../classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39',1,'mlx::core::Maximum::jvp()'],['../classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038',1,'mlx::core::Minimum::jvp()'],['../classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4',1,'mlx::core::Multiply::jvp()'],['../classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979',1,'mlx::core::Negative::jvp()'],['../classmlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17',1,'mlx::core::NotEqual::jvp()'],['../classml
x_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72',1,'mlx::core::Pad::jvp()'],['../classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a',1,'mlx::core::Partition::jvp()'],['../classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a',1,'mlx::core::Power::jvp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23',1,'mlx::core::QuantizedMatmul::jvp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0',1,'mlx::core::GatherQMM::jvp()'],['../classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526',1,'mlx::core::Real::jvp()'],['../classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5',1,'mlx::core::Reshape::jvp()'],['../classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7',1,'mlx::core::Round::jvp()'],['../classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee',1,'mlx::core::Scan::jvp()'],['../classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934',1,'mlx::core::Scatter::jvp()'],['../classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db',1,'mlx::core::Sigmoid::jvp()'],['../classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b',1,'mlx::core::Sign::jvp()'],['../classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de',1,'mlx::core::Sin::jvp()'],['../classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c',1,'mlx::core::Sinh::jvp()'],['../classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36',1,'mlx::core::Slice::jvp()'],['../classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611',1,'mlx::core::SliceUpdate::jvp()'],['../classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f',1,'mlx::core::Softmax::jvp()'],['../classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62',1,'mlx::core::Sort::jvp()'],['../classmlx_1_1core_1_1_split.html#ab8a8d30fd1ebf0891f41f3c32eabe282',1,'mlx::core::Split::jvp()'],['../classmlx_1_1core_1_1_square.html#a822
629b93b91e2bef29959431d95e22d',1,'mlx::core::Square::jvp()'],['../classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818',1,'mlx::core::Sqrt::jvp()'],['../classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220',1,'mlx::core::Subtract::jvp()'],['../classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2',1,'mlx::core::Tan::jvp()'],['../classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a',1,'mlx::core::Tanh::jvp()'],['../classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1',1,'mlx::core::Transpose::jvp()'],['../namespacemlx_1_1core.html#a179a632200366c223d6ab56d3e032592',1,'mlx::core::jvp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;tangents)'],['../namespacemlx_1_1core.html#af38e7582db29519bb39326f6fa531d20',1,'mlx::core::jvp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;tangent)']]]
+  ['jvp_0',['jvp',['../classmlx_1_1core_1_1distributed_1_1_all_reduce.html#aeaf6f2b5955e7417cd1e36db42c45a80',1,'mlx::core::distributed::AllReduce::jvp()'],['../classmlx_1_1core_1_1distributed_1_1_all_gather.html#a96f08a4ea8453d0b4b737c7b07972913',1,'mlx::core::distributed::AllGather::jvp()'],['../classmlx_1_1core_1_1fast_1_1_custom.html#ac77b28702654df8e7d882a49357a9584',1,'mlx::core::fast::Custom::jvp()'],['../classmlx_1_1core_1_1_primitive.html#a9fecf38f53da08ba1947543c2b3158c2',1,'mlx::core::Primitive::jvp()'],['../classmlx_1_1core_1_1_abs.html#a6c1e6eeaf4f5e63898c3487106e88e11',1,'mlx::core::Abs::jvp()'],['../classmlx_1_1core_1_1_add.html#a77230069f76fe60a2fe1007822a277b7',1,'mlx::core::Add::jvp()'],['../classmlx_1_1core_1_1_arc_cos.html#a240079c616f1a1f127aa783308096fe9',1,'mlx::core::ArcCos::jvp()'],['../classmlx_1_1core_1_1_arc_cosh.html#a80fcb790649219c30260af903b76a1d7',1,'mlx::core::ArcCosh::jvp()'],['../classmlx_1_1core_1_1_arc_sin.html#a37affc8c5e84e5c54e73a71fc0821ea4',1,'mlx::core::ArcSin::jvp()'],['../classmlx_1_1core_1_1_arc_sinh.html#a79ebf2f6dfecbfbb93170fdd1ca87bf4',1,'mlx::core::ArcSinh::jvp()'],['../classmlx_1_1core_1_1_arc_tan.html#a0f5590a2297fc133b4b0a15f9dd0c760',1,'mlx::core::ArcTan::jvp()'],['../classmlx_1_1core_1_1_arc_tan2.html#a01675433f2a4fa466b2f48272dbca738',1,'mlx::core::ArcTan2::jvp()'],['../classmlx_1_1core_1_1_arc_tanh.html#a534ebdbfe77241884630d25021274c4a',1,'mlx::core::ArcTanh::jvp()'],['../classmlx_1_1core_1_1_arg_partition.html#aedea4b47f947a6fe358dd1238cdfb595',1,'mlx::core::ArgPartition::jvp()'],['../classmlx_1_1core_1_1_arg_reduce.html#a03bb925e1b488c560bc3d67ce62ba6fa',1,'mlx::core::ArgReduce::jvp()'],['../classmlx_1_1core_1_1_as_type.html#a213400967150c57da35795e1c9f65ca0',1,'mlx::core::AsType::jvp()'],['../classmlx_1_1core_1_1_as_strided.html#a8ff0a398c47b42e08bc1122e07a02b53',1,'mlx::core::AsStrided::jvp()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a1dae6ce5dc0498d20530403fe5c5531d',1,'mlx::core::BitwiseBinary:
:jvp()'],['../classmlx_1_1core_1_1_broadcast.html#ae2fc3851a117079244708864be770ece',1,'mlx::core::Broadcast::jvp()'],['../classmlx_1_1core_1_1_ceil.html#a7ad74b27d9f26c886c2af516b845f066',1,'mlx::core::Ceil::jvp()'],['../classmlx_1_1core_1_1_compiled.html#aa385fe28626856ca5f57161b47a3c205',1,'mlx::core::Compiled::jvp()'],['../classmlx_1_1core_1_1_concatenate.html#a9f9e7a9dc3a00e02b84c94e1868baff1',1,'mlx::core::Concatenate::jvp()'],['../classmlx_1_1core_1_1_contiguous.html#a1f9fcae7235e0ae9217825b78cb0f991',1,'mlx::core::Contiguous::jvp()'],['../classmlx_1_1core_1_1_copy.html#a5acf02aa360cbefd86749fe9877b29cc',1,'mlx::core::Copy::jvp()'],['../classmlx_1_1core_1_1_cos.html#a99dd0b7e4aa2c838b77736f1fd539ee1',1,'mlx::core::Cos::jvp()'],['../classmlx_1_1core_1_1_cosh.html#a79facb0882443533f36a0a18407f5863',1,'mlx::core::Cosh::jvp()'],['../classmlx_1_1core_1_1_custom_transforms.html#aa9f695100170d5cae999b3da138ce720',1,'mlx::core::CustomTransforms::jvp()'],['../classmlx_1_1core_1_1_divide.html#ae1f408c447b17b3c84fe7f951d95559c',1,'mlx::core::Divide::jvp()'],['../classmlx_1_1core_1_1_div_mod.html#a1267401f25f25847888dd0a00b3fe3b9',1,'mlx::core::DivMod::jvp()'],['../classmlx_1_1core_1_1_select.html#a172df6812c2ea3e9d3c3fc5d527548d6',1,'mlx::core::Select::jvp()'],['../classmlx_1_1core_1_1_remainder.html#a972002173fc00ee86029d12bf1a9ba79',1,'mlx::core::Remainder::jvp()'],['../classmlx_1_1core_1_1_equal.html#a659d484589d7cd96d038922a1a98730f',1,'mlx::core::Equal::jvp()'],['../classmlx_1_1core_1_1_erf.html#ac733d605d80277d613954794eb8c46fe',1,'mlx::core::Erf::jvp()'],['../classmlx_1_1core_1_1_erf_inv.html#aa52710297ab6f7cd6826418c303e64be',1,'mlx::core::ErfInv::jvp()'],['../classmlx_1_1core_1_1_exp.html#aef6721832fcc283b082e35a7d436fa59',1,'mlx::core::Exp::jvp()'],['../classmlx_1_1core_1_1_expm1.html#ad463730632a00945d3a8addfdaec67b1',1,'mlx::core::Expm1::jvp()'],['../classmlx_1_1core_1_1_f_f_t.html#a34578814b6576f7b7b447541984ecba6',1,'mlx::core::FFT::jvp()'],['../classmlx_1
_1core_1_1_floor.html#aa47bc360ec563b6e7d93e8b50626d8af',1,'mlx::core::Floor::jvp()'],['../classmlx_1_1core_1_1_full.html#a281a865d0664596ac8d05ea8e7f26407',1,'mlx::core::Full::jvp()'],['../classmlx_1_1core_1_1_gather.html#ac54ef8fac92ab190f1793f3dd95b9e8d',1,'mlx::core::Gather::jvp()'],['../classmlx_1_1core_1_1_greater.html#aa47a9f80f45daf6a405e34f6dc7c99c1',1,'mlx::core::Greater::jvp()'],['../classmlx_1_1core_1_1_greater_equal.html#ac7346080aaaa01d52896127f383f9d20',1,'mlx::core::GreaterEqual::jvp()'],['../classmlx_1_1core_1_1_hadamard.html#a22b9d55ae3ba5eef63505124696e712a',1,'mlx::core::Hadamard::jvp()'],['../classmlx_1_1core_1_1_imag.html#ac01c5ed9b886983450ed9f049ddac55a',1,'mlx::core::Imag::jvp()'],['../classmlx_1_1core_1_1_less.html#af1493d566f6d940b8f674aac17f5dfce',1,'mlx::core::Less::jvp()'],['../classmlx_1_1core_1_1_less_equal.html#addfe62d3557d216f8307bdf1cbff6a8f',1,'mlx::core::LessEqual::jvp()'],['../classmlx_1_1core_1_1_log.html#ac646d4155322c34f58183d97301e3832',1,'mlx::core::Log::jvp()'],['../classmlx_1_1core_1_1_log1p.html#a537e44c7c993daf48698082e75e71ba2',1,'mlx::core::Log1p::jvp()'],['../classmlx_1_1core_1_1_logical_not.html#a4838c483ced707cfda3d6cd24bf4667c',1,'mlx::core::LogicalNot::jvp()'],['../classmlx_1_1core_1_1_logical_and.html#a78d3be71da224ea19158cf9e8c4cf434',1,'mlx::core::LogicalAnd::jvp()'],['../classmlx_1_1core_1_1_logical_or.html#a292de6001c551214c8152a7a5b0e6bd4',1,'mlx::core::LogicalOr::jvp()'],['../classmlx_1_1core_1_1_log_add_exp.html#aea2d1d58794e86f3488219ed3fa14329',1,'mlx::core::LogAddExp::jvp()'],['../classmlx_1_1core_1_1_maximum.html#a25ac5d5b453e571bf7240aa8de103c39',1,'mlx::core::Maximum::jvp()'],['../classmlx_1_1core_1_1_minimum.html#a10acf4fef35eed7ca55d131b5ae2d038',1,'mlx::core::Minimum::jvp()'],['../classmlx_1_1core_1_1_multiply.html#a79f7f0bb70de2e3e41a66c96285325b4',1,'mlx::core::Multiply::jvp()'],['../classmlx_1_1core_1_1_negative.html#a7d918f9b26b8fb7b047a27d85ebab979',1,'mlx::core::Negative::jvp()'],['../clas
smlx_1_1core_1_1_not_equal.html#ae2d3e5776efaefed7f4c73f679b02f17',1,'mlx::core::NotEqual::jvp()'],['../classmlx_1_1core_1_1_pad.html#a6e43a42032ef11497e8d91290574ec72',1,'mlx::core::Pad::jvp()'],['../classmlx_1_1core_1_1_partition.html#a310f569a163958940ed02cf52079746a',1,'mlx::core::Partition::jvp()'],['../classmlx_1_1core_1_1_power.html#a3e78b06453faa4fd149fd19c0e7a300a',1,'mlx::core::Power::jvp()'],['../classmlx_1_1core_1_1_quantized_matmul.html#ae51fdd0b81dd26c6687577567c126e23',1,'mlx::core::QuantizedMatmul::jvp()'],['../classmlx_1_1core_1_1_gather_q_m_m.html#adc579058752b927c71b45a962d4869e0',1,'mlx::core::GatherQMM::jvp()'],['../classmlx_1_1core_1_1_real.html#adff418a54970e2344bd3c2885aae5526',1,'mlx::core::Real::jvp()'],['../classmlx_1_1core_1_1_reshape.html#ab8fc28748991017cc3e29f93c91087a5',1,'mlx::core::Reshape::jvp()'],['../classmlx_1_1core_1_1_round.html#a032075a7d0dde2dba6189636d216c5e7',1,'mlx::core::Round::jvp()'],['../classmlx_1_1core_1_1_scan.html#a6f9c862f4fbc7eaf430a361cdd8933ee',1,'mlx::core::Scan::jvp()'],['../classmlx_1_1core_1_1_scatter.html#a270fa8ccf36ce4bbbc23875139223934',1,'mlx::core::Scatter::jvp()'],['../classmlx_1_1core_1_1_sigmoid.html#a62ca1c440896e32958c77af3340847db',1,'mlx::core::Sigmoid::jvp()'],['../classmlx_1_1core_1_1_sign.html#a957992c7aa0e86cf06f861a94372086b',1,'mlx::core::Sign::jvp()'],['../classmlx_1_1core_1_1_sin.html#af662d10180967399820496477ff050de',1,'mlx::core::Sin::jvp()'],['../classmlx_1_1core_1_1_sinh.html#a86e2b37823daf20a4c74c9f273215f9c',1,'mlx::core::Sinh::jvp()'],['../classmlx_1_1core_1_1_slice.html#a8288324045ab21d6c97b1695ce86ef36',1,'mlx::core::Slice::jvp()'],['../classmlx_1_1core_1_1_slice_update.html#a0ce3248cc61dae2b51d7aa8ee4197611',1,'mlx::core::SliceUpdate::jvp()'],['../classmlx_1_1core_1_1_softmax.html#af96172634a24332b0fc8d7ca7e73f19f',1,'mlx::core::Softmax::jvp()'],['../classmlx_1_1core_1_1_sort.html#af113ac983473433eec851c8fddfcba62',1,'mlx::core::Sort::jvp()'],['../classmlx_1_1core_1_1_split.
html#ab8a8d30fd1ebf0891f41f3c32eabe282',1,'mlx::core::Split::jvp()'],['../classmlx_1_1core_1_1_square.html#a822629b93b91e2bef29959431d95e22d',1,'mlx::core::Square::jvp()'],['../classmlx_1_1core_1_1_sqrt.html#a78544b1fb5da0c14bce3051ffd177818',1,'mlx::core::Sqrt::jvp()'],['../classmlx_1_1core_1_1_subtract.html#a8100081a99df5166f02efc76d6641220',1,'mlx::core::Subtract::jvp()'],['../classmlx_1_1core_1_1_tan.html#a5d7c76122d63619df17b0e45450bc8f2',1,'mlx::core::Tan::jvp()'],['../classmlx_1_1core_1_1_tanh.html#ae0fbb5370dc1c3a4fb0dd02ca28a832a',1,'mlx::core::Tanh::jvp()'],['../classmlx_1_1core_1_1_transpose.html#ac1a523e25ab7fd9df4da363a922afbe1',1,'mlx::core::Transpose::jvp()'],['../namespacemlx_1_1core.html#a179a632200366c223d6ab56d3e032592',1,'mlx::core::jvp(const std::function&lt; std::vector&lt; array &gt;(const std::vector&lt; array &gt; &amp;)&gt; &amp;fun, const std::vector&lt; array &gt; &amp;primals, const std::vector&lt; array &gt; &amp;tangents)'],['../namespacemlx_1_1core.html#af38e7582db29519bb39326f6fa531d20',1,'mlx::core::jvp(const std::function&lt; array(const array &amp;)&gt; &amp;fun, const array &amp;primal, const array &amp;tangent)']]]
 ];
diff --git a/docs/build/html/search/functions_c.js b/docs/build/html/search/functions_c.js
index 87249a7a0..2d9b9d4a9 100644
--- a/docs/build/html/search/functions_c.js
+++ b/docs/build/html/search/functions_c.js
@@ -19,16 +19,16 @@ var searchData=
   ['lib_5fname_16',['lib_name',['../classmlx_1_1core_1_1_compiled.html#ae5c16cb91ac31b97e7652cc526c07439',1,'mlx::core::Compiled']]],
   ['linspace_17',['linspace',['../group__ops.html#ga968bcabed902311dcfbd903b0fb886ec',1,'mlx::core']]],
   ['load_18',['Load',['../classmlx_1_1core_1_1_load.html#a3aa8a537cd90bab048df47dca1ed526a',1,'mlx::core::Load']]],
-  ['load_19',['load',['../struct_read_writer.html#a120eaf4b5f32e80972a18d14e82a2d75',1,'ReadWriter::load()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96',1,'mlx::steel::MMATile::load(const threadgroup U *src)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9',1,'mlx::steel::MMATile::load(const device U *src, const int ld)'],['../struct_read_writer.html#a8a97ba42db5692898ef7391db08d8fd0',1,'ReadWriter::load() const'],['../struct_read_writer.html#a2506ee61be67826ac9494efb12a81900',1,'ReadWriter::load() const'],['../namespacemlx_1_1core.html#a954de19249da7c1fa39b89bdc47368aa',1,'mlx::core::load(array &amp;out, size_t offset, const std::shared_ptr&lt; io::Reader &gt; &amp;reader, bool swap_endianess)'],['../namespacemlx_1_1core.html#abada9bfa834d7423959362386720f3db',1,'mlx::core::load(std::shared_ptr&lt; io::Reader &gt; in_stream, StreamOrDevice s={})'],['../namespacemlx_1_1core.html#ac71a08bf4c052ae3c77e9e89cbea071d',1,'mlx::core::load(std::string file, StreamOrDevice s={})']]],
+  ['load_19',['load',['../struct_read_writer.html#a120eaf4b5f32e80972a18d14e82a2d75',1,'ReadWriter::load()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96',1,'mlx::steel::MMATile::load(const threadgroup U *src)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9',1,'mlx::steel::MMATile::load(const device U *src, const int ld)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96',1,'mlx::steel::MMATile::load(const threadgroup U *src)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9',1,'mlx::steel::MMATile::load(const device U *src, const int ld)'],['../struct_read_writer.html#a8a97ba42db5692898ef7391db08d8fd0',1,'ReadWriter::load() const'],['../struct_read_writer.html#a2506ee61be67826ac9494efb12a81900',1,'ReadWriter::load() const'],['../namespacemlx_1_1core.html#a954de19249da7c1fa39b89bdc47368aa',1,'mlx::core::load(array &amp;out, size_t offset, const std::shared_ptr&lt; io::Reader &gt; &amp;reader, bool swap_endianess)'],['../namespacemlx_1_1core.html#abada9bfa834d7423959362386720f3db',1,'mlx::core::load(std::shared_ptr&lt; io::Reader &gt; in_stream, StreamOrDevice s={})'],['../namespacemlx_1_1core.html#ac71a08bf4c052ae3c77e9e89cbea071d',1,'mlx::core::load(std::string file, StreamOrDevice s={})']]],
   ['load_5fgguf_20',['load_gguf',['../namespacemlx_1_1core.html#a2aa12b351ce559deb14cda0a5292c2ce',1,'mlx::core']]],
   ['load_5fpadded_21',['load_padded',['../struct_read_writer.html#add5bd3f647793a5a19d63197a19df73c',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#af3ce6bbb1a8dfb3bab1ae18d3eb45bc0',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const'],['../struct_read_writer.html#ab116f4569bb9dc6eaef0d8d08472e239',1,'ReadWriter::load_padded(int length, const device float2 *w_k) const']]],
-  ['load_5fsafe_22',['load_safe',['../struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7',1,'GEMVKernel::load_safe()'],['../struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b',1,'QuantizedBlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d',1,'mlx::steel::BlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da',1,'mlx::steel::MMATile::load_safe()'],['../scan_8h.html#ae8eb101e538b85f8a4bcf451489ae0ac',1,'load_safe():&#160;scan.h']]],
+  ['load_5fsafe_22',['load_safe',['../struct_g_e_m_v_kernel.html#a04bb72da9a93d6d1eba468fa311bbba7',1,'GEMVKernel::load_safe()'],['../struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b',1,'QuantizedBlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d',1,'mlx::steel::BlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d',1,'mlx::steel::BlockLoaderT::load_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da',1,'mlx::steel::MMATile::load_safe()'],['../structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d',1,'mlx::steel::BlockLoader::load_safe()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::load_safe()'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da',1,'mlx::steel::MMATile::load_safe()'],['../scan_8h.html#ae8eb101e538b85f8a4bcf451489ae0ac',1,'load_safe():&#160;scan.h']]],
   ['load_5fsafetensors_23',['load_safetensors',['../namespacemlx_1_1core.html#a96cc40e1af8c4626c813ce4859f70a5c',1,'mlx::core::load_safetensors(std::shared_ptr&lt; io::Reader &gt; in_stream, StreamOrDevice s={})'],['../namespacemlx_1_1core.html#af7eea1682a38d363c56a066321e6d526',1,'mlx::core::load_safetensors(const std::string &amp;file, StreamOrDevice s={})']]],
   ['load_5fstrided_24',['load_strided',['../struct_read_writer.html#a998ef484bade81f726b9edfc6b878197',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a3d9c8cbc582cad6b5218339d0f721559',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a795a71a8e1f154a5af415ebe1b3f0713',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a0935b946b8bf2e769427fcbf2da2f7be',1,'ReadWriter::load_strided(int stride, int overall_n)'],['../struct_read_writer.html#a7d45368c74a8b7c632659504b3273a13',1,'ReadWriter::load_strided(int stride, int overall_n)']]],
-  ['load_5funsafe_25',['load_unsafe',['../struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce',1,'GEMVKernel::load_unsafe()'],['../struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc',1,'QuantizedBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55',1,'mlx::steel::Conv2DWeightBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27',1,'mlx::steel::BlockLoader::load_unsafe()'],['../scan_8h.html#a9c415d07921f3961bad0a00a34f4a9a3',1,'load_unsafe(U values[N_READS], const device T *input):&#160;scan.h']]],
+  ['load_5funsafe_25',['load_unsafe',['../struct_g_e_m_v_kernel.html#a6013e9c5b2f72fa1311dd038172df0ce',1,'GEMVKernel::load_unsafe()'],['../struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc',1,'QuantizedBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27',1,'mlx::steel::BlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38',1,'mlx::steel::BlockLoaderT::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a961836be363409744e48e595d5e0c2ec',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8034abc10483487fc94313e3674d1111',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a69e2f7c9814d1cc1c5c267be8618dc55',1,'mlx::steel::Conv2DWeightBlockLoader::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#aa11d1a142bc868df462f48a7102147f3',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a0e262b003ac0e7ee6272585eac921704',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3859ca11b5991ef6ee9b99afdc3ea30a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8f078982186421f5b484c0b53af9c655',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::load_unsafe()'],['../structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27',1,'mlx::steel::BlockLoader::load_unsafe()'],['../scan_8h.html#a9c415d07921f3961bad0a00a34f4a9a3',1,'load_unsafe(U values[N_READS], const device T *input):&#160;scan.h']]],
   ['load_5fvector_26',['load_vector',['../quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9',1,'quantized.h']]],
   ['load_5fvector_5fsafe_27',['load_vector_safe',['../quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7',1,'quantized.h']]],
-  ['location_28',['location',['../structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2',1,'looped_elem_to_loc::location()'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::location()'],['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2',1,'looped_elem_to_loc&lt; 0, offset_t &gt;::location()']]],
+  ['location_28',['location',['../struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e',1,'LoopedElemToLoc::location()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::location()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::location()']]],
   ['log_29',['Log',['../classmlx_1_1core_1_1_log.html#a663e54790c60b56eb0ff09f4f6635fb9',1,'mlx::core::Log']]],
   ['log_30',['log',['../namespacemetal.html#a423a9f4f2fc7ef5ec7eda061277b51b6',1,'metal::log()'],['../namespacemetal_1_1fast.html#aef942e7f9e5c2e58c58644ab1bdd58d1',1,'metal::fast::log()'],['../namespacemetal_1_1precise.html#a341c2b8c27d1bed860f85f8b355023d4',1,'metal::precise::log()'],['../group__ops.html#ga6fb22d4926133573e430fcc92f4eef31',1,'mlx::core::log()']]],
   ['log10_31',['log10',['../namespacemetal.html#a042b98827baa910e9d726227cec55a80',1,'metal::log10()'],['../namespacemetal_1_1fast.html#a0d1150cf2deee5100a7ea2988b3bb39e',1,'metal::fast::log10()'],['../namespacemetal_1_1precise.html#a44239067e8e9248b1574353f98e94d72',1,'metal::precise::log10()'],['../group__ops.html#ga1fdcc7fc8819caf2e6f1c327ed4e9b9e',1,'mlx::core::log10()']]],
@@ -44,5 +44,6 @@ var searchData=
   ['logicalnot_41',['LogicalNot',['../classmlx_1_1core_1_1_logical_not.html#a6f5850b4c78b83d5e2c0d37437fc79b7',1,'mlx::core::LogicalNot']]],
   ['logicalor_42',['LogicalOr',['../classmlx_1_1core_1_1_logical_or.html#a269c22daca1c15ad010bb860bce93918',1,'mlx::core::LogicalOr']]],
   ['logsumexp_43',['logsumexp',['../group__ops.html#gacff4eb57c085d571e722083680267ac5',1,'mlx::core::logsumexp(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga59be50b4e92f1dc20b53460cefa3910d',1,'mlx::core::logsumexp(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#gae3969c7bd24c4f3ab97831df28239689',1,'mlx::core::logsumexp(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#gafef5cb2159c16a60a95470cc823bdd44',1,'mlx::core::logsumexp(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['lowest_44',['lowest',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]]
+  ['loopedelemtoloc_44',['LoopedElemToLoc',['../struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b',1,'LoopedElemToLoc::LoopedElemToLoc()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::LoopedElemToLoc()'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::LoopedElemToLoc()']]],
+  ['lowest_45',['lowest',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#ae81c58b8223e504965183c99d19a2116',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]]
 ];
diff --git a/docs/build/html/search/functions_d.js b/docs/build/html/search/functions_d.js
index 7967de990..eb37eb174 100644
--- a/docs/build/html/search/functions_d.js
+++ b/docs/build/html/search/functions_d.js
@@ -7,47 +7,49 @@ var searchData=
   ['make_5ftask_4',['make_task',['../namespacemlx_1_1core_1_1metal.html#a4552b7ccdfa7f3cc9895c09799d8048e',1,'mlx::core::metal']]],
   ['malloc_5',['malloc',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a9a17d2c7a97772bf4a15e6c74af34ca4',1,'mlx::core::allocator::Allocator::malloc()'],['../classmlx_1_1core_1_1allocator_1_1_common_allocator.html#a4f3d5de6b8c0eba22e9403b28a5ef3f0',1,'mlx::core::allocator::CommonAllocator::malloc()'],['../classmlx_1_1core_1_1metal_1_1_metal_allocator.html#a6c0feb9b1ff9977f76c69745393944bc',1,'mlx::core::metal::MetalAllocator::malloc()'],['../namespacemlx_1_1core_1_1allocator.html#a560d10a166e3c294f3757166f9bd6801',1,'mlx::core::allocator::malloc(size_t size)']]],
   ['malloc_5for_5fwait_6',['malloc_or_wait',['../namespacemlx_1_1core_1_1allocator.html#a86ac0a11ff78f21e717f641716c34abc',1,'mlx::core::allocator']]],
-  ['mat_5fat_7',['mat_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e',1,'mlx::steel::MMATile']]],
+  ['mat_5fat_7',['mat_at',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e',1,'mlx::steel::MMATile::mat_at(const short i, const short j)'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e',1,'mlx::steel::MMATile::mat_at(const short i, const short j)']]],
   ['matmul_8',['Matmul',['../classmlx_1_1core_1_1_matmul.html#adef92f30ab35e540ccb316ea6b94e6f7',1,'mlx::core::Matmul']]],
   ['matmul_9',['matmul',['../group__ops.html#ga753d59f5a9f5f2362865ee83b4dced2a',1,'mlx::core']]],
   ['max_10',['max',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a92320d40a58218e40cc414986ac95c50',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;::max()'],['../namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b',1,'metal::max()'],['../namespacemetal_1_1fast.html#a747e2e58092a27fb8b4dd3d16934fb52',1,'metal::fast::max()'],['../namespacemetal_1_1precise.html#a6a954a4e4e3753303d1dc734855a185f',1,'metal::precise::max()'],['../group__ops.html#ga7fed87d96cc7741d8267f4eac83f5fe7',1,'mlx::core::max(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga25be91d70a5f40341db0615a0b8bfedc',1,'mlx::core::max(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga1ca7b6b91fe2459a7d83897bf013827f',1,'mlx::core::max(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga7b638050e03a93f2896c981bc2850a47',1,'mlx::core::max(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
   ['max3_11',['max3',['../namespacemetal.html#a00f9c0ad66d969794614f56912eed9c9',1,'metal::max3()'],['../namespacemetal_1_1fast.html#a6fc2cf18ffa8149561864c86dba0f803',1,'metal::fast::max3()'],['../namespacemetal_1_1precise.html#ac490e8614ebd2c9343af1ae6c0d4e82c',1,'metal::precise::max3()']]],
-  ['maximum_12',['Maximum',['../classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816',1,'mlx::core::Maximum']]],
-  ['maximum_13',['maximum',['../group__ops.html#ga7ade2ea305e2e4219c3609443fb5db8d',1,'mlx::core']]],
-  ['maybeinsertbarrier_14',['maybeInsertBarrier',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991',1,'mlx::core::metal::CommandEncoder']]],
-  ['mb_5fblock_5fmerge_15',['mb_block_merge',['../sort_8h.html#ab381cd57f344bc7304ab580bfdc78807',1,'sort.h']]],
-  ['mb_5fblock_5fpartition_16',['mb_block_partition',['../sort_8h.html#a32cbe4163b8b0f5cb2c97b256119a4b2',1,'sort.h']]],
-  ['mb_5fblock_5fsort_17',['mb_block_sort',['../sort_8h.html#aa48ff1aff1e9dc1301b6781aa0721d6b',1,'sort.h']]],
-  ['mean_18',['mean',['../group__ops.html#gade46e768fd46b8b640eb16f26abeecef',1,'mlx::core::mean(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga52b59fdd8e8430538e564f5bbcfa31e6',1,'mlx::core::mean(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga066161f3d3e395a1d76c638cb680d444',1,'mlx::core::mean(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga45fba73eab0e3b6e128ed3ce2f43a5da',1,'mlx::core::mean(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['median3_19',['median3',['../namespacemetal.html#aa3ff49457ce3c93fc1c0897fd1525157',1,'metal::median3()'],['../namespacemetal_1_1fast.html#a742b55f1e4369921ee7f60d70185bfbc',1,'metal::fast::median3()'],['../namespacemetal_1_1precise.html#a14555ff99c4388493fec48e070144ae2',1,'metal::precise::median3()']]],
-  ['merge_5fpartition_20',['merge_partition',['../struct_block_merge_sort.html#ab2300cbecb23f3433bad888924c831ca',1,'BlockMergeSort::merge_partition()'],['../struct_kernel_multi_block_merge_sort.html#ab15895b4233aba0e279cc44a07a201fe',1,'KernelMultiBlockMergeSort::merge_partition()']]],
-  ['merge_5fstep_21',['merge_step',['../struct_block_merge_sort.html#ab65f190edf1851b37c39ad49ce99a43c',1,'BlockMergeSort']]],
-  ['meshgrid_22',['meshgrid',['../group__ops.html#ga577c911618575314de63d1060656a26e',1,'mlx::core']]],
-  ['metal_5fkernel_23',['metal_kernel',['../namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e',1,'mlx::core::fast']]],
-  ['min_24',['min',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;::min()'],['../namespacemetal.html#a6653b28c9473087141eddce39878d4d3',1,'metal::min()'],['../namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61',1,'metal::fast::min()'],['../namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e',1,'metal::precise::min()'],['../group__ops.html#gab27599802617a4c8f9964ab5f4ffee12',1,'mlx::core::min(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga0140b91e9cdfc3fef0da8e332f65a9e8',1,'mlx::core::min(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga6efb83cd46436678c8f8c4af15cc00f5',1,'mlx::core::min(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga36fa315eef677f4143868f552cd26d03',1,'mlx::core::min(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
-  ['min3_25',['min3',['../namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f',1,'metal::min3()'],['../namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f',1,'metal::fast::min3()'],['../namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231',1,'metal::precise::min3()']]],
-  ['minimum_26',['Minimum',['../classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5',1,'mlx::core::Minimum']]],
-  ['minimum_27',['minimum',['../group__ops.html#ga49ba00c090f81f331c91b0c97040bce0',1,'mlx::core']]],
-  ['mlx_5fatomic_5fcompare_5fexchange_5fweak_5fexplicit_28',['mlx_atomic_compare_exchange_weak_explicit',['../atomic_8h.html#ad7f32327ff66354cfa2f0cfdac79316f',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread T *expected, T val, size_t offset):&#160;atomic.h'],['../atomic_8h.html#aa8f47b2e9b95d4b00ad51f08b070deb5',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread uint *expected, uint val, size_t offset):&#160;atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fadd_5fexplicit_29',['mlx_atomic_fetch_add_explicit',['../atomic_8h.html#aad448d9e06e001700b65ca8317216a3b',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fand_5fexplicit_30',['mlx_atomic_fetch_and_explicit',['../atomic_8h.html#a253e3c870c0ddc7c28ab2f6ca2c3eae5',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_31',['mlx_atomic_fetch_max_explicit',['../atomic_8h.html#ac480f2b459a8ad9095cee353e152d00c',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_3c_20float_20_3e_32',['mlx_atomic_fetch_max_explicit&lt; float &gt;',['../atomic_8h.html#a1dce2abfa16417122c4d2bf261129ae4',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_33',['mlx_atomic_fetch_min_explicit',['../atomic_8h.html#a2ec33dca0039bd944d73d1c2b378cc19',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_3c_20float_20_3e_34',['mlx_atomic_fetch_min_explicit&lt; float &gt;',['../atomic_8h.html#ab7d1dc49f319f239b7ee0b7c72976dd0',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5fmul_5fexplicit_35',['mlx_atomic_fetch_mul_explicit',['../atomic_8h.html#adfdbea60436f14f1af9ce36e2a0a77a3',1,'atomic.h']]],
-  ['mlx_5fatomic_5ffetch_5for_5fexplicit_36',['mlx_atomic_fetch_or_explicit',['../atomic_8h.html#ab7391f197001471e4788312bdb6ab37a',1,'atomic.h']]],
-  ['mlx_5fatomic_5fload_5fexplicit_37',['mlx_atomic_load_explicit',['../atomic_8h.html#a253a4e8c2c5768a069e2791b627dfc99',1,'atomic.h']]],
-  ['mlx_5fatomic_5fstore_5fexplicit_38',['mlx_atomic_store_explicit',['../atomic_8h.html#a0ae453140b0819a4c02f265334de98c0',1,'atomic.h']]],
-  ['mma_39',['mma',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0',1,'mlx::steel::BlockMMA::mma()']]],
-  ['mmatile_40',['MMATile',['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6',1,'mlx::steel::MMATile']]],
-  ['move_5fshared_5fbuffer_41',['move_shared_buffer',['../classmlx_1_1core_1_1array.html#acce00db63e0f3d80f797b02397ade836',1,'mlx::core::array::move_shared_buffer(array other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a38d7ad605f8282e5e49d0c09e0555c78',1,'mlx::core::array::move_shared_buffer(array other)']]],
-  ['moveaxis_42',['moveaxis',['../group__ops.html#ga24067d10a842db2c9d509ea48135a2c3',1,'mlx::core']]],
-  ['mpinplace_43',['MPINPLACE',['../namespacepocketfft_1_1detail.html#af5eedf3cdfc83c0a30807092c39a9ce2',1,'pocketfft::detail']]],
-  ['mtl_5fdevice_44',['mtl_device',['../classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653',1,'mlx::core::metal::Device']]],
-  ['mtl_5fresidency_5fset_45',['mtl_residency_set',['../classmlx_1_1core_1_1metal_1_1_residency_set.html#ac4bfe5ef5e2eaebc458a1ed1953d15e9',1,'mlx::core::metal::ResidencySet']]],
-  ['multi_5fiter_46',['multi_iter',['../classpocketfft_1_1detail_1_1multi__iter.html#a9be43bb18840202da6d17988fccc64b9',1,'pocketfft::detail::multi_iter']]],
-  ['multiply_47',['Multiply',['../classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c',1,'mlx::core::Multiply']]],
-  ['multiply_48',['multiply',['../group__ops.html#gaf57392e641640b5d06e4c99518391c38',1,'mlx::core']]],
-  ['multivariate_5fnormal_49',['multivariate_normal',['../namespacemlx_1_1core_1_1random.html#a8c37da3c1c0c561cad7499d6d9db81fb',1,'mlx::core::random']]]
+  ['max_5fops_5fper_5fbuffer_12',['max_ops_per_buffer',['../namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa',1,'mlx::core::env']]],
+  ['maximum_13',['Maximum',['../classmlx_1_1core_1_1_maximum.html#a28389307e385efe1b2955b86b115e816',1,'mlx::core::Maximum']]],
+  ['maximum_14',['maximum',['../group__ops.html#ga7ade2ea305e2e4219c3609443fb5db8d',1,'mlx::core']]],
+  ['maybeinsertbarrier_15',['maybeInsertBarrier',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991',1,'mlx::core::metal::CommandEncoder']]],
+  ['mb_5fblock_5fmerge_16',['mb_block_merge',['../sort_8h.html#ab381cd57f344bc7304ab580bfdc78807',1,'sort.h']]],
+  ['mb_5fblock_5fpartition_17',['mb_block_partition',['../sort_8h.html#a32cbe4163b8b0f5cb2c97b256119a4b2',1,'sort.h']]],
+  ['mb_5fblock_5fsort_18',['mb_block_sort',['../sort_8h.html#aa48ff1aff1e9dc1301b6781aa0721d6b',1,'sort.h']]],
+  ['mean_19',['mean',['../group__ops.html#gade46e768fd46b8b640eb16f26abeecef',1,'mlx::core::mean(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga52b59fdd8e8430538e564f5bbcfa31e6',1,'mlx::core::mean(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga066161f3d3e395a1d76c638cb680d444',1,'mlx::core::mean(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga45fba73eab0e3b6e128ed3ce2f43a5da',1,'mlx::core::mean(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['median3_20',['median3',['../namespacemetal.html#aa3ff49457ce3c93fc1c0897fd1525157',1,'metal::median3()'],['../namespacemetal_1_1fast.html#a742b55f1e4369921ee7f60d70185bfbc',1,'metal::fast::median3()'],['../namespacemetal_1_1precise.html#a14555ff99c4388493fec48e070144ae2',1,'metal::precise::median3()']]],
+  ['merge_5fpartition_21',['merge_partition',['../struct_block_merge_sort.html#ab2300cbecb23f3433bad888924c831ca',1,'BlockMergeSort::merge_partition()'],['../struct_kernel_multi_block_merge_sort.html#ab15895b4233aba0e279cc44a07a201fe',1,'KernelMultiBlockMergeSort::merge_partition()']]],
+  ['merge_5fstep_22',['merge_step',['../struct_block_merge_sort.html#ab65f190edf1851b37c39ad49ce99a43c',1,'BlockMergeSort']]],
+  ['meshgrid_23',['meshgrid',['../group__ops.html#ga577c911618575314de63d1060656a26e',1,'mlx::core']]],
+  ['metal_5fkernel_24',['metal_kernel',['../namespacemlx_1_1core_1_1fast.html#ab16436b465dc10ce472193d541d8426e',1,'mlx::core::fast']]],
+  ['min_25',['min',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#adaed80031f5ca0ff69d30ec4c5d0c98f',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;::min()'],['../namespacemetal.html#a6653b28c9473087141eddce39878d4d3',1,'metal::min()'],['../namespacemetal_1_1fast.html#a3e958e56a4712687c381a0b64d123e61',1,'metal::fast::min()'],['../namespacemetal_1_1precise.html#afed0da2f7df3505b5dffa2389c3cb36e',1,'metal::precise::min()'],['../group__ops.html#gab27599802617a4c8f9964ab5f4ffee12',1,'mlx::core::min(const array &amp;a, bool keepdims, StreamOrDevice s={})'],['../group__ops.html#ga0140b91e9cdfc3fef0da8e332f65a9e8',1,'mlx::core::min(const array &amp;a, StreamOrDevice s={})'],['../group__ops.html#ga6efb83cd46436678c8f8c4af15cc00f5',1,'mlx::core::min(const array &amp;a, const std::vector&lt; int &gt; &amp;axes, bool keepdims=false, StreamOrDevice s={})'],['../group__ops.html#ga36fa315eef677f4143868f552cd26d03',1,'mlx::core::min(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
+  ['min3_26',['min3',['../namespacemetal.html#a005510c8c0f964ce2b8aad3ba76a7a3f',1,'metal::min3()'],['../namespacemetal_1_1fast.html#a606a4c1b34ce05ea89ca5af81724036f',1,'metal::fast::min3()'],['../namespacemetal_1_1precise.html#a4d37ce31c3549ca4772a4ee29798e231',1,'metal::precise::min3()']]],
+  ['minimum_27',['Minimum',['../classmlx_1_1core_1_1_minimum.html#ab0f2ce17108df44b82cff68886b0f6f5',1,'mlx::core::Minimum']]],
+  ['minimum_28',['minimum',['../group__ops.html#ga49ba00c090f81f331c91b0c97040bce0',1,'mlx::core']]],
+  ['mlx_5fatomic_5fcompare_5fexchange_5fweak_5fexplicit_29',['mlx_atomic_compare_exchange_weak_explicit',['../atomic_8h.html#ad7f32327ff66354cfa2f0cfdac79316f',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread T *expected, T val, size_t offset):&#160;atomic.h'],['../atomic_8h.html#aa8f47b2e9b95d4b00ad51f08b070deb5',1,'mlx_atomic_compare_exchange_weak_explicit(device mlx_atomic&lt; T &gt; *object, thread uint *expected, uint val, size_t offset):&#160;atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fadd_5fexplicit_30',['mlx_atomic_fetch_add_explicit',['../atomic_8h.html#aad448d9e06e001700b65ca8317216a3b',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fand_5fexplicit_31',['mlx_atomic_fetch_and_explicit',['../atomic_8h.html#a253e3c870c0ddc7c28ab2f6ca2c3eae5',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_32',['mlx_atomic_fetch_max_explicit',['../atomic_8h.html#ac480f2b459a8ad9095cee353e152d00c',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmax_5fexplicit_3c_20float_20_3e_33',['mlx_atomic_fetch_max_explicit&lt; float &gt;',['../atomic_8h.html#a1dce2abfa16417122c4d2bf261129ae4',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_34',['mlx_atomic_fetch_min_explicit',['../atomic_8h.html#a2ec33dca0039bd944d73d1c2b378cc19',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmin_5fexplicit_3c_20float_20_3e_35',['mlx_atomic_fetch_min_explicit&lt; float &gt;',['../atomic_8h.html#ab7d1dc49f319f239b7ee0b7c72976dd0',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5fmul_5fexplicit_36',['mlx_atomic_fetch_mul_explicit',['../atomic_8h.html#adfdbea60436f14f1af9ce36e2a0a77a3',1,'atomic.h']]],
+  ['mlx_5fatomic_5ffetch_5for_5fexplicit_37',['mlx_atomic_fetch_or_explicit',['../atomic_8h.html#ab7391f197001471e4788312bdb6ab37a',1,'atomic.h']]],
+  ['mlx_5fatomic_5fload_5fexplicit_38',['mlx_atomic_load_explicit',['../atomic_8h.html#a253a4e8c2c5768a069e2791b627dfc99',1,'atomic.h']]],
+  ['mlx_5fatomic_5fstore_5fexplicit_39',['mlx_atomic_store_explicit',['../atomic_8h.html#a0ae453140b0819a4c02f265334de98c0',1,'atomic.h']]],
+  ['mma_40',['mma',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0',1,'mlx::steel::BlockMMA::mma()'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)'],['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mma(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)'],['../structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0',1,'mlx::steel::BlockMMA::mma()']]],
+  ['mmatile_41',['MMATile',['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6',1,'mlx::steel::MMATile::MMATile() thread'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6',1,'mlx::steel::MMATile::MMATile() thread']]],
+  ['move_5for_5fcopy_42',['move_or_copy',['../namespacemlx_1_1core.html#a830a47d8a317dffb0c88e5a7afe6aee2',1,'mlx::core::move_or_copy(const array &amp;in, array &amp;out)'],['../namespacemlx_1_1core.html#aae1e770954edf1f9a35d19e0de4d857a',1,'mlx::core::move_or_copy(const array &amp;in, array &amp;out, const std::vector&lt; size_t &gt; &amp;strides, array::Flags flags, size_t data_size, size_t offset=0)']]],
+  ['move_5fshared_5fbuffer_43',['move_shared_buffer',['../classmlx_1_1core_1_1array.html#acce00db63e0f3d80f797b02397ade836',1,'mlx::core::array::move_shared_buffer(array other, const std::vector&lt; size_t &gt; &amp;strides, Flags flags, size_t data_size, size_t offset=0)'],['../classmlx_1_1core_1_1array.html#a38d7ad605f8282e5e49d0c09e0555c78',1,'mlx::core::array::move_shared_buffer(array other)']]],
+  ['moveaxis_44',['moveaxis',['../group__ops.html#ga24067d10a842db2c9d509ea48135a2c3',1,'mlx::core']]],
+  ['mpinplace_45',['MPINPLACE',['../namespacepocketfft_1_1detail.html#af5eedf3cdfc83c0a30807092c39a9ce2',1,'pocketfft::detail']]],
+  ['mtl_5fdevice_46',['mtl_device',['../classmlx_1_1core_1_1metal_1_1_device.html#a31dba377f2be44a746db10d1b9367653',1,'mlx::core::metal::Device']]],
+  ['mtl_5fresidency_5fset_47',['mtl_residency_set',['../classmlx_1_1core_1_1metal_1_1_residency_set.html#ac4bfe5ef5e2eaebc458a1ed1953d15e9',1,'mlx::core::metal::ResidencySet']]],
+  ['multi_5fiter_48',['multi_iter',['../classpocketfft_1_1detail_1_1multi__iter.html#a9be43bb18840202da6d17988fccc64b9',1,'pocketfft::detail::multi_iter']]],
+  ['multiply_49',['Multiply',['../classmlx_1_1core_1_1_multiply.html#aca5c50f900321f3eb4d6fbcbc225c00c',1,'mlx::core::Multiply']]],
+  ['multiply_50',['multiply',['../group__ops.html#gaf57392e641640b5d06e4c99518391c38',1,'mlx::core']]],
+  ['multivariate_5fnormal_51',['multivariate_normal',['../namespacemlx_1_1core_1_1random.html#a8c37da3c1c0c561cad7499d6d9db81fb',1,'mlx::core::random']]]
 ];
diff --git a/docs/build/html/search/functions_e.js b/docs/build/html/search/functions_e.js
index 76ac25c56..d8bf69263 100644
--- a/docs/build/html/search/functions_e.js
+++ b/docs/build/html/search/functions_e.js
@@ -11,7 +11,7 @@ var searchData=
   ['new_5fqueue_8',['new_queue',['../classmlx_1_1core_1_1metal_1_1_device.html#a8135ae2a8c1e6f3861e84d4e60c28b67',1,'mlx::core::metal::Device']]],
   ['new_5fscoped_5fmemory_5fpool_9',['new_scoped_memory_pool',['../namespacemlx_1_1core_1_1metal.html#a46583a1aba89449fa72e6cb3a7090981',1,'mlx::core::metal']]],
   ['new_5fstream_10',['new_stream',['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a157c8da85fa1bddb8eacf8515a3cc879',1,'mlx::core::scheduler::Scheduler::new_stream()'],['../namespacemlx_1_1core_1_1metal.html#a8b4188f9a090a1da42d62b8a369bf106',1,'mlx::core::metal::new_stream()'],['../namespacemlx_1_1core.html#a6f7c63a9be10337b3b96d527e1db3c2f',1,'mlx::core::new_stream()']]],
-  ['next_11',['next',['../struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9',1,'QuantizedBlockLoader::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce',1,'mlx::steel::Conv2DWeightBlockLoader::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8',1,'mlx::steel::BlockLoader::next()'],['../structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca',1,'looped_elem_to_loc::next(const constant int *shape, const constant size_t *strides)'],['../structlooped__elem__to__loc.html#add610f331ef8d7d2d1917050890f82b2',1,'looped_elem_to_loc::next(int n, const constant int *shape, const constant size_t *strides)'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::next(const constant int *, const constant size_t 
*strides)'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#af2984b35f7d7300d4812e7872b3c8851',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::next(int n, const constant int *, const constant size_t *strides)'],['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0',1,'looped_elem_to_loc&lt; 0, offset_t &gt;::next(const constant int *, const constant size_t *)'],['../structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a1064cdfdcef779b5628ce5357a6fe4f0',1,'looped_elem_to_loc&lt; 0, offset_t &gt;::next(int, const constant int *, const constant size_t *)'],['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18',1,'mlx::core::random::KeySequence::next()']]],
+  ['next_11',['next',['../struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9',1,'QuantizedBlockLoader::next()'],['../structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8',1,'mlx::steel::BlockLoader::next()'],['../structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697',1,'mlx::steel::BlockLoaderT::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a3b71f379ff9baf39830c92f4f1ecde52',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a78d2b0098311a278be8394edbd5fc731',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aae56c19bb562219770fec38e5666c6ce',1,'mlx::steel::Conv2DWeightBlockLoader::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af9ce1a767266664bea131a5437002c80',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a30b10bebde7f08b89d03bdd9ea0f48da',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::next()'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3e5ee68ed0ee43f7e979dd4222f76a8c',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a11743cb1c108f42ccdc6e59204a5b3e8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::next()'],['../structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8',1,'mlx::steel::BlockLoader::next()'],['../struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205',1,'LoopedElemToLoc::next(const constant int *shape, const constant size_t *strides)'],['../struct_looped_elem_to_loc.html#a7da7bd04e79ba86f71c535b5a6ec1a2d',1,'LoopedElemToLoc::next(int n, const constant int *shape, const constant size_t 
*strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::next(const constant int *shape, const constant size_t *strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a8fe55b3a2fa8cd35af568085faed785d',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::next(int n, const constant int *shape, const constant size_t *strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::next(const constant int *, const constant size_t *strides)'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af8f2b29946324756c09951b69e170dd8',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::next(int n, const constant int *, const constant size_t *strides)'],['../classmlx_1_1core_1_1random_1_1_key_sequence.html#a4193c5eac3ef093a740d5305b25d3e18',1,'mlx::core::random::KeySequence::next()']]],
   ['next_5fpower_5fof_5f2_12',['next_power_of_2',['../namespacemlx_1_1core.html#a685c0530e338aabc622325685846ce93',1,'mlx::core']]],
   ['nextafter_13',['nextafter',['../namespacemetal.html#a9547fd7b09164931986f6db4813bd72d',1,'metal::nextafter()'],['../namespacemetal_1_1fast.html#a4583e8be04fc0bd475b97b0934604f23',1,'metal::fast::nextafter()'],['../namespacemetal_1_1precise.html#ad012ceeb55b77f1533749b351331e026',1,'metal::precise::nextafter()']]],
   ['norm_14',['norm',['../namespacemlx_1_1core_1_1linalg.html#aba765b8e95e9a1d33d31f727a185919d',1,'mlx::core::linalg::norm(const array &amp;a, const double ord, const std::optional&lt; std::vector&lt; int &gt; &gt; &amp;axis=std::nullopt, bool keepdims=false, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1linalg.html#acaa85b4146821c268abecec2422c02d2',1,'mlx::core::linalg::norm(const array &amp;a, const double ord, int axis, bool keepdims=false, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1linalg.html#af1ebe0c6dcba9a1c49b5e397dddf3264',1,'mlx::core::linalg::norm(const array &amp;a, const std::string &amp;ord, const std::optional&lt; std::vector&lt; int &gt; &gt; &amp;axis=std::nullopt, bool keepdims=false, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1linalg.html#ae8da67e4c6e073f93889f1051203cd9e',1,'mlx::core::linalg::norm(const array &amp;a, const std::string &amp;ord, int axis, bool keepdims=false, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1linalg.html#a229018071d5602e38d6248230f334a10',1,'mlx::core::linalg::norm(const array &amp;a, const std::optional&lt; std::vector&lt; int &gt; &gt; &amp;axis=std::nullopt, bool keepdims=false, StreamOrDevice s={})'],['../namespacemlx_1_1core_1_1linalg.html#a44250cff34238f01471fd61e76036f03',1,'mlx::core::linalg::norm(const array &amp;a, int axis, bool keepdims=false, StreamOrDevice s={})']]],
diff --git a/docs/build/html/search/functions_f.js b/docs/build/html/search/functions_f.js
index 9142546c7..976a1fc85 100644
--- a/docs/build/html/search/functions_f.js
+++ b/docs/build/html/search/functions_f.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['offset_5fneg_5fidx_0',['offset_neg_idx',['../kernels_2indexing_8h.html#ab41167dc537c06fbdb4df100972393df',1,'indexing.h']]],
+  ['offset_5fneg_5fidx_0',['offset_neg_idx',['../kernels_2indexing_8h.html#a58a65ea6215999cd4ccb4fe757cc2dc8',1,'indexing.h']]],
   ['ofs_1',['ofs',['../classpocketfft_1_1detail_1_1simple__iter.html#ab59481ad9c8f04addb907c3ebb89f8fa',1,'pocketfft::detail::simple_iter::ofs()'],['../classpocketfft_1_1detail_1_1rev__iter.html#a78c3b4ad19edf9d20cab40ad109e9dd1',1,'pocketfft::detail::rev_iter::ofs()']]],
   ['ones_2',['ones',['../group__ops.html#ga54eeed455321a54c8e72e16552a978f2',1,'mlx::core::ones(const std::vector&lt; int &gt; &amp;shape, Dtype dtype, StreamOrDevice s={})'],['../group__ops.html#ga6cf4b5e8580e4436302c519d05897dab',1,'mlx::core::ones(const std::vector&lt; int &gt; &amp;shape, StreamOrDevice s={})']]],
   ['ones_5flike_3',['ones_like',['../group__ops.html#ga94f8d3b1906fee99da9cbe39f7be7d42',1,'mlx::core']]],
@@ -8,42 +8,41 @@ var searchData=
   ['operator_20bool_5',['operator bool',['../struct___no_mask.html#ad3723c1e70e46beefd283ce6317416cb',1,'_NoMask::operator bool()'],['../struct___no_mask.html#aafbf8a3201e1cc1abf74dd1f1b7272cd',1,'_NoMask::operator bool() const threadgroup'],['../struct___no_mask.html#a73e9612a619885cbc97cbd8f40df71e7',1,'_NoMask::operator bool() const device'],['../struct___no_mask.html#a4bf336d472bc677028250f76b9cdc08c',1,'_NoMask::operator bool() const constant'],['../struct___no_mask.html#ad3723c1e70e46beefd283ce6317416cb',1,'_NoMask::operator bool()'],['../struct___no_mask.html#aafbf8a3201e1cc1abf74dd1f1b7272cd',1,'_NoMask::operator bool() const threadgroup'],['../struct___no_mask.html#a73e9612a619885cbc97cbd8f40df71e7',1,'_NoMask::operator bool() const device'],['../struct___no_mask.html#a4bf336d472bc677028250f76b9cdc08c',1,'_NoMask::operator bool() const constant']]],
   ['operator_20dtype_6',['operator Dtype',['../structmlx_1_1core_1_1_type_to_dtype.html#aefdd0fd6a5bbf0197a3996ccd4adea13',1,'mlx::core::TypeToDtype']]],
   ['operator_20float_7',['operator float',['../structmlx_1_1core_1_1___m_l_x___b_float16.html#aaae72e5340ce91325f1925be36ba46cb',1,'mlx::core::_MLX_BFloat16::operator float()'],['../structmlx_1_1core_1_1complex128__t.html#a3e2faf180c0b785646a0e4296f709a5e',1,'mlx::core::complex128_t::operator float()'],['../structmlx_1_1core_1_1complex64__t.html#a90d224dd37308345086bb9cc882ef6fc',1,'mlx::core::complex64_t::operator float()'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a363de5054f3673bddc90293fc3c9bb99',1,'mlx::core::_MLX_Float16::operator float()']]],
-  ['operator_20t_8',['operator T',['../struct___m_l_x___b_float16.html#aa7dfefdf0d15e102d2b8258c9ab01836',1,'_MLX_BFloat16::operator T() const thread'],['../struct___m_l_x___b_float16.html#a2546a8afa77e14ed5b3c5da79a281260',1,'_MLX_BFloat16::operator T() const threadgroup'],['../struct___m_l_x___b_float16.html#a1d523f87740fcb852db6ab57896c245a',1,'_MLX_BFloat16::operator T() const device'],['../struct___m_l_x___b_float16.html#a95acd29283024d7093a0bc58c9468a0a',1,'_MLX_BFloat16::operator T() const constant'],['../structcomplex64__t.html#a70e9b16031eeaff3baa601f400023fcd',1,'complex64_t::operator T() const thread'],['../structcomplex64__t.html#a4f3beea7ab6001189b782a74d1746b67',1,'complex64_t::operator T() const threadgroup'],['../structcomplex64__t.html#a9f4f7eca89ffe6c8d126a4145df6d9f2',1,'complex64_t::operator T() const device'],['../structcomplex64__t.html#ac33e2e5263fec76a4fb4418c6e1d8d14',1,'complex64_t::operator T() const constant']]],
+  ['operator_20t_8',['operator T',['../structcomplex64__t.html#a70e9b16031eeaff3baa601f400023fcd',1,'complex64_t::operator T() const thread'],['../structcomplex64__t.html#a4f3beea7ab6001189b782a74d1746b67',1,'complex64_t::operator T() const threadgroup'],['../structcomplex64__t.html#a9f4f7eca89ffe6c8d126a4145df6d9f2',1,'complex64_t::operator T() const device'],['../structcomplex64__t.html#ac33e2e5263fec76a4fb4418c6e1d8d14',1,'complex64_t::operator T() const constant'],['../struct___m_l_x___b_float16.html#aa7dfefdf0d15e102d2b8258c9ab01836',1,'_MLX_BFloat16::operator T() const thread'],['../struct___m_l_x___b_float16.html#a2546a8afa77e14ed5b3c5da79a281260',1,'_MLX_BFloat16::operator T() const threadgroup'],['../struct___m_l_x___b_float16.html#a1d523f87740fcb852db6ab57896c245a',1,'_MLX_BFloat16::operator T() const device'],['../struct___m_l_x___b_float16.html#a95acd29283024d7093a0bc58c9468a0a',1,'_MLX_BFloat16::operator T() const constant']]],
   ['operator_20val_9',['operator Val',['../structmlx_1_1core_1_1_dtype.html#a3b3bc059be5836476da3cb88a4f5e9fd',1,'mlx::core::Dtype']]],
   ['operator_20value_5ftype_10',['operator value_type',['../structmlx_1_1steel_1_1integral__constant.html#a0c11203bed44a6a2c387b365134dcd64',1,'mlx::steel::integral_constant']]],
-  ['operator_21_3d_11',['operator!=',['../backend_2metal_2kernels_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55',1,'operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a6baa722c22d66c7510786bb275cb8cc2',1,'operator!=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa8d9f01582a0a9f01a666d110c74db2a',1,'operator!=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa504a474ab6e00ebe2b1b7ed2f7d1ffb',1,'operator!=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abf5f3040227f021a5b84cf2eda248b2f',1,'operator!=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a347c9bbf816bad2e9e5e91aa448f8b65',1,'operator!=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a33ea086b561c652f25833a5e1ded34dd',1,'operator!=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2bbdcece13148826d3fe33af727bb79b',1,'operator!=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aeb1efa47c5f22cc0b35d49ccce73c406',1,'operator!=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa6b99cde403405df1865c989e4ce845a',1,'operator!=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a204d13a881ae8d337f6efbb98673790c',1,'operator!=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3602117b4c61d5cd4fd72fb8e5f68bd6',1,'operator!=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2721c088adfc9d73cde442d6badd2a6c',1,'operator!=(uint64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa4364eda56525cf7576ff00e550175e6',1,'mlx::steel::operator!=()'],['../namespacemlx_1_1core.html#a94d00a1b7f8a4717ab3f26f45e4da655',1,'mlx::core::operator!=(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#ga0ac483d85f23252ca8757e9926d5a3c5',1,'mlx::core::operator!=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga3fecba9f3cb9a19afd8ca492cf509ce0',1,'mlx::core::operator!=(T a, const array &amp;b)'],['../group__ops.html#gaebbf1cfde388c7480159a03c92c9a385',1,'mlx::core::operator!=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a164f109bc19c927b2b3bcc47a5021419',1,'mlx::core::operator!=(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#ad2f9e1c230ec35d5c406dd616e8f4dea',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af5899b4d5644682cb0ac2a488f630d55',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a72ac8edd190601d7a46782582cedecd8',1,'mlx::core::operator!=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8084162ba2dd3f9b89195d2bebc3fbb0',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a514263e63f6825b490203ca586864687',1,'mlx::core::operator!=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a1c482bb3d9f9d4c62dee5865892c1f96',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a0030fe7ad09837c670cdfb7d51279519',1,'mlx::core::operator!=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ade3791bc723b8f10fbab22eadb0f705a',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ad78c664f242cd36247c13868547e3dd4',1,'mlx::core::operator!=(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab0743a1a1dcb92d40f41ca42d36f242c',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#ae7a0f810e546a166c7d05849b5d41f30',1,'mlx::core::operator!=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a676a40637a563f013c725d24fa33fdc8',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9fcb662b1561e4136bac0106cfb63b6c',1,'mlx::core::operator!=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abcca7fd43590c4347e0f5df8f134030c',1,'mlx::core::operator!=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af3ede3688a2e3b3ba8cb2da180ffe151',1,'mlx::core::operator!=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a54f48469fabd1414bef5097bcded0002',1,'mlx::core::operator!=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af8c648e892cbc6973de535aa17dc2cfe',1,'mlx::core::operator!=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abc855e1c0584b64d7d995e33211361ab',1,'mlx::core::operator!=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad3684d660d18a54505c759ab286bd936',1,'mlx::core::operator!=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a8afdda14b14262ab5ce0a00c7745d7e8',1,'mlx::core::operator!=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7ccc479be236f2bf3f7725729c5ba201',1,'mlx::core::operator!=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a26a721b8111fce3a1dec9bf724034cd4',1,'mlx::core::operator!=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad5f8c221a53a89e8095aa39fd1f61867',1,'mlx::core::operator!=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a017b52ecf30b33da4aa8da35ccc43220',1,'mlx::core::operator!=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a43c10ca5fb05ee7d0ee63ba56f8a08a3',1,'mlx::core::operator!=(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a81284b6ac737f91a8d1ffbbbbf938fe5',1,'mlx::core::operator!=(uint64_t lhs, _MLX_Float16 
rhs)']]],
+  ['operator_21_3d_11',['operator!=',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afc6e4fc5589bbf30f978f34868dd4e55',1,'operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6baa722c22d66c7510786bb275cb8cc2',1,'operator!=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa8d9f01582a0a9f01a666d110c74db2a',1,'operator!=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa504a474ab6e00ebe2b1b7ed2f7d1ffb',1,'operator!=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abf5f3040227f021a5b84cf2eda248b2f',1,'operator!=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a347c9bbf816bad2e9e5e91aa448f8b65',1,'operator!=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a33ea086b561c652f25833a5e1ded34dd',1,'operator!=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2bbdcece13148826d3fe33af727bb79b',1,'operator!=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeb1efa47c5f22cc0b35d49ccce73c406',1,'operator!=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa6b99cde403405df1865c989e4ce845a',1,'operator!=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a204d13a881ae8d337f6efbb98673790c',1,'operator!=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3602117b4c61d5cd4fd72fb8e5f68bd6',1,'operator!=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2721c088adfc9d73cde442d6badd2a6c',1,'operator!=(uint64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa4364eda56525cf7576ff00e550175e6',1,'mlx::steel::operator!=()'],['../namespacemlx_1_1core.html#a94d00a1b7f8a4717ab3f26f45e4da655',1,'mlx::core::operator!=(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#ga0ac483d85f23252ca8757e9926d5a3c5',1,'mlx::core::operator!=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga3fecba9f3cb9a19afd8ca492cf509ce0',1,'mlx::core::operator!=(T a, const array &amp;b)'],['../group__ops.html#gaebbf1cfde388c7480159a03c92c9a385',1,'mlx::core::operator!=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a164f109bc19c927b2b3bcc47a5021419',1,'mlx::core::operator!=(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#ad2f9e1c230ec35d5c406dd616e8f4dea',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af5899b4d5644682cb0ac2a488f630d55',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a72ac8edd190601d7a46782582cedecd8',1,'mlx::core::operator!=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8084162ba2dd3f9b89195d2bebc3fbb0',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a514263e63f6825b490203ca586864687',1,'mlx::core::operator!=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a1c482bb3d9f9d4c62dee5865892c1f96',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a0030fe7ad09837c670cdfb7d51279519',1,'mlx::core::operator!=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ade3791bc723b8f10fbab22eadb0f705a',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ad78c664f242cd36247c13868547e3dd4',1,'mlx::core::operator!=(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab0743a1a1dcb92d40f41ca42d36f242c',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#ae7a0f810e546a166c7d05849b5d41f30',1,'mlx::core::operator!=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a676a40637a563f013c725d24fa33fdc8',1,'mlx::core::operator!=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9fcb662b1561e4136bac0106cfb63b6c',1,'mlx::core::operator!=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abcca7fd43590c4347e0f5df8f134030c',1,'mlx::core::operator!=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af3ede3688a2e3b3ba8cb2da180ffe151',1,'mlx::core::operator!=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a54f48469fabd1414bef5097bcded0002',1,'mlx::core::operator!=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af8c648e892cbc6973de535aa17dc2cfe',1,'mlx::core::operator!=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abc855e1c0584b64d7d995e33211361ab',1,'mlx::core::operator!=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad3684d660d18a54505c759ab286bd936',1,'mlx::core::operator!=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a8afdda14b14262ab5ce0a00c7745d7e8',1,'mlx::core::operator!=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7ccc479be236f2bf3f7725729c5ba201',1,'mlx::core::operator!=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a26a721b8111fce3a1dec9bf724034cd4',1,'mlx::core::operator!=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad5f8c221a53a89e8095aa39fd1f61867',1,'mlx::core::operator!=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a017b52ecf30b33da4aa8da35ccc43220',1,'mlx::core::operator!=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a43c10ca5fb05ee7d0ee63ba56f8a08a3',1,'mlx::core::operator!=(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a81284b6ac737f91a8d1ffbbbbf938fe5',1,'mlx::core::operator!=(uint64_t lhs, _MLX_Float16 
rhs)']]],
   ['operator_25_12',['operator%',['../backend_2metal_2kernels_2complex_8h.html#aaf53122a07c8eca858b5a8e38ae280e0',1,'operator%():&#160;complex.h'],['../group__ops.html#gab3bfbf82b1e4de7b00bbcf1a2255fbde',1,'mlx::core::operator%(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga50817666f0b82afcbf4a123486af9908',1,'mlx::core::operator%(T a, const array &amp;b)'],['../group__ops.html#ga46c01daa07433542a477d216e13a8480',1,'mlx::core::operator%(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a8723d145dd49021bfcb8e6c99e1c91a5',1,'mlx::core::operator%(complex64_t a, complex64_t b)']]],
   ['operator_26_13',['operator&amp;',['../group__ops.html#gaf0d232de4cbfffda1e2c838f8afdf6ff',1,'mlx::core::operator&amp;(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#a9ee95f97bbd69262d99d7bea3bf77631',1,'mlx::core::operator&amp;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0fefc3ae4f1350ebe05ec6098fd6bae3',1,'mlx::core::operator&amp;(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a1e4cb758ccfe5c267baed9aeb0044834',1,'mlx::core::operator&amp;(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab9d0f9910070231695d61de08cadb930',1,'mlx::core::operator&amp;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a889d401f425db79d1868aa3beea4829b',1,'mlx::core::operator&amp;(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a76dcd1fa3c68b386bc1d1d899a68a120',1,'mlx::core::operator&amp;(uint16_t lhs, _MLX_Float16 rhs)']]],
   ['operator_26_26_14',['operator&amp;&amp;',['../namespacemlx_1_1steel.html#a6353bf11881842e25c46b56f92b7044f',1,'mlx::steel::operator&amp;&amp;()'],['../group__ops.html#gaee1d774bb0843601d7a0a4257d616ae3',1,'mlx::core::operator&amp;&amp;(const array &amp;a, const array &amp;b)']]],
   ['operator_26_3d_15',['operator&amp;=',['../namespacemlx_1_1core.html#a60c263ef46e552c3954688869734b513',1,'mlx::core::operator&amp;=(_MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af9670fc8088339669c54c68b3a320e25',1,'mlx::core::operator&amp;=(_MLX_BFloat16 &amp;lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#ad1f96f0a02024f347b4c4431629407fc',1,'mlx::core::operator&amp;=(_MLX_Float16 &amp;lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae0540f16c4e7bd55d0e86a88495e4967',1,'mlx::core::operator&amp;=(_MLX_Float16 &amp;lhs, uint16_t rhs)']]],
   ['operator_28_29_16',['operator()',['../structpocketfft_1_1detail_1_1_exec_c2_c.html#a4fd637f1a6d335826789af28ac089ecb',1,'pocketfft::detail::ExecC2C::operator()()'],['../structpocketfft_1_1detail_1_1_exec_hartley.html#a67c98b38d12440781053552b9a33bba1',1,'pocketfft::detail::ExecHartley::operator()()'],['../structpocketfft_1_1detail_1_1_exec_dcst.html#a67f4f56e3574c491695f8cb8a1e983d8',1,'pocketfft::detail::ExecDcst::operator()()'],['../structpocketfft_1_1detail_1_1_exec_r2_r.html#acdba1650962714e6afff51e9ca456970',1,'pocketfft::detail::ExecR2R::operator()()'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a0d657bc9a381dca1b5860b9a1b5a5702',1,'mlx::core::detail::Abs::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a564232db7d32811e2ae126c86de104f0',1,'mlx::core::detail::Abs::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a5fac7e6c8277d8706535a52820503c9d',1,'mlx::core::detail::Abs::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#af2c3723e648bd5ed2fe558cc20b7f5eb',1,'mlx::core::detail::Abs::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#a57312cd8530dd0ede3b8037f9c401883',1,'mlx::core::detail::Abs::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_abs.html#ab3b5e3853ed56bfbfa577d965c21112e',1,'mlx::core::detail::Abs::operator()(bool 
x)'],['../structmlx_1_1core_1_1detail_1_1_arc_cos.html#a04b4c9d1fc0160973aa28b1f809b9d51',1,'mlx::core::detail::ArcCos::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_cosh.html#a767d354bec863942822ee0b9b6742a88',1,'mlx::core::detail::ArcCosh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_sin.html#ac69091929815e5317308b4088f5c2f46',1,'mlx::core::detail::ArcSin::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_sinh.html#ac7bf9bac66fef917f75494b2345e6aaf',1,'mlx::core::detail::ArcSinh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tan.html#aee87bf10c278a70ca788085d1b499afe',1,'mlx::core::detail::ArcTan::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tan2.html#a9040b7afcdb4969924aa782fa67f03ac',1,'mlx::core::detail::ArcTan2::operator()()'],['../structmlx_1_1core_1_1detail_1_1_arc_tanh.html#a601e8c52bb938eb3a616756a35419e8b',1,'mlx::core::detail::ArcTanh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a672f65e47d65e4e8d88be252bce0164b',1,'mlx::core::detail::Ceil::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a72a2cab2728fb5e1cc6329a539e5d573',1,'mlx::core::detail::Ceil::operator()(int8_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#aa34590f6a41331be92988558a90dc6fa',1,'mlx::core::detail::Ceil::operator()(int16_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af14120f3dd98f6198ea257d75be223f7',1,'mlx::core::detail::Ceil::operator()(int32_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af263ce7743cf7319387baba616c375b5',1,'mlx::core::detail::Ceil::operator()(int64_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a48f00affcd5c2ea1f81d821e019fec29',1,'mlx::core::detail::Ceil::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#ad4d24a44e8a328948393701dacb0ceac',1,'mlx::core::detail::Ceil::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#a70da19b5c9c69f04b9f196bdf266f93c',1,'mlx::core::detail::Ceil::operator()(uint32_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#af0e7e806b73c664ada837476f9d4d43b',1,'mlx::core::detail::Ceil::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_ceil.html#acc1bfc84a9b91f6e9764234cbe3b9687',1,'mlx::core::detail::Ceil::operator()(bool x)'],['../structmlx_1_1core_1_1detail_1_1_conjugate.html#a7e662d05c6998bd6ced8ad9c187324a5',1,'mlx::core::detail::Conjugate::operator()()'],['../structmlx_1_1core_1_1detail_1_1_cos.html#ad4caef573f9d9071f8945a8efed231ad',1,'mlx::core::detail::Cos::operator()()'],['../structmlx_1_1core_1_1detail_1_1_cosh.html#a63591f49776d9aadc02200036ae38317',1,'mlx::core::detail::Cosh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_erf.html#a168f8ccc6c8053b05dd1a48904ca8fd4',1,'mlx::core::detail::Erf::operator()()'],['../structmlx_1_1core_1_1detail_1_1_erf_inv.html#acc93c0511141404208b35f302f8c1fcb',1,'mlx::core::detail::ErfInv::operator()()'],['../structmlx_1_1core_1_1detail_1_1_exp.html#a0846300cee28315e5b42f74acafbd1a1',1,'mlx::core::detail::Exp::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_exp.html#af247c0d19d59f3310fd0a081eb92cf8b',1,'mlx::core::detail::Exp::operator()(complex64_t x)'],['../structmlx_1_1core_1_1detail_1_1_expm1.html#abf7e61b8387521e9d44334ce88d833a0',1,'mlx::core::detail::Expm1::operator()()'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a16c13cfe736098bffc81d655e172294a',1,'mlx::core::detail::Floor::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a9b6c4c34b6594b8c413abe31f34a73df',1,'mlx::core::detail::Floor::operator()(int8_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#aca4c71204b3ceeca6329f7ea2b041f4c',1,'mlx::core::detail::Floor::operator()(int16_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a3c3ab9e00d1fbd124802517e8c35fe02',1,'mlx::core::detail::Floor::operator()(int32_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a4b5954ffc59c741dd7b86bafda69d5cc',1,'mlx::core::detail::Floor::operator()(int64_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a2e33b10bd5b04551054a87c601440bc7',1,'mlx::core::detail::Floor::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a2500f971100919a694f78669a5e4f438',1,'mlx::core::detail::Floor::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a23df818301d68389e6e12f5a9ec1fbd7',1,'mlx::core::detail::Floor::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#ac988b4f265cf46c68609c9c8787c15fb',1,'mlx::core::detail::Floor::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_floor.html#a7f936e3fd53772bc189d845c73b53202',1,'mlx::core::detail::Floor::operator()(bool x)'],['../structmlx_1_1core_1_1detail_1_1_imag.html#a5bd82e2185f3779e398c179d42a3e782',1,'mlx::core::detail::Imag::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log.html#a0012a4e1744dbe9a28c3b5652be6e1c6',1,'mlx::core::detail::Log::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log2.html#a467bd4c995674721ff5fff6df33aead8',1,'mlx::core::detail::Log2::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log10.html#a2633c5b772bbc9f8b66cffd4a3e01a3f',1,'mlx::core::detail::Log10::operator()()'],['../structmlx_1_1core_1_1detail_1_1_log1p.html#a3220de8c6090c44aa2070b1fbb2dc340',1,'mlx::core::detail::Log1p::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_not.html#a79799668ea5c364b0b4e2bc330e76253',1,'mlx::core::detail::LogicalNot::operator()()'],['../structmlx_1_1core_1_1detail_1_1_negative.html#afc4595c70ef7196df374cf4b2cc5e526',1,'mlx::core::detail::Negative::operator()()'],['../structmlx_1_1core_1_1detail_1_1_real.html#ae84a939fdb5916257a7731cda66d4d61',1,'mlx::core::detail::Real::operator()()'],['../structmlx_1_1core_1_1detail_1_1_round.html#a653f29c059bbfa6192378732a8a23351',1,'mlx::core::detail::Round::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_round.html#a82a984f13568051009e257fe85227da6',1,'mlx::core::detail::Round::operator()(complex64_t 
x)'],['../structmlx_1_1core_1_1detail_1_1_sigmoid.html#a64b72561bfaf758632167f00648f4c89',1,'mlx::core::detail::Sigmoid::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a64ed5013cee7ff18c7fe70bc04737e7b',1,'mlx::core::detail::Sign::operator()(T x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a7106ed1f2f98a365fcb3e6ee39084748',1,'mlx::core::detail::Sign::operator()(uint8_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a7163e8c068dcc460600ed04014dc9945',1,'mlx::core::detail::Sign::operator()(uint16_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#ae8f56c7134721c846240830169424c22',1,'mlx::core::detail::Sign::operator()(uint32_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a10ae519e9a74a327fc72c410e9ab2936',1,'mlx::core::detail::Sign::operator()(uint64_t x)'],['../structmlx_1_1core_1_1detail_1_1_sign.html#a91be4e273f6c7ea5d44cfab380b77603',1,'mlx::core::detail::Sign::operator()(complex64_t x)'],['../structmlx_1_1core_1_1detail_1_1_sin.html#ae95671816529cc2188389af37a2f1a13',1,'mlx::core::detail::Sin::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sinh.html#a9663ddf0fa4c0003576b48f3d5385f00',1,'mlx::core::detail::Sinh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_square.html#a54e9e3c0d0896e142289e8282eab1099',1,'mlx::core::detail::Square::operator()()'],['../structmlx_1_1core_1_1detail_1_1_sqrt.html#aa5a4830b3ef7efab20ea88a110667efd',1,'mlx::core::detail::Sqrt::operator()()'],['../structmlx_1_1core_1_1detail_1_1_rsqrt.html#a9af247be16bab83243038aac54446b79',1,'mlx::core::detail::Rsqrt::operator()()'],['../structmlx_1_1core_1_1detail_1_1_tan.html#aba397cd7ac05bbe06dfa9e3a64bdb05f',1,'mlx::core::detail::Tan::operator()()'],['../structmlx_1_1core_1_1detail_1_1_tanh.html#a1749ba1edfd53095ed7d45c0e53bab61',1,'mlx::core::detail::Tanh::operator()()'],['../structmlx_1_1core_1_1detail_1_1_add.html#a2d6011c35768b5fcd2bb75747b944353',1,'mlx::core::detail::Add::operator()()'],['../structmlx_1_1core_1_1detail_1_1_divide.html#a5e0d22e20
84c4ca81bec0d457a46c662',1,'mlx::core::detail::Divide::operator()()'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a3bdaf1095ad883ecc0fecc455f02cbf3',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a52c3a2ba86fccb24d37d218ae8328954',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a0b0dd6ef5b08585fdf8355770da8d747',1,'mlx::core::detail::Remainder::operator()(T numerator, T denominator)'],['../structmlx_1_1core_1_1detail_1_1_remainder.html#a68fe542084fb94d9a5abd740fe07832b',1,'mlx::core::detail::Remainder::operator()(complex64_t numerator, complex64_t denominator)'],['../structmlx_1_1core_1_1detail_1_1_equal.html#a2994cf1884e7126e76d0a20b215fe3ab',1,'mlx::core::detail::Equal::operator()()'],['../structmlx_1_1core_1_1detail_1_1_na_n_equal.html#a073b20b0d8d41ec8364b7c477421b9bf',1,'mlx::core::detail::NaNEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_greater.html#aa3844c2bae3c7a981739f642aa0dd094',1,'mlx::core::detail::Greater::operator()()'],['../structmlx_1_1core_1_1detail_1_1_greater_equal.html#a3b005f85522ad0e4b57044eed930ac30',1,'mlx::core::detail::GreaterEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_less.html#a0b4032dff1ad2b387745cb000aabdcbb',1,'mlx::core::detail::Less::operator()()'],['../structmlx_1_1core_1_1detail_1_1_less_equal.html#a31e70f8830a07557697541301555a7a7',1,'mlx::core::detail::LessEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_maximum.html#a3eb37abec8426ebc42b8c685075c523a',1,'mlx::core::detail::Maximum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_maximum.html#af99345c7c8bc95ccab1b22c0792ac6fd',1,'mlx::core::detail::Maximum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_minimum.html#afca0861556416a8547dd8574528feb69',1,'mlx::core::detail::Minimum::operator()(T x, T 
y)'],['../structmlx_1_1core_1_1detail_1_1_minimum.html#a64b2eecfbc56aaef7deb939423bac3f8',1,'mlx::core::detail::Minimum::operator()(T x, T y)'],['../structmlx_1_1core_1_1detail_1_1_log_add_exp.html#ad1663fd809acaa4038f90666436599e5',1,'mlx::core::detail::LogAddExp::operator()()'],['../structmlx_1_1core_1_1detail_1_1_multiply.html#a898b090966b047723513224b8d3b22f1',1,'mlx::core::detail::Multiply::operator()()'],['../structmlx_1_1core_1_1detail_1_1_not_equal.html#a23d662b5fd968dc17d3bee2595b5f99d',1,'mlx::core::detail::NotEqual::operator()()'],['../structmlx_1_1core_1_1detail_1_1_power.html#a2c047e1b488e6525447a224975a75db8',1,'mlx::core::detail::Power::operator()(T base, T exp)'],['../structmlx_1_1core_1_1detail_1_1_power.html#a9967db24b8f67d54b6aa3810e274f28c',1,'mlx::core::detail::Power::operator()(T base, T exp)'],['../structmlx_1_1core_1_1detail_1_1_subtract.html#a72ef05830615a2d5d9662926ed82672a',1,'mlx::core::detail::Subtract::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_and.html#a046536c1f2f9367983f052a213d7b7d8',1,'mlx::core::detail::LogicalAnd::operator()()'],['../structmlx_1_1core_1_1detail_1_1_logical_or.html#afb134dbab79307d4ba597843c61d0b1a',1,'mlx::core::detail::LogicalOr::operator()()'],['../structmlx_1_1core_1_1detail_1_1_select.html#a930f9da2e6b3453e04f21382435a2cfb',1,'mlx::core::detail::Select::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_and.html#ae0bed77f95fe2b2f0b594addddd04700',1,'mlx::core::detail::BitwiseAnd::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_or.html#a5ab05734c5000b454975de6647a08d20',1,'mlx::core::detail::BitwiseOr::operator()()'],['../structmlx_1_1core_1_1detail_1_1_bitwise_xor.html#a0989e3bcd064ae06c33f660696a869a0',1,'mlx::core::detail::BitwiseXor::operator()()'],['../structmlx_1_1core_1_1detail_1_1_left_shift.html#a9385f580830a6ad163dd9bb8c4905e7a',1,'mlx::core::detail::LeftShift::operator()()'],['../structmlx_1_1core_1_1detail_1_1_right_shift.html#a154528ba50e89a4c532a181f135b1620'
,1,'mlx::core::detail::RightShift::operator()()'],['../structmlx_1_1core_1_1_default_strided_reduce.html#a024682ab93b84e544a07e3a9c3c51fba',1,'mlx::core::DefaultStridedReduce::operator()()'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#a08144c7a3cdf10af5e47f4575da3694f',1,'mlx::core::DefaultContiguousReduce::operator()()'],['../struct_add.html#ac5c66b63d63a222d3ae0ab8cc7c90eb5',1,'Add::operator()()'],['../struct_floor_divide.html#a2b328e4d768e718fa439f955c524666a',1,'FloorDivide::operator()(T x, T y)'],['../struct_floor_divide.html#afc16a2b2a745225e0bc95640f3fc0219',1,'FloorDivide::operator()(float x, float y)'],['../struct_floor_divide.html#ae91719a15f7e643d552129f476089c6a',1,'FloorDivide::operator()(half x, half y)'],['../struct_floor_divide.html#a4aa9f858626583e02bd79f747229bbca',1,'FloorDivide::operator()(bfloat16_t x, bfloat16_t y)'],['../struct_divide.html#a0a16b9194abc2ab7c61129f81a9bbb3d',1,'Divide::operator()()'],['../struct_remainder.html#ab7875512ff4341c580c6dc372e64fc58',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#a18150b5f4425e30b95ffabc6bb25cede',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#ab3b75f54b56fd357c9755daadb2cafc2',1,'Remainder::operator()(T x, T y)'],['../struct_remainder.html#ae918ce0e246937d4fe04e2ea36e4b2c1',1,'Remainder::operator()(complex64_t x, complex64_t y)'],['../struct_equal.html#aa498087080900d4428ba428a6496a769',1,'Equal::operator()()'],['../struct_na_n_equal.html#a00220898e02db656d21dde9e9354a8dc',1,'NaNEqual::operator()(T x, T y)'],['../struct_na_n_equal.html#a6185e4554dce5b4659d21673c576be51',1,'NaNEqual::operator()(complex64_t x, complex64_t 
y)'],['../struct_greater.html#a98d7d8ee360cd0f469c6eb9a017560f5',1,'Greater::operator()()'],['../struct_greater_equal.html#ae69a3bccc567a46506cf0d296294ce80',1,'GreaterEqual::operator()()'],['../struct_less.html#a5ee0b31b2d9123dc4504f2979a5854d3',1,'Less::operator()()'],['../struct_less_equal.html#ae9f9a1b2eae548977139704f0044acfe',1,'LessEqual::operator()()'],['../struct_log_add_exp.html#ab32417f18e8ff68c15f78aceeb624edf',1,'LogAddExp::operator()()'],['../struct_maximum.html#a3ea0f42bc4cd80b68a98f189f9fa859c',1,'Maximum::operator()(T x, T y)'],['../struct_maximum.html#a0bc8fadc87f2c49fc440d625bfc97ca6',1,'Maximum::operator()(T x, T y)'],['../struct_maximum.html#a907e8793900be5927625377dab199644',1,'Maximum::operator()(complex64_t x, complex64_t y)'],['../struct_minimum.html#aa6113dfac3986c0f571fa53f65c5330e',1,'Minimum::operator()(T x, T y)'],['../struct_minimum.html#a0c939921de87ab9c6959238aac81a059',1,'Minimum::operator()(T x, T y)'],['../struct_minimum.html#a800fba087280f79c2f7e9aff75bed093',1,'Minimum::operator()(complex64_t x, complex64_t y)'],['../struct_multiply.html#a1327fc5a0713931afe997b0d4d2988e0',1,'Multiply::operator()()'],['../struct_not_equal.html#af008d73a5d9cde0b8309b7e8ee7438b2',1,'NotEqual::operator()(T x, T y)'],['../struct_not_equal.html#a14de494cea4e4869351202cad1149f17',1,'NotEqual::operator()(complex64_t x, complex64_t y)'],['../struct_power.html#a2b6df2a9e48155ff9734caca8504a79f',1,'Power::operator()(T base, T exp)'],['../struct_power.html#a36829163d42973034a1f8a7ecc57a1de',1,'Power::operator()(T base, T exp)'],['../struct_power.html#a27cdfb313c4e82b63bdcdaee923cbbef',1,'Power::operator()(complex64_t x, complex64_t 
y)'],['../struct_subtract.html#ae0856cd8d449074ca287baa7e460f68a',1,'Subtract::operator()()'],['../struct_logical_and.html#a8bc6bdabc0ea0678a46e2cf6217cb3a6',1,'LogicalAnd::operator()()'],['../struct_logical_or.html#ade6a931324a604a3119d2220d6f5460d',1,'LogicalOr::operator()()'],['../struct_bitwise_and.html#afb48af090b01dd0200963bc12d842e36',1,'BitwiseAnd::operator()()'],['../struct_bitwise_or.html#a41f847463daafa99ee56f4035578390f',1,'BitwiseOr::operator()()'],['../struct_bitwise_xor.html#a3a3e8a56caab739d40262d9349c9c485',1,'BitwiseXor::operator()()'],['../struct_left_shift.html#aa729747784c38bfdbba34794fcf5175b',1,'LeftShift::operator()()'],['../struct_right_shift.html#a2cc59b400c68342b0e43050431323c17',1,'RightShift::operator()()'],['../struct_arc_tan2.html#ac9b7729753e13be293ab700231d061ac',1,'ArcTan2::operator()()'],['../struct_div_mod.html#a8b5758f2ea18d4c903b462331b25abfe',1,'DivMod::operator()()'],['../struct_cum_prod_3_01bool_01_4.html#ad634be0b139d10ce6d21332eef0d936b',1,'CumProd&lt; bool &gt;::operator()()'],['../struct_cum_max.html#a781b9b955c5412466da6af6c70d73c06',1,'CumMax::operator()()'],['../struct_cum_min.html#ae0b8c3761e04fa538d304ca842281a66',1,'CumMin::operator()()'],['../struct_less_than.html#a2798eb377b411c93a4ed30cf35caade2',1,'LessThan::operator()()'],['../struct_select.html#adb51692aae3038de07dd745891bf9848',1,'Select::operator()()'],['../struct_abs.html#a9e7481dfcc162509769852026ff4a344',1,'Abs::operator()(T x)'],['../struct_abs.html#a0ca113fd036151c443df3f83cc667f28',1,'Abs::operator()(uint8_t x)'],['../struct_abs.html#adaeab32a7e377dc990077ab15f3dc4c2',1,'Abs::operator()(uint16_t x)'],['../struct_abs.html#a99d2a2f37a6cddd3168b0224f2a9b963',1,'Abs::operator()(uint32_t x)'],['../struct_abs.html#ac9cbc02422d930479303f240a7ea6c71',1,'Abs::operator()(uint64_t x)'],['../struct_abs.html#ac30835b27784d451bd2e4524c8eb9e11',1,'Abs::operator()(bool x)'],['../struct_abs.html#ab82917d6b30a2c579e7eb879d305c5fc',1,'Abs::operator()(complex64_t 
x)'],['../struct_arc_cos.html#a5553cecf58511e24e76ac97f2d90b9ac',1,'ArcCos::operator()()'],['../struct_arc_cosh.html#a5c9e7712c14c97298b23ec48e19abc58',1,'ArcCosh::operator()()'],['../struct_arc_sin.html#a0343872f2da93bae2bb0baadf49da022',1,'ArcSin::operator()()'],['../struct_arc_sinh.html#a3066fb7dc7c3180100fb55ff94af6a7a',1,'ArcSinh::operator()()'],['../struct_arc_tan.html#af3a0aec6acec8ae8f5e4c4d5cf8c91ba',1,'ArcTan::operator()()'],['../struct_arc_tanh.html#a37dc3e01ec2830de7e82ed6c6363ac88',1,'ArcTanh::operator()()'],['../struct_ceil.html#a5e2a4ef1b012f5d352064489156e5e44',1,'Ceil::operator()(T x)'],['../struct_ceil.html#a455cd8083ba859993077f2e078ae165b',1,'Ceil::operator()(int8_t x)'],['../struct_ceil.html#a2acb61bc658c7a216795e7f76ebcf98a',1,'Ceil::operator()(int16_t x)'],['../struct_ceil.html#aef8c37f7a8ee3fc80700d605a09891fb',1,'Ceil::operator()(int32_t x)'],['../struct_ceil.html#a93d0110511ad5dd200e12d37a3d7d6e3',1,'Ceil::operator()(int64_t x)'],['../struct_ceil.html#aa335b745fa26e0f443cdb36298105484',1,'Ceil::operator()(uint8_t x)'],['../struct_ceil.html#ade17e13b7f30f5c590fae1581a2013ac',1,'Ceil::operator()(uint16_t x)'],['../struct_ceil.html#a411c75cc35cdc088402e176a1defd22d',1,'Ceil::operator()(uint32_t x)'],['../struct_ceil.html#a9ac660ca29eef7a7429fceb7b917a68a',1,'Ceil::operator()(uint64_t x)'],['../struct_ceil.html#a40de367e62f06ebd7e1330afa93a9ad9',1,'Ceil::operator()(bool x)'],['../struct_cos.html#ae222f8710f6b8254c471ebd475aa5bda',1,'Cos::operator()(T x)'],['../struct_cos.html#a5f26feb1dcc4bec5f59a9ff511c5b163',1,'Cos::operator()(complex64_t x)'],['../struct_cosh.html#a5847ebeebb236fdc926798ddc16475ba',1,'Cosh::operator()(T x)'],['../struct_cosh.html#aefdd91298dac16d528d29ee47e2f7252',1,'Cosh::operator()(complex64_t 
x)'],['../struct_conjugate.html#acb0a2694285f1f57c7654b371ce8cbd8',1,'Conjugate::operator()()'],['../struct_erf.html#a80719402ad7f7d418859a6677d7b604d',1,'Erf::operator()()'],['../struct_erf_inv.html#afbf3668d1a512e889f093a0bc7673309',1,'ErfInv::operator()()'],['../struct_exp.html#a5ef395868e055348c0802fd5fe45669c',1,'Exp::operator()(T x)'],['../struct_exp.html#a2b341ac400c4d145397950eb60734336',1,'Exp::operator()(complex64_t x)'],['../struct_expm1.html#a4b834d42cf0b84daf03fec62c222091a',1,'Expm1::operator()()'],['../struct_floor.html#ace3551f28429081e9f3a3dab0c84212b',1,'Floor::operator()(T x)'],['../struct_floor.html#a10d7fd05b4c224c9f135451246d13014',1,'Floor::operator()(int8_t x)'],['../struct_floor.html#a2865a04a492e3590302f4bd3215a10d7',1,'Floor::operator()(int16_t x)'],['../struct_floor.html#a41012343ff0463ec44b4d06196f41182',1,'Floor::operator()(int32_t x)'],['../struct_floor.html#aae3181d15856796aa0628cf30c92aa2e',1,'Floor::operator()(int64_t x)'],['../struct_floor.html#ac6cf38d82c8e270911afdca4c69ad51b',1,'Floor::operator()(uint8_t x)'],['../struct_floor.html#a78969b9e2b53ae248e72a67259eea5d8',1,'Floor::operator()(uint16_t x)'],['../struct_floor.html#a959009320ed622ed45b39becab1d5b98',1,'Floor::operator()(uint32_t x)'],['../struct_floor.html#a7d04b83c3345cd867315cae2d7ff68ab',1,'Floor::operator()(uint64_t x)'],['../struct_floor.html#abea845fe5e8e6b93bd4bca8717337e0b',1,'Floor::operator()(bool 
x)'],['../struct_imag.html#a3b29e9f8a46c194d683f6a9938314400',1,'Imag::operator()()'],['../struct_log.html#a32a383cb6be06e616a75f23bf49089c3',1,'Log::operator()()'],['../struct_log2.html#ac1e067ecdcbdbffb6106e789c2b98b64',1,'Log2::operator()()'],['../struct_log10.html#ac596a74c1642a00f3eced07ee3334122',1,'Log10::operator()()'],['../struct_log1p.html#a4464c6e7bdbe55ffd7d961c695cd13ce',1,'Log1p::operator()()'],['../struct_logical_not.html#a8a620bac957ab8c09ac85adfddd96708',1,'LogicalNot::operator()()'],['../struct_negative.html#af6879b374314a559faa321e8cce3d710',1,'Negative::operator()()'],['../struct_real.html#a85b9c5b9e65297994fa26ff68e19e809',1,'Real::operator()()'],['../struct_round.html#aa06a0195867e2ceb679c403b6909a1c4',1,'Round::operator()(T x)'],['../struct_round.html#ad3a08f2276ff1033900bc0a7da812655',1,'Round::operator()(complex64_t x)'],['../struct_sigmoid.html#a75a24cd75cb4d4c9a072811b2d70ad55',1,'Sigmoid::operator()()'],['../struct_sign.html#aa3304c6b43bcad53061614b741d8403c',1,'Sign::operator()(T x)'],['../struct_sign.html#ac48992b675b8b28be1e27e1f2ec5d2f7',1,'Sign::operator()(uint32_t x)'],['../struct_sign.html#ae07a4249e1b61419a3b9ca6c337b7bb5',1,'Sign::operator()(complex64_t x)'],['../struct_sin.html#a7caf98c777521fa5d5c6ddaaa3b779fd',1,'Sin::operator()(T x)'],['../struct_sin.html#aa510cf4595b6d49065ab6b602d8fcb14',1,'Sin::operator()(complex64_t x)'],['../struct_sinh.html#a02cf32bcf560657b9ee34fb1affed8e2',1,'Sinh::operator()(T x)'],['../struct_sinh.html#a1f8ba1858d352ee68861cd6ea861af43',1,'Sinh::operator()(complex64_t x)'],['../struct_square.html#afde739fc544e45dd30964c02dca94310',1,'Square::operator()()'],['../struct_sqrt.html#ab9b16d2b9b03a1c54190f4479a56a4ad',1,'Sqrt::operator()()'],['../struct_rsqrt.html#ae16699fd829e40416436247a39233fda',1,'Rsqrt::operator()()'],['../struct_tan.html#a1e6fb8c691621c69cb9bd393de4f6e78',1,'Tan::operator()(T x)'],['../struct_tan.html#a2ef120c9f92b0d2e9cec8389eda05724',1,'Tan::operator()(complex64_t 
x)'],['../struct_tanh.html#adce11a7ad33226c6ecff34f46f5c45d7',1,'Tanh::operator()(T x)'],['../struct_tanh.html#aa8423b43c725bb4b88965a11e8cf20f6',1,'Tanh::operator()(complex64_t x)']]],
-  ['operator_2a_17',['operator*',['../structpocketfft_1_1detail_1_1cmplx.html#a26bf3d709a58f06228e502af6db8e5ac',1,'pocketfft::detail::cmplx::operator*(const T2 &amp;other) const -&gt; cmplx&lt; decltype(r *other)&gt;'],['../structpocketfft_1_1detail_1_1cmplx.html#ad9c591ef8ae976293f207937d273e9a1',1,'pocketfft::detail::cmplx::operator*(const cmplx&lt; T2 &gt; &amp;other) const -&gt; cmplx&lt; decltype(r+other.r)&gt;'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a153756072fda6d3e53bcca11b46a1238',1,'mlx::core::array::ArrayIterator::operator*()'],['../backend_2metal_2kernels_2bf16_8h.html#a8f06316063fc91747533105f256b55b5',1,'operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7b3bce3f6f17089d87e13e91f580a581',1,'operator*(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a54ae7216b82c5cea362f6b83e1df3a9b',1,'operator*(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a852689073c17596de4fb545bc046b380',1,'operator*(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a168300bbd04d8e97c5e4218cb14ae378',1,'operator*(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a6278bd2e0e2805090b33ef666bf7f6bb',1,'operator*(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aecf703522d9ce32dfeefe1e6e903db06',1,'operator*(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7cd44d27fa9a4f13df39894c34fdb348',1,'operator*(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aee64dc1890abb6d1035361cb8c751f96',1,'operator*(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad1a559ab88dbbb4fd2c7509d2c94e55b',1,'operator*(_MLX_BFloat16 lhs, int64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a495ae2d9be5d97c4c6448fc4e50a03e1',1,'operator*(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a87ab4b7a502430da664ccb8abd383058',1,'operator*(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5f997839cf49c24ab594a0dff486a7bc',1,'operator*(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385',1,'operator*(complex64_t a, complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#aa0c2d29950926ae579adf6337fbea64b',1,'mlx::steel::operator*()'],['../group__ops.html#ga26c33f5cdb6fc10d272acd6e208034e0',1,'mlx::core::operator*(const array &amp;a, const array &amp;b)'],['../group__ops.html#gac22a67f7de797b1ae59029843cbdcab6',1,'mlx::core::operator*(T a, const array &amp;b)'],['../group__ops.html#ga6f2369ed5fae8ff9b1528670a004dde2',1,'mlx::core::operator*(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0cc824d6318f97f7058918ab64ddfc25',1,'mlx::core::operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a81e1c727c3fc48910b030cb65a9e7afa',1,'mlx::core::operator*(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a861d948220d8f48d46c68d2ddb16a096',1,'mlx::core::operator*(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13d16561812679b36e68185dc4b2d04d',1,'mlx::core::operator*(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a5287610200ff573730c9c92413f48881',1,'mlx::core::operator*(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a377ccc6b4ef36767abca102dca56dc10',1,'mlx::core::operator*(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a5d696b63635ce6967526d6a410f7f6b1',1,'mlx::core::operator*(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abe90e9527bfa3e1c813d41df4a2372e7',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int32_t 
rhs)'],['../namespacemlx_1_1core.html#a5f14963c77f96bcb5a3bef5661a86ba4',1,'mlx::core::operator*(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#acfb06fe9f5fee01dbb5a2b23bccfd0d3',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#afc9a87f1fccbac05242b91bfbb35c24d',1,'mlx::core::operator*(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0b9678af9b487900cacf6639a4693de0',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad5950619081389e6ed7512f38358d33d',1,'mlx::core::operator*(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a65d25d082374761c05b056e1046d1d4e',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a759191fb984e7737f0ef529c2053ad73',1,'mlx::core::operator*(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3a52675c3d4552b319dd9707844abdec',1,'mlx::core::operator*(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45d67f5d80fba4d42e34c682a8d22beb',1,'mlx::core::operator*(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad25880c67bbcbfafbe54dc16418bf736',1,'mlx::core::operator*(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a63c836e1141e07ae72cee770bad01200',1,'mlx::core::operator*(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a265a37b8ee4a97390213e9ec49693e66',1,'mlx::core::operator*(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab5a457da04dcb157a0b5172c4b2244b6',1,'mlx::core::operator*(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#aa56a8bda08be9ef3711496e216a75c95',1,'mlx::core::operator*(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af89612098dd355b1eefb841c753b36ab',1,'mlx::core::operator*(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4552687a0637f710b5d55bb6378fcabe',1,'mlx::core::operator*(int32_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af69db7def588d7da430434a69456e29c',1,'mlx::core::operator*(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a00af6e5095888f00791ee0ab6d993ad6',1,'mlx::core::operator*(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab48feddc1aa304383e5493923506ad7a',1,'mlx::core::operator*(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a0367b582e85162b4180e086f725e49e9',1,'mlx::core::operator*(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45f0479526fbccdb00bc73ea7f3b7625',1,'mlx::core::operator*(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a394797646010ba9ef2a1f9b9a4b8ddd9',1,'mlx::core::operator*(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acaaa86b59c7ceb2e092ac07f2a75225c',1,'mlx::core::operator*(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a067d47823a322b88043cce7ce4a3ec78',1,'mlx::core::operator*(bfloat16_t lhs, float16_t rhs)']]],
-  ['operator_2a_3d_18',['operator*=',['../structpocketfft_1_1detail_1_1cmplx.html#a683fd490182c9189fa2c05b1823edd93',1,'pocketfft::detail::cmplx::operator*=(T2 other)'],['../structpocketfft_1_1detail_1_1cmplx.html#a06f2c26c6fc4722e61b44da4c242ed87',1,'pocketfft::detail::cmplx::operator*=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419',1,'operator*=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ade65ebca11e38d56408c512df89b99f4',1,'operator*=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af4348ce3425dd99d069e8fdf06e25a3c',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2c3c5f793b3d957d7295d7f1faabebee',1,'operator*=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac66657077d55e94197b52b63acb50b7d',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a383165ea838cc3feeee4d9cf54aa77cc',1,'operator*=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab706af260b61f735b28464877d02137c',1,'operator*=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a979374b1dd4e0eaf602326fa901336d1',1,'operator*=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac815eec2c1b15a47b1c6ea6790e77d24',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8110fae7bcc34a0de5927546b24aa935',1,'operator*=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae4acef3e7ae7dfe359422503f894e885',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, half 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adc268cdbc30500f3009f5de2b2f0f67a',1,'operator*=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08c1f916302eb9d48c93f8b7260538fe',1,'operator*=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adc8e82b8f593b12c6d405e2250ab0f62',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4611728172afea51860a77fdb06cafa0',1,'operator*=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0b8736e2ae24758b6e24ea72668df5b4',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad920df9579603f0b0ee2689eba330617',1,'operator*=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3ff4ff59f411010ac8502cfabda4bd6f',1,'operator*=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abd3d82e2dec1847e97eb8fc3bab2985a',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a738078eb7d5ff94ff48156a555d763a5',1,'operator*=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a435f2f4256aadb1b57fd62bb7f733cf7',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0e4377b120d6305335d296e031ee5b30',1,'operator*=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a917354f77eac26189da8a2f610a00074',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af725f935bfa0405e5ff17ede3ac47283',1,'operator*=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7c56980c234a04260b8b19298085e526',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab840ff9de0cdd0e9afffb8baa2a850a3',1,'operator*=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a73416a7415f3fe31525e33419e5e8aab',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a16978f4b16d954ef4d4cf0f32f6c0b94',1,'operator*=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2179abbc91ce8763e96e39e1917bfa6e',1,'operator*=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab070ea4676d10a10ff3e9379a4068a57',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0197e039d4c65bf49649a6f250c2d436',1,'operator*=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad3565cc6fd1e088d052b1108aa065851',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a711693988c437c2fb4d7da505982fe21',1,'operator*=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7dbf0c75df4817cb4ef8b60c417a89d0',1,'operator*=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a323a80492cd17a49e2c3dd18f8c8b5cc',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adb465776d3868bda0525d632ffc4d129',1,'operator*=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a12a98d71d670b409b8065e0d61672d55',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5d00eb2ec2b0e15b2753d100694c45ae',1,'operator*=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4126fb7ed5bbb27a2332c543cf56a337',1,'operator*=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab092d9790ef20fc0386707530aee89db',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abff1fd2439e31e6e64a3d2fdee3c7821',1,'operator*=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a625dcb133f1f953f263e6200399866c6',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08b6071245513e1726ec68e3b63edc53',1,'operator*=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91',1,'operator*=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3796dcf819adb1ef8152f57ba63ff6b1',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, 
_MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aaab79d0b4c9e9bdc059ace6ec58c5b00',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a0dd3893abc8986901872c8365ab1509d',1,'mlx::core::operator*=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a3cc5c154e4ad9a83ad43da8513146fdc',1,'mlx::core::operator*=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a600e77dbc72e78207b5f5dbf4b298781',1,'mlx::core::operator*=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a54833be1d44bc3adfc9ea218fc3685bd',1,'mlx::core::operator*=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_2b_19',['operator+',['../structpocketfft_1_1detail_1_1cmplx.html#a76447ef141c8732d57421749fc81b236',1,'pocketfft::detail::cmplx::operator+()'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae2adde594b5a4853f6bc78263a957d85',1,'mlx::core::array::ArrayIterator::operator+()'],['../backend_2metal_2kernels_2bf16_8h.html#a09c1a797eb7f43742578680899932f50',1,'operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a551b970f73bb4a3b287653021d000b60',1,'operator+(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a43a225e7e548bb041f3a5d844faaf0da',1,'operator+(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8b6c3fd9d068a2159084359df8b9b449',1,'operator+(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0a5bfe15d95ba540795f4c25ebfa4f07',1,'operator+(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa415ce182fe7582d885fe633fc3527ce',1,'operator+(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a62f891b7dbba0000749cf338f594bedb',1,'operator+(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab43932322f81bf322aa1b0deeee9a987',1,'operator+(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acd15d46ea5827a2a39898ccbb8352eb8',1,'operator+(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a006763fae6e0577fc168ec9446f0f747',1,'operator+(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a12a47e8ac0be788edff57ae0a96d7830',1,'operator+(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af87dfa2122e9c76042dc41fb7f338a87',1,'operator+(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af2737d09c887ee8cd43fdeabceddbe82',1,'operator+(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189',1,'operator+(complex64_t a, complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a12ff4f38aa8474bf76770c7b8e3e18cb',1,'mlx::steel::operator+()'],['../group__ops.html#ga26e5a043eaaaf066d1400adac9c11d0c',1,'mlx::core::operator+(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7d0ec8d01e7cefa6a6b25f11876761b5',1,'mlx::core::operator+(T a, const array &amp;b)'],['../group__ops.html#ga7cc080a4f9d4a667f2099aa0dbfefadd',1,'mlx::core::operator+(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac14b984970cafd8fbe24d080949515cc',1,'mlx::core::operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab076069c6f0047c548a8dc29d35dd36a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aab9d96b0a168f4d05146000a6212b5d8',1,'mlx::core::operator+(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac4e6f03d7e4ae701b4eefa784f36185b',1,'mlx::core::operator+(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a4cabd600a5271b0d416c91e8d31dd9c1',1,'mlx::core::operator+(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af26df9dc279d71b7cc10892c72162b58',1,'mlx::core::operator+(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#ac3b97eecec9bd8efb313f8f201560343',1,'mlx::core::operator+(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2e3bb121cbde30c2e6d806df0d41ff59',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac87ecce4b44b0826e666a169ddc6f878',1,'mlx::core::operator+(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aed3d9cd32698ef0fe65b1280f103b3f5',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint32_t 
rhs)'],['../namespacemlx_1_1core.html#a6fa13b9359cf3f575fbda5260e6e035d',1,'mlx::core::operator+(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af240a6471ff827819192808bffeb857a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ac25a05679f312b724c406d8b282803c9',1,'mlx::core::operator+(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a54863a54f258acf2b5c734950618e4e1',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9f81f5ea8909db9660197217612ee446',1,'mlx::core::operator+(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13e26c38da0a4e332e0ae4eb0aed9cb8',1,'mlx::core::operator+(const std::complex&lt; float &gt; &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a59bb13a0bb7f748c8de34415b248bc57',1,'mlx::core::operator+(const complex64_t &amp;x, const std::complex&lt; float &gt; &amp;y)'],['../namespacemlx_1_1core.html#a38a44c412c8be4c8b952d3082cc7db74',1,'mlx::core::operator+(const complex64_t &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a011dbdbd2413e59e744cf82b05431340',1,'mlx::core::operator+(bool x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a230e3b7c479add1b171fa0aaa3a8b13c',1,'mlx::core::operator+(const complex64_t &amp;x, bool y)'],['../namespacemlx_1_1core.html#a3a6f43c2485f0d42293184f1aecbeaee',1,'mlx::core::operator+(uint32_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a766157c5d5d00fdf3da95eb7cb2981b9',1,'mlx::core::operator+(const complex64_t &amp;x, uint32_t y)'],['../namespacemlx_1_1core.html#a64dceec2bb03eee963a2a1bc1ac69284',1,'mlx::core::operator+(uint64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#ae36badb78a17cd7d13663a69645fc328',1,'mlx::core::operator+(const complex64_t &amp;x, uint64_t y)'],['../namespacemlx_1_1core.html#ac1afa5d4c856e4b58109eff086e70ffd',1,'mlx::core::operator+(int32_t x, const complex64_t 
&amp;y)'],['../namespacemlx_1_1core.html#a8978def3c2cfe2a96314d564613b80db',1,'mlx::core::operator+(const complex64_t &amp;x, int32_t y)'],['../namespacemlx_1_1core.html#a5b8af5ca4c0e37aba0b7530542bd64c2',1,'mlx::core::operator+(int64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a3eaa72850205c18450c3af9a01cda219',1,'mlx::core::operator+(const complex64_t &amp;x, int64_t y)'],['../namespacemlx_1_1core.html#ad38b38a3faf050735d45eed4438ee27a',1,'mlx::core::operator+(float16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a358e66ff205bda3e8542427b6d2edadc',1,'mlx::core::operator+(const complex64_t &amp;x, float16_t y)'],['../namespacemlx_1_1core.html#af56d4b85e329e39a825c01a50e3a2522',1,'mlx::core::operator+(bfloat16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a806a495a129ebaab69cc57ca7db831d6',1,'mlx::core::operator+(const complex64_t &amp;x, bfloat16_t y)'],['../namespacemlx_1_1core.html#a09fc6ebda917969383783a112a8547e7',1,'mlx::core::operator+(float x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a7ed0e2cdb65612f54e67166762cb6408',1,'mlx::core::operator+(const complex64_t &amp;x, float y)'],['../namespacemlx_1_1core.html#af7577c91b8c43682f0ebc9eb9758aae4',1,'mlx::core::operator+(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#abe36af9951afd8dd3ffe90ceedeb7f2b',1,'mlx::core::operator+(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#afb9f780dd056a4f975518f71a3b021ee',1,'mlx::core::operator+(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6a8e093b24c4c789b7cd160f7e7f7de9',1,'mlx::core::operator+(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#af3a603690fd3de9e4f7f2035a4d25621',1,'mlx::core::operator+(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afa2a4bccfeea9688ac922cb638341511',1,'mlx::core::operator+(_MLX_Float16 lhs, bool 
rhs)'],['../namespacemlx_1_1core.html#a6111e94d51de12391e5d68b765f28fc3',1,'mlx::core::operator+(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7c7dd6d346e0cdf398a896f2c6958258',1,'mlx::core::operator+(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a00872a443f462b0ae0a30c84fb001bc0',1,'mlx::core::operator+(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4f5d80d03bae6d8d90455d3c47a8c116',1,'mlx::core::operator+(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a78f1f388f9d81ed93f60311f4645d8d0',1,'mlx::core::operator+(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aa43e1d6958c5d5a6fa9a625a1660e741',1,'mlx::core::operator+(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ae877e1d5e3cf57734da8b49535fe3fb3',1,'mlx::core::operator+(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a9a5ae769f67f886d59c8e292a8218550',1,'mlx::core::operator+(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a058878237ce50baa4c909d8d15448d7e',1,'mlx::core::operator+(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a95fd207028f125eefbafe9e0522407fe',1,'mlx::core::operator+(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#abc6425a3fbb386f5ea5964b42507e989',1,'mlx::core::operator+(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2a_17',['operator*',['../structpocketfft_1_1detail_1_1cmplx.html#a26bf3d709a58f06228e502af6db8e5ac',1,'pocketfft::detail::cmplx::operator*(const T2 &amp;other) const -&gt; cmplx&lt; decltype(r *other)&gt;'],['../structpocketfft_1_1detail_1_1cmplx.html#ad9c591ef8ae976293f207937d273e9a1',1,'pocketfft::detail::cmplx::operator*(const cmplx&lt; T2 &gt; &amp;other) const -&gt; cmplx&lt; decltype(r+other.r)&gt;'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a153756072fda6d3e53bcca11b46a1238',1,'mlx::core::array::ArrayIterator::operator*()'],['../backend_2metal_2kernels_2complex_8h.html#a681d4fb076973f58f7dac894ec62a385',1,'operator*(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8f06316063fc91747533105f256b55b5',1,'operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7b3bce3f6f17089d87e13e91f580a581',1,'operator*(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a54ae7216b82c5cea362f6b83e1df3a9b',1,'operator*(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a852689073c17596de4fb545bc046b380',1,'operator*(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a168300bbd04d8e97c5e4218cb14ae378',1,'operator*(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6278bd2e0e2805090b33ef666bf7f6bb',1,'operator*(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aecf703522d9ce32dfeefe1e6e903db06',1,'operator*(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7cd44d27fa9a4f13df39894c34fdb348',1,'operator*(_MLX_BFloat16 lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee64dc1890abb6d1035361cb8c751f96',1,'operator*(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1a559ab88dbbb4fd2c7509d2c94e55b',1,'operator*(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a495ae2d9be5d97c4c6448fc4e50a03e1',1,'operator*(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a87ab4b7a502430da664ccb8abd383058',1,'operator*(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5f997839cf49c24ab594a0dff486a7bc',1,'operator*(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa0c2d29950926ae579adf6337fbea64b',1,'mlx::steel::operator*()'],['../group__ops.html#ga26c33f5cdb6fc10d272acd6e208034e0',1,'mlx::core::operator*(const array &amp;a, const array &amp;b)'],['../group__ops.html#gac22a67f7de797b1ae59029843cbdcab6',1,'mlx::core::operator*(T a, const array &amp;b)'],['../group__ops.html#ga6f2369ed5fae8ff9b1528670a004dde2',1,'mlx::core::operator*(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0cc824d6318f97f7058918ab64ddfc25',1,'mlx::core::operator*(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a81e1c727c3fc48910b030cb65a9e7afa',1,'mlx::core::operator*(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a861d948220d8f48d46c68d2ddb16a096',1,'mlx::core::operator*(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13d16561812679b36e68185dc4b2d04d',1,'mlx::core::operator*(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a5287610200ff573730c9c92413f48881',1,'mlx::core::operator*(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a377ccc6b4ef36767abca102dca56dc10',1,'mlx::core::operator*(_MLX_BFloat16 lhs, bool 
rhs)'],['../namespacemlx_1_1core.html#a5d696b63635ce6967526d6a410f7f6b1',1,'mlx::core::operator*(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abe90e9527bfa3e1c813d41df4a2372e7',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a5f14963c77f96bcb5a3bef5661a86ba4',1,'mlx::core::operator*(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#acfb06fe9f5fee01dbb5a2b23bccfd0d3',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#afc9a87f1fccbac05242b91bfbb35c24d',1,'mlx::core::operator*(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0b9678af9b487900cacf6639a4693de0',1,'mlx::core::operator*(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad5950619081389e6ed7512f38358d33d',1,'mlx::core::operator*(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a65d25d082374761c05b056e1046d1d4e',1,'mlx::core::operator*(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a759191fb984e7737f0ef529c2053ad73',1,'mlx::core::operator*(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3a52675c3d4552b319dd9707844abdec',1,'mlx::core::operator*(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45d67f5d80fba4d42e34c682a8d22beb',1,'mlx::core::operator*(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad25880c67bbcbfafbe54dc16418bf736',1,'mlx::core::operator*(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a63c836e1141e07ae72cee770bad01200',1,'mlx::core::operator*(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a265a37b8ee4a97390213e9ec49693e66',1,'mlx::core::operator*(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab5a457da04dcb157a0b5172c4b2244b6',1,'mlx::core::operator*(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#aa56a8bda08be9ef3711496e216a75c95',1,'mlx::core::operator*(bool lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af89612098dd355b1eefb841c753b36ab',1,'mlx::core::operator*(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4552687a0637f710b5d55bb6378fcabe',1,'mlx::core::operator*(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af69db7def588d7da430434a69456e29c',1,'mlx::core::operator*(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a00af6e5095888f00791ee0ab6d993ad6',1,'mlx::core::operator*(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab48feddc1aa304383e5493923506ad7a',1,'mlx::core::operator*(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a0367b582e85162b4180e086f725e49e9',1,'mlx::core::operator*(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a45f0479526fbccdb00bc73ea7f3b7625',1,'mlx::core::operator*(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a394797646010ba9ef2a1f9b9a4b8ddd9',1,'mlx::core::operator*(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acaaa86b59c7ceb2e092ac07f2a75225c',1,'mlx::core::operator*(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a067d47823a322b88043cce7ce4a3ec78',1,'mlx::core::operator*(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2a_3d_18',['operator*=',['../structpocketfft_1_1detail_1_1cmplx.html#a683fd490182c9189fa2c05b1823edd93',1,'pocketfft::detail::cmplx::operator*=(T2 other)'],['../structpocketfft_1_1detail_1_1cmplx.html#a06f2c26c6fc4722e61b44da4c242ed87',1,'pocketfft::detail::cmplx::operator*=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7232b0a0e193b3c6172d6fc2578bf419',1,'operator*=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ade65ebca11e38d56408c512df89b99f4',1,'operator*=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af4348ce3425dd99d069e8fdf06e25a3c',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2c3c5f793b3d957d7295d7f1faabebee',1,'operator*=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac66657077d55e94197b52b63acb50b7d',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a383165ea838cc3feeee4d9cf54aa77cc',1,'operator*=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab706af260b61f735b28464877d02137c',1,'operator*=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a979374b1dd4e0eaf602326fa901336d1',1,'operator*=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac815eec2c1b15a47b1c6ea6790e77d24',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8110fae7bcc34a0de5927546b24aa935',1,'operator*=(thread half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae4acef3e7ae7dfe359422503f894e885',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc268cdbc30500f3009f5de2b2f0f67a',1,'operator*=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a81f65b04a87a25c7eb1a751d1be9fa55',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08c1f916302eb9d48c93f8b7260538fe',1,'operator*=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc8e82b8f593b12c6d405e2250ab0f62',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4611728172afea51860a77fdb06cafa0',1,'operator*=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0b8736e2ae24758b6e24ea72668df5b4',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad920df9579603f0b0ee2689eba330617',1,'operator*=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae97ab6c3ddcc2754b24f86319a5398be',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3ff4ff59f411010ac8502cfabda4bd6f',1,'operator*=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abd3d82e2dec1847e97eb8fc3bab2985a',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a738078eb7d5ff94ff48156a555d763a5',1,'operator*=(thread int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a435f2f4256aadb1b57fd62bb7f733cf7',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0e4377b120d6305335d296e031ee5b30',1,'operator*=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a917354f77eac26189da8a2f610a00074',1,'operator*=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af725f935bfa0405e5ff17ede3ac47283',1,'operator*=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7c56980c234a04260b8b19298085e526',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab840ff9de0cdd0e9afffb8baa2a850a3',1,'operator*=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a73416a7415f3fe31525e33419e5e8aab',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a16978f4b16d954ef4d4cf0f32f6c0b94',1,'operator*=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a99aa4cc110d1c7aa3b4c8c5cbf9235b7',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2179abbc91ce8763e96e39e1917bfa6e',1,'operator*=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab070ea4676d10a10ff3e9379a4068a57',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0197e039d4c65bf49649a6f250c2d436',1,'operator*=(thread uint16_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad3565cc6fd1e088d052b1108aa065851',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a711693988c437c2fb4d7da505982fe21',1,'operator*=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeff4c28986f98c23de1df17043edb0f5',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7dbf0c75df4817cb4ef8b60c417a89d0',1,'operator*=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a323a80492cd17a49e2c3dd18f8c8b5cc',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adb465776d3868bda0525d632ffc4d129',1,'operator*=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a12a98d71d670b409b8065e0d61672d55',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5d00eb2ec2b0e15b2753d100694c45ae',1,'operator*=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1a2a683ff40490226eb1371fb905023d',1,'operator*=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4126fb7ed5bbb27a2332c543cf56a337',1,'operator*=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab092d9790ef20fc0386707530aee89db',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abff1fd2439e31e6e64a3d2fdee3c7821',1,'operator*=(thread uint64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a625dcb133f1f953f263e6200399866c6',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08b6071245513e1726ec68e3b63edc53',1,'operator*=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a13aa79165ec87710e977f33fe0361e91',1,'operator*=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3796dcf819adb1ef8152f57ba63ff6b1',1,'operator*=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aaab79d0b4c9e9bdc059ace6ec58c5b00',1,'operator*=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a0dd3893abc8986901872c8365ab1509d',1,'mlx::core::operator*=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a3cc5c154e4ad9a83ad43da8513146fdc',1,'mlx::core::operator*=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a600e77dbc72e78207b5f5dbf4b298781',1,'mlx::core::operator*=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a54833be1d44bc3adfc9ea218fc3685bd',1,'mlx::core::operator*=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_2b_19',['operator+',['../structpocketfft_1_1detail_1_1cmplx.html#a76447ef141c8732d57421749fc81b236',1,'pocketfft::detail::cmplx::operator+()'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae2adde594b5a4853f6bc78263a957d85',1,'mlx::core::array::ArrayIterator::operator+()'],['../backend_2metal_2kernels_2complex_8h.html#ad6af5c6c5ed4898b49758618e5aee189',1,'operator+(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a09c1a797eb7f43742578680899932f50',1,'operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a551b970f73bb4a3b287653021d000b60',1,'operator+(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a43a225e7e548bb041f3a5d844faaf0da',1,'operator+(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8b6c3fd9d068a2159084359df8b9b449',1,'operator+(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0a5bfe15d95ba540795f4c25ebfa4f07',1,'operator+(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa415ce182fe7582d885fe633fc3527ce',1,'operator+(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a62f891b7dbba0000749cf338f594bedb',1,'operator+(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab43932322f81bf322aa1b0deeee9a987',1,'operator+(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acd15d46ea5827a2a39898ccbb8352eb8',1,'operator+(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a006763fae6e0577fc168ec9446f0f747',1,'operator+(_MLX_BFloat16 lhs, int64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a12a47e8ac0be788edff57ae0a96d7830',1,'operator+(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af87dfa2122e9c76042dc41fb7f338a87',1,'operator+(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af2737d09c887ee8cd43fdeabceddbe82',1,'operator+(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a12ff4f38aa8474bf76770c7b8e3e18cb',1,'mlx::steel::operator+()'],['../group__ops.html#ga26e5a043eaaaf066d1400adac9c11d0c',1,'mlx::core::operator+(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7d0ec8d01e7cefa6a6b25f11876761b5',1,'mlx::core::operator+(T a, const array &amp;b)'],['../group__ops.html#ga7cc080a4f9d4a667f2099aa0dbfefadd',1,'mlx::core::operator+(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac14b984970cafd8fbe24d080949515cc',1,'mlx::core::operator+(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab076069c6f0047c548a8dc29d35dd36a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aab9d96b0a168f4d05146000a6212b5d8',1,'mlx::core::operator+(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac4e6f03d7e4ae701b4eefa784f36185b',1,'mlx::core::operator+(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a4cabd600a5271b0d416c91e8d31dd9c1',1,'mlx::core::operator+(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af26df9dc279d71b7cc10892c72162b58',1,'mlx::core::operator+(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#ac3b97eecec9bd8efb313f8f201560343',1,'mlx::core::operator+(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2e3bb121cbde30c2e6d806df0d41ff59',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac87ecce4b44b0826e666a169ddc6f878',1,'mlx::core::operator+(int32_t lhs, 
_MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aed3d9cd32698ef0fe65b1280f103b3f5',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6fa13b9359cf3f575fbda5260e6e035d',1,'mlx::core::operator+(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af240a6471ff827819192808bffeb857a',1,'mlx::core::operator+(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ac25a05679f312b724c406d8b282803c9',1,'mlx::core::operator+(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a54863a54f258acf2b5c734950618e4e1',1,'mlx::core::operator+(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a9f81f5ea8909db9660197217612ee446',1,'mlx::core::operator+(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a13e26c38da0a4e332e0ae4eb0aed9cb8',1,'mlx::core::operator+(const std::complex&lt; float &gt; &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a59bb13a0bb7f748c8de34415b248bc57',1,'mlx::core::operator+(const complex64_t &amp;x, const std::complex&lt; float &gt; &amp;y)'],['../namespacemlx_1_1core.html#a38a44c412c8be4c8b952d3082cc7db74',1,'mlx::core::operator+(const complex64_t &amp;x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a011dbdbd2413e59e744cf82b05431340',1,'mlx::core::operator+(bool x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a230e3b7c479add1b171fa0aaa3a8b13c',1,'mlx::core::operator+(const complex64_t &amp;x, bool y)'],['../namespacemlx_1_1core.html#a3a6f43c2485f0d42293184f1aecbeaee',1,'mlx::core::operator+(uint32_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a766157c5d5d00fdf3da95eb7cb2981b9',1,'mlx::core::operator+(const complex64_t &amp;x, uint32_t y)'],['../namespacemlx_1_1core.html#a64dceec2bb03eee963a2a1bc1ac69284',1,'mlx::core::operator+(uint64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#ae36badb78a17cd7d13663a69645fc328',1,'mlx::core::operator+(const complex64_t &amp;x, uint64_t 
y)'],['../namespacemlx_1_1core.html#ac1afa5d4c856e4b58109eff086e70ffd',1,'mlx::core::operator+(int32_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a8978def3c2cfe2a96314d564613b80db',1,'mlx::core::operator+(const complex64_t &amp;x, int32_t y)'],['../namespacemlx_1_1core.html#a5b8af5ca4c0e37aba0b7530542bd64c2',1,'mlx::core::operator+(int64_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a3eaa72850205c18450c3af9a01cda219',1,'mlx::core::operator+(const complex64_t &amp;x, int64_t y)'],['../namespacemlx_1_1core.html#ad38b38a3faf050735d45eed4438ee27a',1,'mlx::core::operator+(float16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a358e66ff205bda3e8542427b6d2edadc',1,'mlx::core::operator+(const complex64_t &amp;x, float16_t y)'],['../namespacemlx_1_1core.html#af56d4b85e329e39a825c01a50e3a2522',1,'mlx::core::operator+(bfloat16_t x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a806a495a129ebaab69cc57ca7db831d6',1,'mlx::core::operator+(const complex64_t &amp;x, bfloat16_t y)'],['../namespacemlx_1_1core.html#a09fc6ebda917969383783a112a8547e7',1,'mlx::core::operator+(float x, const complex64_t &amp;y)'],['../namespacemlx_1_1core.html#a7ed0e2cdb65612f54e67166762cb6408',1,'mlx::core::operator+(const complex64_t &amp;x, float y)'],['../namespacemlx_1_1core.html#af7577c91b8c43682f0ebc9eb9758aae4',1,'mlx::core::operator+(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#abe36af9951afd8dd3ffe90ceedeb7f2b',1,'mlx::core::operator+(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#afb9f780dd056a4f975518f71a3b021ee',1,'mlx::core::operator+(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6a8e093b24c4c789b7cd160f7e7f7de9',1,'mlx::core::operator+(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#af3a603690fd3de9e4f7f2035a4d25621',1,'mlx::core::operator+(double lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#afa2a4bccfeea9688ac922cb638341511',1,'mlx::core::operator+(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a6111e94d51de12391e5d68b765f28fc3',1,'mlx::core::operator+(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7c7dd6d346e0cdf398a896f2c6958258',1,'mlx::core::operator+(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a00872a443f462b0ae0a30c84fb001bc0',1,'mlx::core::operator+(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4f5d80d03bae6d8d90455d3c47a8c116',1,'mlx::core::operator+(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a78f1f388f9d81ed93f60311f4645d8d0',1,'mlx::core::operator+(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aa43e1d6958c5d5a6fa9a625a1660e741',1,'mlx::core::operator+(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ae877e1d5e3cf57734da8b49535fe3fb3',1,'mlx::core::operator+(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a9a5ae769f67f886d59c8e292a8218550',1,'mlx::core::operator+(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a058878237ce50baa4c909d8d15448d7e',1,'mlx::core::operator+(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a95fd207028f125eefbafe9e0522407fe',1,'mlx::core::operator+(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#abc6425a3fbb386f5ea5964b42507e989',1,'mlx::core::operator+(bfloat16_t lhs, float16_t rhs)']]],
   ['operator_2b_2b_20',['operator++',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a3efe69356a84d0d4438f033992fcbd9d',1,'mlx::core::array::ArrayIterator']]],
-  ['operator_2b_3d_21',['operator+=',['../structpocketfft_1_1detail_1_1cmplx.html#ad4e69dcd89bdb7764c9c5807168f911e',1,'pocketfft::detail::cmplx::operator+=(const cmplx &amp;other)'],['../structpocketfft_1_1detail_1_1cmplx.html#affa618d8850a7c232793b7c61db6d184',1,'pocketfft::detail::cmplx::operator+=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400',1,'operator+=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a251780ac4592cc2b1a543e417ff57770',1,'operator+=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a24381d991c2d570aa953694f396a69b5',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7595740d4cc12924905d6bd1b99ee4da',1,'operator+=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac1498acb8c3623b5f412f70ab6a6528b',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abce5ab327110c164f054b43ed47f79a0',1,'operator+=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae0c70198e236ffe1a98f79987c686419',1,'operator+=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a13b3338935440ae51ecc4a356093efc5',1,'operator+=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5a0cb8544b4ebd2906ba8e7f2868e8de',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7b134429ea0c8493800ff8b465410f9c',1,'operator+=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4154f90ab7857ca856f9e15fe1bf5acf',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, half 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab9ae6a51e2027b02cac9966e05f3ba68',1,'operator+=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab93ce536eb7998bee00de4af868e31a9',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad0ae9e2b4874f991a2c853e1c1fe735d',1,'operator+=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a194a6670cc25ade35a24b566f31af785',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3d0d689516c99003659c5d026847bd2e',1,'operator+=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a007f58508b98bb79e5c323ed0dec89b6',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa7198e580e2a83c1fd01a4b6fdf86a80',1,'operator+=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a15573fefd880adefbba079b1c1bd8082',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a104cf94cb9e359d1b6ef92ced2ce0c27',1,'operator+=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa04cfcb52191fd23205a1a3572b46ae0',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad684bc2ae1a2a627cd3e4a4c641e2d77',1,'operator+=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad1e28448e35f4934075b397c34ba3d66',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8ad16afd7f1711de83c0cec5af868f76',1,'operator+=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac45e9ca0c7155caebe3d0f7261518077',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3c62ac679d6aa515144d40ebafe4a188',1,'operator+=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9ff5ab3aef1057fa083b53a65c8aba03',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae74bb0a3c12cd1a23f3d29ce307d6fb1',1,'operator+=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac188bd19f236b098d603b0d8acd08921',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aef9fa600d107b509f2e3df7d6b080e01',1,'operator+=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af5713afb3a62967a02c3c20661951ee4',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7f1b84352a3ed6171444a43da1fc7e92',1,'operator+=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af1983edd26245e6e51c6e47354095e32',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8cd55d1a579540eb450e12a8a8a950be',1,'operator+=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a588ef0f7e03f306758524d378278976f',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a74751abec7086f85f4f26ced44f1ca1f',1,'operator+=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4dd3cf0e5aa116ff330352a50c18cde7',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afb9a0e18c0e40c77e6143fb7d84ebfba',1,'operator+=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adf0cfd9a608a6fb3d57933e32e7d81d2',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4bd92db6c8b9b5dc96332c7ae3eff8c7',1,'operator+=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5d628a5bc4fa755610392f47a523a1f1',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7c790442f77f2437b482c4a55e224fc3',1,'operator+=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a77bab4481b41be50297b257e95058706',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7816a97d16b1d2f8a90227bb1da2f6ac',1,'operator+=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac244d140c6149726ea44174d3e836ca3',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af802541c4c65ee4442acd495de4d27fe',1,'operator+=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac06eb2fea47a09a8a8abdaa1aa9b4603',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5263b2463fecdc97f9521d00bffea059',1,'operator+=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b',1,'operator+=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aee1bdf0ab2e445293708b476e8cfde3b',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, 
_MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a190e27077f0fba642a86f5c8f488bcc2',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a9f2c9d2f21fbf9fbbacd940c6967c9d1',1,'mlx::core::operator+=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a0b1b3c48afc0a785282e43435bba8418',1,'mlx::core::operator+=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7b763db8194e6fcb1b87eab143dfa47a',1,'mlx::core::operator+=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a827167f6a1ae55428fd218ddd51ec3b6',1,'mlx::core::operator+=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_2d_22',['operator-',['../structpocketfft_1_1detail_1_1cmplx.html#a460da5db36d1c72fb1ed3496fd3abde4',1,'pocketfft::detail::cmplx::operator-()'],['../backend_2metal_2kernels_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855',1,'operator-(_MLX_BFloat16 x):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85',1,'operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a891aa4bf46c20a26a55061736aba25f1',1,'operator-(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7ad7ff44a3200853711869f7a577d931',1,'operator-(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af69ef8f1d8ecae0e6f755bf1c46cf075',1,'operator-(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5bd875a54b79b2dcedf674807c3e53c5',1,'operator-(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab02f8646b47806e1d2038f248df03f06',1,'operator-(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab27b26182c7c6e08af37e6d511fd9253',1,'operator-(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5868c85c988ec3432cf86d7df40e464d',1,'operator-(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad03ef47e6cc7521bbfb45740dee20f88',1,'operator-(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab789f8a400512ff27e36b3373170f0c5',1,'operator-(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7f601b22ecc480132d82ad782e5363bf',1,'operator-(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a152366ab4e2ccc867e919af6c74ced91',1,'operator-(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a42bead8ef0beb9f3452128d64cd4df9d',1,'operator-(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b',1,'operator-(complex64_t x):&#160;complex.h'],['../backend_2metal_2kernels_2complex_8h.html#af5608264cf920688607059b4e8cd3117',1,'operator-(complex64_t a, complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#aca8ef21c16984ccb329b3bd0c1e4be48',1,'mlx::steel::operator-()'],['../group__ops.html#gade2eea48989f4caaf36e89f7bd2a8816',1,'mlx::core::operator-(const array &amp;a)'],['../group__ops.html#ga0c7f3cb36d4ca516c7a33142f88b9181',1,'mlx::core::operator-(const array &amp;a, const array &amp;b)'],['../group__ops.html#gae68d3d0691ba951501218e98439f3465',1,'mlx::core::operator-(T a, const array &amp;b)'],['../group__ops.html#gaf5e5d882c51ad0a0ea315c274d5439b2',1,'mlx::core::operator-(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a622ce842fe44e4b6a95e03242341b459',1,'mlx::core::operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af32a99d930d49e9b178472d7a65531ab',1,'mlx::core::operator-(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3555a2b31fc0925850d3240e85e03ec5',1,'mlx::core::operator-(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a46080889fd9e5c3f9916508e97dff5ad',1,'mlx::core::operator-(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a9ca27fd1e512c8ed126342e565da12ae',1,'mlx::core::operator-(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3803f8d36558d32bb7dd6e580ea683b4',1,'mlx::core::operator-(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#af5d865528989ca66b3d357e5ce4e0300',1,'mlx::core::operator-(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#afb784b960f55aeb4edd7f567fa74d443',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int32_t 
rhs)'],['../namespacemlx_1_1core.html#a29cbacf4b399c24728fb0808fad498f9',1,'mlx::core::operator-(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aececc0e451237aa6c0d1a2c3d828c86e',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a383a26cc2689c98fd6c4435ade8dc669',1,'mlx::core::operator-(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad6311ef8df59bdfb212b5cf8169246b2',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a23b7329bc1c93c8ac0a1f576565fefb0',1,'mlx::core::operator-(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad8d650bf63998abd716ee0ca28e1cbb9',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a7339b33201254e9119d99d3a728ded72',1,'mlx::core::operator-(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a064318b7a16e5cb6d0a6407501b5c7dc',1,'mlx::core::operator-(_MLX_BFloat16 lhs)'],['../namespacemlx_1_1core.html#a7bae3ff296d9a60ff3c7e448f7fbc6bd',1,'mlx::core::operator-(const complex64_t &amp;v)'],['../namespacemlx_1_1core.html#afb5069ecebdfd9d388c26f83df12c93c',1,'mlx::core::operator-(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d126e3f3fa9f8c1c1ae1b09f94df487',1,'mlx::core::operator-(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad04f1ccd2cd7c487a2f2aaa055939f64',1,'mlx::core::operator-(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a15eb2ea76508ff823fa0591e811d0b7d',1,'mlx::core::operator-(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a96d9577db38d6809d022893e32feeda1',1,'mlx::core::operator-(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5d9c02765c1672930757416411567bf2',1,'mlx::core::operator-(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a6105d3b5266666b7c6bb9469285a9ec3',1,'mlx::core::operator-(bool lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#a777aa772dfb205b25d26f3180d98a2f6',1,'mlx::core::operator-(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a085eb092f4ada47f8169de62886cff90',1,'mlx::core::operator-(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab25e5d211e2c8785b45c3a81a6282e2b',1,'mlx::core::operator-(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#abf5d09561a81b0f0b32d59d77e32e16f',1,'mlx::core::operator-(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4ce6867dbb4d1631d1870dac14022dbb',1,'mlx::core::operator-(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a8a049e646e0442064cfe9e202d7047c5',1,'mlx::core::operator-(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a78e2a1cfc65453185bcca13bd4f523cf',1,'mlx::core::operator-(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af143cf68673e06390d4bb2ec2892bd22',1,'mlx::core::operator-(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a46d502dfe0b027955950d4e716c2eb26',1,'mlx::core::operator-(_MLX_Float16 lhs)'],['../namespacemlx_1_1core.html#a2631e78c6f0a602f6754ac577ec75f83',1,'mlx::core::operator-(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a73d79cbd75d543d0837b8a51bf103f9e',1,'mlx::core::operator-(bfloat16_t lhs, float16_t rhs)']]],
-  ['operator_2d_3d_23',['operator-=',['../structpocketfft_1_1detail_1_1cmplx.html#a12441ff423274bd1b54245933d69ad7e',1,'pocketfft::detail::cmplx::operator-=()'],['../backend_2metal_2kernels_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca',1,'operator-=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac2f1e1f2365cfa531b1519aa9ff67695',1,'operator-=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a513501355a5912a1263fd8b10864142b',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab4f4ecd62c3d8b3363d02019573dc9f1',1,'operator-=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a92d1348f201d78fcd474f75d5b23ef68',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3eefe9a7f5fb226335ea687012f32d5c',1,'operator-=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aef62c7e3e494b6a511a7833c0d942a60',1,'operator-=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad30726cc8b69fd300d33c2a46e123c28',1,'operator-=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8859b5b8dc241e4f58243c85d2630cc8',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7003e1e5881e3d106257f22b6a3e59fe',1,'operator-=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3165e37d393be50c2cfa9ddcba153684',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a76f5bd895b7214cbc3cea3440992718a',1,'operator-=(threadgroup half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7167343d90eb70e5a0d5fa9ec5398e94',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9b31c363ebc93d592b6fa0e27b00335a',1,'operator-=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a967a1d7b5664f616e5b6f2d257367f0c',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aff19193e1b2cee29a8737318e95cc74a',1,'operator-=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aede0cc4179507b739849948f1a2fed4b',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7e1a6056f9c96f3c89fe204dbf103be5',1,'operator-=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9d06cceea5c179bcc608452188bd7d6a',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0aa9ffe056f49fda181bbacbd60556ea',1,'operator-=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ada5685d99c2d6708d1c4ef826d68e879',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a726cecf778b8584b6f7c37db1b064576',1,'operator-=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3816a35f8468156d59c239256c12dcf3',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa332fae098e7c6dc23b98bc0026f1070',1,'operator-=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afb3cd302e0b78902c62111dce4494fe8',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abb884888f14086cc674657677cb4b8bc',1,'operator-=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a38bb89f925eca4f9c042f6ee7a2c0193',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac30c580713f354916088a7dc049ae4cd',1,'operator-=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a715c824ee8c87e0256114a85624d9949',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7bc91aaaf476a37063264d1d53d862cc',1,'operator-=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab155f418f15cabd86ff942c6f9472ddb',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aaa66dc6d7b2c5efbfaa97ca9c7872bd8',1,'operator-=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a696978d9401e09200045b2d8aad045c2',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae998d8f423a9fb73405cfbd4b836bc72',1,'operator-=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a279d09ab8542f1c1a8dc8173b65946b6',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a491dadfae957cd7cc0c36188d910f6f6',1,'operator-=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9a837c3b9c4e42f53d7cd1ed0d266e2f',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acf7af2284269544064b68e807064bba4',1,'operator-=(device uint32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a28d297705e29009197418546ef435393',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a948579a4d9ba276523190b03b09578fb',1,'operator-=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5a4b98a0a11db5b77cf9168df37c8bc7',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a31a3d8f2ff8038f7e0d717845c039808',1,'operator-=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1dac193d9f1c8c0eb4473441895f8c58',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad2817d53fdd4b112babfb6f0b38c8f39',1,'operator-=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa705d87cf4b78e9d7c6b07dd0c66cac6',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a542affc376726840647a6e93acf2c1a7',1,'operator-=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#add18cfe4c0d38e95c6dff6bab3e7a932',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab1de7e7e7304ff3598925d2e69134764',1,'operator-=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c',1,'operator-=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adda64cae388baac1f138b06dc8595237',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af20874a61c6c3f4c3fd045a96e806644',1,'operator-=(threadgroup _MLX_BFloat16 
&amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a8b8a55690df46d97fcfc2a60120783af',1,'mlx::core::operator-=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab03949b1f60fa035ce454a894cd73ae9',1,'mlx::core::operator-=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adaf70bbfb3667df0d08fd3c99896e20a',1,'mlx::core::operator-=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a321c98e5a78621d3c9a3895f707f2f1c',1,'mlx::core::operator-=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_2d_3e_24',['operator-&gt;',['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966',1,'mlx::core::metal::CommandEncoder']]],
-  ['operator_2f_25',['operator/',['../backend_2metal_2kernels_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c',1,'operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aacaedf12f862c76457133336dd6fc446',1,'operator/(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a584a513596de20663dad951a5b81695e',1,'operator/(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad8f7b11669736fbd6ed2e28211d877d4',1,'operator/(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a59515695ebc48844345fa5120511aed1',1,'operator/(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a8c8ac6736440fdca366ebdefe2a12b9f',1,'operator/(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad6859b04680d0d26d75fd6c4dd74ee24',1,'operator/(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4720cc79ab2b8e39952ea9ef20e51250',1,'operator/(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a72d10ec0e62949247da129eb3a83fb9b',1,'operator/(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad6399ba2b8708899739b4cdbb44add8d',1,'operator/(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a998b1ba877a606aedf722ab46b290403',1,'operator/(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa3277ae33976c70f7bd937ddff027b72',1,'operator/(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa708a970a200822c99c0489f389469fa',1,'operator/(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35',1,'operator/(complex64_t a, complex64_t 
b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a6bde717aca2051499f73a3eee199bfdd',1,'mlx::steel::operator/()'],['../group__ops.html#gaeedf77f722b394429f1a7f6c367883bf',1,'mlx::core::operator/(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7366ec7f453be2a4dc449f0faa1bf554',1,'mlx::core::operator/(double a, const array &amp;b)'],['../group__ops.html#gadfb324ae9b4feb2c7ea0ac6ade639f38',1,'mlx::core::operator/(const array &amp;a, double b)'],['../namespacemlx_1_1core.html#a7573ac3b93ddecd69e9c88a26fc84ba9',1,'mlx::core::operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a40e868dad70401d9aa9ee9c32235c315',1,'mlx::core::operator/(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a7587c28fbd2023b134e5fc12bb0dde23',1,'mlx::core::operator/(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a92cdd377c408becf4cf83c1ee9b7085d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef89566301cb133d98c8e7bdd2b7bec6',1,'mlx::core::operator/(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a94e7b51185590492b46916685641276f',1,'mlx::core::operator/(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a04584788c08180835219d0ea1e2b97b1',1,'mlx::core::operator/(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad5af96e2ff09d207eb1e1980fe3e7c2d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac2217bf760038cd011781158923149ed',1,'mlx::core::operator/(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aea414c04bddc4b9b609262e97398f1b4',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a27fe23230cd082c0363b9451b731ce6b',1,'mlx::core::operator/(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abdd9bb8fb4411e5924f3eb7ef1bb52f8',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a50bae338a7353f8b0ed3441071bb0cf6',1,'mlx::core::operator/(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aab26a3284dd3ac7d47c8b5b3a3290ce3',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a749f48db01de38f259a0c6750a97fa77',1,'mlx::core::operator/(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a32a6a08a2a4652975b0a1bd1fcf3eafd',1,'mlx::core::operator/(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4b66fb38ddc5cc0c2489583d5c499602',1,'mlx::core::operator/(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a45726f1905b709cf8253e6efa046027b',1,'mlx::core::operator/(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afd4170c1e364384f30e6bae341146fa6',1,'mlx::core::operator/(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef85739d150b9d5609973da8a3f1086a',1,'mlx::core::operator/(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af52a941f8ed9b25eec91402c7b9e281f',1,'mlx::core::operator/(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a477cade78296bc85894170f62db68870',1,'mlx::core::operator/(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a22f5a2257e11423fc2fe18e2dce91590',1,'mlx::core::operator/(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a640d3574dfe6ad934c720ae8bdd78bfa',1,'mlx::core::operator/(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6f65d8fd0cdddc96fc01f6af95804873',1,'mlx::core::operator/(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a517019d42d4e426b7b98e1c719bb47ce',1,'mlx::core::operator/(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0beb7a223c542015a4eff4aed814a9dd',1,'mlx::core::operator/(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#abc9b1bd5018d46514bc19d23db2e5063',1,'mlx::core::operator/(int64_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af22937df654ddbd6e398ef12764d18c0',1,'mlx::core::operator/(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a775aed5f49b530c57e71cbac81404d45',1,'mlx::core::operator/(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a97efcd96d6be666e5608034ae77289ef',1,'mlx::core::operator/(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a899851f85dbddd96f9d36319b82542a0',1,'mlx::core::operator/(bfloat16_t lhs, float16_t rhs)']]],
-  ['operator_2f_3d_26',['operator/=',['../backend_2metal_2kernels_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095',1,'operator/=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a90a1c5130db515db48624d8587edbb91',1,'operator/=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a65f30a2dc199134e35bc7c5d431b2263',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7172d84db640e6c49dff0d08dd64b53e',1,'operator/=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acf7cb9927bf09022088401923f2e1916',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a86b2a001cbec0d3a8d762a3c7ff47b0b',1,'operator/=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a744f72ba83522fe3cc2a49a007b42543',1,'operator/=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a77c678665b34df7652dcde053ca73185',1,'operator/=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae0614b6b199d8a65ae95d4621b118b82',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa846fde89c7d2d18b18ef180a8a9c8a3',1,'operator/=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08e778be18e4a291c108fcc528b981d3',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a6b9e49ad9ea256d2d0220c0d81552602',1,'operator/=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab933bc3cdf9adfea10ab9dba5292c812',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int16_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a25e7c5d2ecf3375756d59074f333858f',1,'operator/=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4ae4a80fde67eea9a0a37b2803946544',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a912393b7208fa45bd1e87f30b218b68b',1,'operator/=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a18963246f2b640874bef6dca7049f64d',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0e2c2c2cb50b3a55ff213f18978aca35',1,'operator/=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a64f1136b17006f168ef837e17240814f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae46d75b8046d557452d74513f1106710',1,'operator/=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08d2460e259b9106d90d889481ad60d5',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0f7fd418408806ef498745c6fdb2c062',1,'operator/=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac703495cb370b52526a5a2d36ae26038',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4ca11d43174baf0a729f93b35eabcbea',1,'operator/=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9f835a0a80c411580c97b65fdc5bdfd3',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a17f47ec9cff60f8e1b3477a2793b7ac0',1,'operator/=(device int64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5be23e296bbed3a885586a6424b1666e',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afba39221eb54e272aae79910b3cd7ef5',1,'operator/=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac057d95a2bf087575584aa6f9a2c6bf5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab986ae2cec780a1f494b7b4468b7ba11',1,'operator/=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a44522c2304c6396bbe6b9d32000f4b6f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aef8e7e499ea9d432aa743d83c076f945',1,'operator/=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3a0a3edbf1ba2314551454059c3f422b',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acb9f0aef9fbdfde8a4f46e33b0d6c52f',1,'operator/=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a303dfcc81ffd355f866f863d7d9f0fa5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a359edd4bcb8776861ceb26a3005624c0',1,'operator/=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#adc9f32cc6f40768df4285fba2e4783c7',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae71f66d814a03f6377c9d86cf0a2b5d7',1,'operator/=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad0125b6baba3065a87a174ec27aa9a61',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5cc74ad3e522d7104e6e2117751151ad',1,'operator/=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab3b594321fb42b0c2da99954d1e0976c',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4a0023e2fd08875156cd6ef747fbb5cd',1,'operator/=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a4358ee606e66ba2081fcf94f9c3b5915',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ad1e7ef6f065695d4b1d017547b60ef62',1,'operator/=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a284dfc702f0f67b9c233b87162eeabdd',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab8f211ea896fc5190004f3ad6ad8932f',1,'operator/=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7e1bcf3bc06cbcbc304c0cdf729802bc',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abbe42648a46092137b303ccd08f7df86',1,'operator/=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c',1,'operator/=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a94686039356dfa9aa45608a8b0562fdc',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa251d6483d3b099d1b5311fbe6f0bce2',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a045ff27257cb6d8ab7a94771ba5a17e6',1,'mlx::core::operator/=(_MLX_BFloat16 &amp;lhs, 
const float &amp;rhs)'],['../namespacemlx_1_1core.html#a58112951a56a0f9f8c90b60fe74f9508',1,'mlx::core::operator/=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae736defc89a04fbaf7627ad2695bb838',1,'mlx::core::operator/=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab1f260710251256ef737dd59be9e143c',1,'mlx::core::operator/=(float &amp;lhs, _MLX_Float16 rhs)']]],
-  ['operator_3c_27',['operator&lt;',['../backend_2metal_2kernels_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25',1,'operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aab02c65bc38ea66335b2192ead4095a8',1,'operator&lt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae91686513e284bcc9635833744bbdda1',1,'operator&lt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2486f3b5de85b0d57f458d8f21f82b42',1,'operator&lt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a435a2aec4c777b4b184ff5d24992e8a1',1,'operator&lt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abdd04257e6a73883b5f56f1186d0e906',1,'operator&lt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a69984aaa05ae1d4fccccf7f57e8ecb4a',1,'operator&lt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a501cc01d5bf15d9f03aa28545f9624ea',1,'operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1b029e4ca72125a5f9471f582c819705',1,'operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0736a76f56578d26ba1422dc8b744a18',1,'operator&lt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a24b1fa8998c892f90f8dde7c34fb10a5',1,'operator&lt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#af80ff2020ec2c4b406c5fdae3fe55e63',1,'operator&lt;(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac03f6eefb836373d37dc280b0d813d78',1,'operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058',1,'operator&lt;(complex64_t a, complex64_t 
b):&#160;complex.h'],['../namespacemlx_1_1steel.html#adb5f24b57d98214fc215a06475f21412',1,'mlx::steel::operator&lt;()'],['../group__ops.html#gaee41e2b8f61d563200ff03575ac1d6c3',1,'mlx::core::operator&lt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga1ef8ea11cf15ce628c54201fa42748ef',1,'mlx::core::operator&lt;(T a, const array &amp;b)'],['../group__ops.html#ga95e72226dc7a79c40b3d16f990922050',1,'mlx::core::operator&lt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a987d631e1508e8df55d98ddd57e4d086',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad3fb46370cd8f0992866fad9e2c64a3c',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3026691bf7ee5095243a8611bf3411aa',1,'mlx::core::operator&lt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0d42d6c1d5f77a96e2f296b8ebd79ee6',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab5ce08a7de0a0ca00d61f7a7f8ea3ab4',1,'mlx::core::operator&lt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abce8b7f24b61e5ec0f9a3afe20845caf',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#aff97612627ae1ed260c43c0a7af0d306',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a9119e518234df7923cae2b3802d59bf2',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#aefb9b05ce8864ada99a920ab32017b89',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abc55f3676c2d112a6e9ab276bd6b1796',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#afe6581a2c45f24d7fab1e4006c1e3c70',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aca1d50cdd9506481dcc4cd1ad4a4f734',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#a310720f513b6a2490e9df80c65f1bfb3',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a29e457a170b6cefb6ba1e394c96c6f7b',1,'mlx::core::operator&lt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#afd4519985b6b207ec41ad8530d1036df',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae1e41ca94022e43a00cdfc5845102daa',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac80f4022bffd95b57526685ce8e1cbc1',1,'mlx::core::operator&lt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3a8f6f0af477788c4f0aa98abfc5f1ab',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a3728ed9b6cbd152bf675251a0501b466',1,'mlx::core::operator&lt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5b9ad811a5e1358100c5423dd70ea387',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a5c77e1db83995d3e06a8a26265bce5d6',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab8a0a3f70664049b35ce1887bd8ff5c2',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6652d93bfb2d426e261a1712a181a4d2',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03758b8d13da2de07cc4f4fc45d2854b',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a325161b81a9ff179fd37d949780a17ba',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a92eca79fce8233e4299343eee3996511',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#adb016662b8f7eb680abfe1a421eabe72',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_3c_3c_28',['operator&lt;&lt;',['../group__ops.html#gad656c30f9fd7d9467e405657b325aa7e',1,'mlx::core::operator&lt;&lt;(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#a1e5c30e316afa30c14bc48b92afdb794',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Device &amp;d)'],['../namespacemlx_1_1core.html#a4ddd07021b36c848d6fb1dd9ac276822',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a0023c267cf81345fad65e7a797954cd3',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Dtype &amp;d)'],['../namespacemlx_1_1core.html#a1fd58658474fb842d648dcf8f7d9f078',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Dtype::Kind &amp;k)'],['../namespacemlx_1_1core.html#a123331f01188bd76e37623b63b6b4340',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, array a)'],['../namespacemlx_1_1core.html#a4e733bba89760abed32393e085812b22',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; int &gt; &amp;v)'],['../namespacemlx_1_1core.html#a6276bb9bad43ed4a27a1e2c3f5bfd990',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; size_t &gt; &amp;v)'],['../namespacemlx_1_1core.html#a5e5bd5c57b1cf19776bdb41e732861d9',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; int64_t &gt; &amp;v)'],['../namespacemlx_1_1core.html#a42a19c8442b173606e714364227e7d45',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const complex64_t &amp;v)'],['../namespacemlx_1_1core.html#a57eb97a5eba99a846ac429795e407574',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const float16_t &amp;v)'],['../namespacemlx_1_1core.html#a7db909d54cf07375e89424c32c07a29c',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const bfloat16_t &amp;v)']]],
-  ['operator_3c_3d_29',['operator&lt;=',['../backend_2metal_2kernels_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05',1,'operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a5a81eae168dfafd299c2b94e3e8558cf',1,'operator&lt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0f486bf02c6ad5b9b6a96d3450f03e47',1,'operator&lt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#acba9efe192d22b7781b4622103c7a944',1,'operator&lt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aff100489cc40ad276c2d5d67a9df67db',1,'operator&lt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a7eac96f64ca42991caf819c8e8c8d2bc',1,'operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a88c11cd37600de5480570da3d2ae5732',1,'operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a08c7d12a0d16565fbf052dba2db8b22d',1,'operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2b9de9624c0a507b4ead85f898ad9daf',1,'operator&lt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a28f8d21c5eef047c701cf690ce9c2ef0',1,'operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a14b56c687053ee2432398a25663c068f',1,'operator&lt;=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0f360806708b95a3be400af0b8871b57',1,'operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a80d288f22cadfdf5e904410349e616a1',1,'operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0',1,'operator&lt;=(complex64_t a, 
complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a6cc3bab5e7f6e7c719c82afa90ad2827',1,'mlx::steel::operator&lt;=()'],['../group__ops.html#ga4c8b8a1632944acaae50f0de6c23ece6',1,'mlx::core::operator&lt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga150a9be467c9f91482a6d6fc13504bc4',1,'mlx::core::operator&lt;=(T a, const array &amp;b)'],['../group__ops.html#ga624eeccef0cc4b130e1325abfea057cb',1,'mlx::core::operator&lt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0066a47cb21223ddebc77992ee874fb9',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2593dbace3ce50e7146d9514726a543f',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a88654bcf6c9728517a2933ca2e29a7c1',1,'mlx::core::operator&lt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a5d4f449e9c1699b99fcf894dd15e8af3',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a6b678bea8fdcda1f11c6691b56a15211',1,'mlx::core::operator&lt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae8aacc606ea16f018a90eae758830a35',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a25668dea4ffb51c7c00eeecb9530d1d8',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a084558b6a5487549799c49c37c9e9652',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ade2e2a0daa79d5c52f278f85f03dde2e',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a750a2d2b4976ad94b08994d081f83445',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ade5a175ff45347689ac4c798d04c8ffc',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#ae25e0c01b46612f039313a4825ba6428',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a5c90f16d8f6edf4b75c96b945b9fa591',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8cd6583fa0fc9957f993e00b2ec01d91',1,'mlx::core::operator&lt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a012130a0458cbc30b88365e0e0eab232',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae8c890bdcffadee8c5dab85c907f57eb',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a43cb070553c1f2fffb32ef6670e30980',1,'mlx::core::operator&lt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ac759b7798d668a99535e59e26d6ba192',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a70e528a789b5660d98e783b045aaa379',1,'mlx::core::operator&lt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a40bd8abb8a4d989ddabbb298518bd7f5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4155d4b0c76f37ab5e0b54f9cd683f35',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad8bb648d0603a206e0392990c911ca0b',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ace72a5853f2afd6510dcb97d54fa650d',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab38f7a0d3c0809071ff5d3af859018d6',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a7904b886d7b535a6af0a885d00597323',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a57952168bd0b54c2677204d4ab1cb6e5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#a6235dc5f4db517618bb3449b08c96e8b',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_3d_30',['operator=',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a027b84cddc8d476f736ac1f1a9991fe4',1,'mlx::core::allocator::Allocator::operator=(const Allocator &amp;other)=delete'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a2e971b47339b1d0849a334a902a9df3c',1,'mlx::core::allocator::Allocator::operator=(Allocator &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1array.html#a8acf2b4c75f9b7f79da6675dbc36cf36',1,'mlx::core::array::operator=(const array &amp;other) &amp;&amp;=delete'],['../classmlx_1_1core_1_1array.html#a5c89c2406a610b32943955f9a5060fbd',1,'mlx::core::array::operator=(array &amp;&amp;other) &amp;&amp;=delete'],['../classmlx_1_1core_1_1array.html#ad3277ff68f1336aa217f9cbe40181479',1,'mlx::core::array::operator=(array &amp;&amp;other) &amp;=default'],['../classmlx_1_1core_1_1array.html#a5da41aabecf4c8055b7515341bf57147',1,'mlx::core::array::operator=(const array &amp;other) &amp;'],['../structmlx_1_1core_1_1array_1_1_data.html#a68e9417954fe811b5e41e6317a526748',1,'mlx::core::array::Data::operator=()'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e',1,'mlx::core::metal::CommandEncoder::operator=()'],['../classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73',1,'mlx::core::metal::Device::operator=()'],['../classmlx_1_1core_1_1metal_1_1_residency_set.html#aef97dbbc755940789f99a26164591c45',1,'mlx::core::metal::ResidencySet::operator=()'],['../classmlx_1_1core_1_1_primitive.html#a6b1be7ea92f3a7bb19875c70259dad6b',1,'mlx::core::Primitive::operator=(const Primitive &amp;other)=delete'],['../classmlx_1_1core_1_1_primitive.html#a50bbddd43e1ba0cf5f127cd7aa756a9e',1,'mlx::core::Primitive::operator=(Primitive &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#a0a859309a4f192f2679e07f2e4ff4d22',1,'mlx::core::UnaryPrimitive::operator=(const UnaryPrimitive 
&amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#ab90b2ea80f1d914be03cf44def5db5a5',1,'mlx::core::UnaryPrimitive::operator=(UnaryPrimitive &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ab170dbd2ce34c51e2eeebf5d08e7e2db',1,'mlx::core::scheduler::Scheduler::operator=(const Scheduler &amp;)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a035ea35f4dd8ee985973080f14029379',1,'mlx::core::scheduler::Scheduler::operator=(Scheduler &amp;&amp;)=delete'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#a0f65b0523b8ddd989f338da6cb2860e3',1,'mlx::core::_MLX_BFloat16::operator=(std::vector&lt; bool &gt;::reference x)'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#abb8cd44ee22b17c55333ff2eb4e13a14',1,'mlx::core::_MLX_BFloat16::operator=(const float &amp;x)'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a608a099bf7116ee608dcfd31ea3ade2c',1,'mlx::core::_MLX_Float16::operator=(std::vector&lt; bool &gt;::reference x)'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a35543c3653d477c46350697fb808373d',1,'mlx::core::_MLX_Float16::operator=(const float &amp;x)']]],
-  ['operator_3d_3d_31',['operator==',['../backend_2metal_2kernels_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065',1,'operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a0aa3bfcfab53700488e5f386e6de60d5',1,'operator==(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3936148781ab1c4f33f58d12c116f370',1,'operator==(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae753526b669fba27771089dc809abd66',1,'operator==(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a05a4f197a71d0f16879032f44492bb79',1,'operator==(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae86f5917847b1ec9f313996250f2e0be',1,'operator==(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aab74ec4d33a64b92b908717d500f1ecf',1,'operator==(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac30a2c1fa6f172af903fdeb6a8632606',1,'operator==(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab4e9ad547aa23daa351075e0ecc58fa2',1,'operator==(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa5fa1a8f2b39c3508fe38205469756d1',1,'operator==(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aeadc1f36c6bdc219294ce9341d80afa5',1,'operator==(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a3ae2091ada1e39e857fbc53c97bdb79f',1,'operator==(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac7b4d295f3c7b1e09964f24f306422da',1,'operator==(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190',1,'operator==(complex64_t a, complex64_t 
b):&#160;complex.h'],['../namespacemlx_1_1steel.html#abcc797f27e87e857b41c1a8d33ee2c78',1,'mlx::steel::operator==()'],['../namespacemlx_1_1core.html#a937503d72b66c661bf3f5fdcd98ef97c',1,'mlx::core::operator==(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#gaa30cf69f3d22f65615f5e1696dd5703f',1,'mlx::core::operator==(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf115782d009ac2a547fcca395c9ec797',1,'mlx::core::operator==(T a, const array &amp;b)'],['../group__ops.html#ga3ad3ed7aece2650943a35082dbe3a0a5',1,'mlx::core::operator==(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac470f937a379d6356c8f567c97cd7481',1,'mlx::core::operator==(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#aec63a0472cb943fe39f31e7678555572',1,'mlx::core::operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad05311ca8e2f19ffe5849e963837cec7',1,'mlx::core::operator==(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aaaf591cb2188381e6cbd857132d04eb7',1,'mlx::core::operator==(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7ef33c33509ccccf1ab217500e8b3c1a',1,'mlx::core::operator==(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abec4200a718b7c5ed80b7abcc4447260',1,'mlx::core::operator==(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad853981b1c5ba69b07d54c7b77055d22',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a752d6cb4172a9cb91e5da19582329c6d',1,'mlx::core::operator==(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0175beb3de139faa08479a88215b35ea',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a61da2851cb3beeef28049228346c28b5',1,'mlx::core::operator==(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aa24713cb9e39bacb516c992eb03d2b2b',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a6d565dd93c46259f9486d9fdf0969589',1,'mlx::core::operator==(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a24e79a82557861de64dad66d36e6ff30',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af27d515ac390d62bd852b73ea759a947',1,'mlx::core::operator==(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae3e1e8b7a5410e0edf35f31f74295e2f',1,'mlx::core::operator==(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aaa22230a66b15c3e774d8ce45783a746',1,'mlx::core::operator==(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ae2a0bcdc171d7e9745d33e1d9aac4f8a',1,'mlx::core::operator==(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a331ec62442a8d3eb8ccba7b4de5168d1',1,'mlx::core::operator==(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#acfcaefe0990eb3533e2b11a6f2657492',1,'mlx::core::operator==(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d48dbd49cccff07777affb2a412058c',1,'mlx::core::operator==(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a88eae27edd22fa4418776672023cb276',1,'mlx::core::operator==(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a188b363f633ea360407b3f9cf4e1f1a6',1,'mlx::core::operator==(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ae065fe5c42c1a333d7858d19f6434fa9',1,'mlx::core::operator==(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a2f98db199deb6d7a82551fa4afec655a',1,'mlx::core::operator==(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a85f83add412cb320b5cd1c3da6aadbd5',1,'mlx::core::operator==(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7e2cee66c3ca1b56f4f3d7fd1d6e0be1',1,'mlx::core::operator==(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#ad436557da5c7fea71fc58182a876cfe5',1,'mlx::core::operator==(uint64_t lhs, _MLX_Float16 
rhs)']]],
-  ['operator_3e_32',['operator&gt;',['../backend_2metal_2kernels_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57',1,'operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ab9cd098786d2f4c855c42e4a6f30ab3e',1,'operator&gt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a55600f3b9859e2891e0e0b5690867b72',1,'operator&gt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#afd7cdb8ed2a9820efe9cf322c06f188c',1,'operator&gt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a31bbdbe0b62b90a4d6ea4bb0a7db586b',1,'operator&gt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a68125e66f74eaffe5ea9267638ce870d',1,'operator&gt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac89eb6b29edad8cca63727ab97171c29',1,'operator&gt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a74e477567c9477c2cf0684f81ef4498f',1,'operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2d37130b6fd79b425f5ba92b65e36bed',1,'operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a41d55d167e9dc63bf29d15e0ff004869',1,'operator&gt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aa95f9ebfdab3c5f524775651362ce914',1,'operator&gt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2826bd301bb5393473ccd363f2052c0d',1,'operator&gt;(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a62a512d0edd894759c69f724b970fbdb',1,'operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995',1,'operator&gt;(complex64_t a, complex64_t 
b):&#160;complex.h'],['../namespacemlx_1_1steel.html#a7512eadda6160e4c9d9e6aa4049fac20',1,'mlx::steel::operator&gt;()'],['../group__ops.html#ga74fd2777adef10e6fe628a9cdadb01cb',1,'mlx::core::operator&gt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga32e106e794e2c32e4e7decee2df2477f',1,'mlx::core::operator&gt;(T a, const array &amp;b)'],['../group__ops.html#ga96552b90e89923c5d2064cc427775ec5',1,'mlx::core::operator&gt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#aedc4e9df4bf71c0ac34fcfae60cdf550',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a14c188303d09b97867bcfd34519aa4a6',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac97736fadafa7efa201624d0e1128ee8',1,'mlx::core::operator&gt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3c41a304126bc225bdc68062d1eb6e7e',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab594f3ae1ee13227fae940fef0d00cb9',1,'mlx::core::operator&gt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a01dabc077a872c115a9a9ccd95f1acec',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#adabbd8768d216873617768249473a5c7',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adae1b14669d27ce1fe0c214771c07b77',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ab03a22961d99fa12d3e74b3116e94e8f',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a42011a27a3d23a60be5be44ee7cac87c',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a50f6a94bb36d89cf28817aff88ab89c8',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac173de50ee57b1b066d49363ba978c53',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#ab09f1b4879aa3190c2f66c9bd1224021',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a91eb6ca854217424129a55ae95a123b5',1,'mlx::core::operator&gt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a58d5795d8312599d101ae16f194e4a2a',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aafa3bbeda78610c4285f3e57042268f3',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a8a928d76a6fbf3d336296401e14617a4',1,'mlx::core::operator&gt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ade2f9222fd433cd4d673c6182f256235',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ae24c337810c841ff23e327efde7045e1',1,'mlx::core::operator&gt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acf401ede354fcc998b13ea6442994d7e',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a2bb28a9a0894a73ae1b27e7f4da0841a',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a09d631e8a85fd7ae72e1a868b8f9b9cb',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a49421ea65b5a98df080d75b1636b2157',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a692ce931b660415e17f92d18a8e0d446',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a579bb87b3ede5663d7cd68c7c0f6fb9e',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af810587a17e692f4eec256d3c3cd27de',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a50f4177d3ca03a95fc2614e100c7391d',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_3e_3d_33',['operator&gt;=',['../backend_2metal_2kernels_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f',1,'operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a64f6787a96386246f83a8981d274150e',1,'operator&gt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1a788f82212afad30e4c2ee40f1c313c',1,'operator&gt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ae88617c4a012c5dc12781a349a28c886',1,'operator&gt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a467a88531150a4d9d30fce07c49c126e',1,'operator&gt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a9e21c5ea9dd724dc2ca8c54ad908f09c',1,'operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2f6286d222e2176bcbdc824c5d598100',1,'operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#abec53064aa96265385ecc57de5fbc74c',1,'operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#ac766839f8f9e4863e8e18418c342c875',1,'operator&gt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a2807fa6862b0f9689c81199b1e695ed8',1,'operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#aee3ae0d0d1f941463b06eca0bf041b2b',1,'operator&gt;=(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a523eda93c809733368e2b45382d2add6',1,'operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2bf16_8h.html#a1f4e90909ac1c7280f4c7d1977c55fb7',1,'operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46',1,'operator&gt;=(complex64_t a, 
complex64_t b):&#160;complex.h'],['../namespacemlx_1_1steel.html#aa3c95c60cf69603705bb4636de547bcb',1,'mlx::steel::operator&gt;=()'],['../group__ops.html#ga3a41895f25ed083a36994d95fa102546',1,'mlx::core::operator&gt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf509f2cb3b18963232f20d6c3bd229b2',1,'mlx::core::operator&gt;=(T a, const array &amp;b)'],['../group__ops.html#gafa0eb25d5978674bfc9e59d4145ec590',1,'mlx::core::operator&gt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a8494764f5c686743ede66dc76d85d955',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a019df48807b506d9995856684bf7797a',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a96ab6405430efb887cdb5c828cb67d6e',1,'mlx::core::operator&gt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac18be72269b1bcfb0249cc00a0600681',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aeb879815228efbd2c8f80986e1c8d41f',1,'mlx::core::operator&gt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0051156f6a568f58cd54850f746fb507',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae93556906e115625ed1b62d36cf21b70',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab81ad16e3be591dfc9e42ac3c19b055f',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6cfe9b03e7c5f1eb9374208a552c3cc9',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2f5add83812fb137dd9226c6c01e45d5',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad1014a836e7ce9301de8588eef1e89ee',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#a17791561434dc995de9f268d145c0ed1',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a3755925b24a903045937464be117de2f',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a6262aeb513d27fc8313293b261e72abb',1,'mlx::core::operator&gt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a6feb4b3ea511b0eda4d1ec9725f3fb4c',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03b3f7fcb755ec075985ab26336926f0',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aecfbf5ef4872ae447eb4a374e4db28e4',1,'mlx::core::operator&gt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae4690f349b2483f5d1a4b75aba67399f',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a667e95146dd5199e67bcb121b984b1f0',1,'mlx::core::operator&gt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3375f1562f148bdc07451f2b6e54e6df',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae83df12368cb07ccb1c10c1117ff3922',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad41251938cf852b5560c1180944ebb49',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a4ddb5ef0b88929086f9b09729fda0dde',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0908a61ab261aff726922b33fa6ed159',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a0fdadf87edd8a0a57c63953fb0ebe053',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a47c82778e43032c0bbf5d59407e81dc9',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint64_t 
rhs)'],['../namespacemlx_1_1core.html#a14e6c43b924eacca1b2dac1d5d00ca2b',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_3e_3e_34',['operator&gt;&gt;',['../group__ops.html#ga498b61f7e8f056ae00297fa0dc17303a',1,'mlx::core']]],
-  ['operator_5b_5d_35',['operator[]',['../classpocketfft_1_1detail_1_1arr.html#aea0bd899b19e03f54dfd6c188727061a',1,'pocketfft::detail::arr::operator[](size_t idx)'],['../classpocketfft_1_1detail_1_1arr.html#a99c54f96bc79c7cdd8925c1663462842',1,'pocketfft::detail::arr::operator[](size_t idx) const'],['../classpocketfft_1_1detail_1_1sincos__2pibyn.html#a71b02f67c47b24adb296eafd2c7a3598',1,'pocketfft::detail::sincos_2pibyn::operator[]()'],['../classpocketfft_1_1detail_1_1cndarr.html#ae4852d1fe936a5d61832b507816c7054',1,'pocketfft::detail::cndarr::operator[]()'],['../classpocketfft_1_1detail_1_1ndarr.html#a2b2c4e205e8b5c32c9fe55dfd7b8c8d8',1,'pocketfft::detail::ndarr::operator[]()']]],
-  ['operator_5e_36',['operator^',['../group__ops.html#gac3a6fe18694e84b3d63458e9553ac181',1,'mlx::core::operator^(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#ae36ea40b8477bfa12d41aae8245225c9',1,'mlx::core::operator^(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a03fc96696f5c6d9411841889d05f4670',1,'mlx::core::operator^(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a55130edf926366db0d6207989e609b7c',1,'mlx::core::operator^(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0b75198f364d742a1c25dd13e398f2c2',1,'mlx::core::operator^(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7f205f1b10b23180a23bf2be4bb726b1',1,'mlx::core::operator^(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a9edfe65f3c6da583c7b109290ec94b22',1,'mlx::core::operator^(uint16_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_5e_3d_37',['operator^=',['../namespacemlx_1_1core.html#a97cb7d3eac404a442e84656cefe7cfb4',1,'mlx::core::operator^=(_MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abcfd2d9615c96561fd44dfb9c341cf8e',1,'mlx::core::operator^=(_MLX_BFloat16 &amp;lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#ae78083d766b9cf6f87cded341bbcd63e',1,'mlx::core::operator^=(_MLX_Float16 &amp;lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acf36c10779fbf1efbe1e6a7fd41176cd',1,'mlx::core::operator^=(_MLX_Float16 &amp;lhs, uint16_t rhs)']]],
-  ['operator_7c_38',['operator|',['../group__ops.html#ga52392a2a98f09a80da8d338c4908bd02',1,'mlx::core::operator|(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#af84ed854132c1514dca5a524fdb7ed05',1,'mlx::core::operator|(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7423aac70f9f2e3fb6a5c9a3fc96f703',1,'mlx::core::operator|(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a19805f505cb7ac72bfab66c339ea7900',1,'mlx::core::operator|(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2d933573edf4ed305fddd8a0caef1ee8',1,'mlx::core::operator|(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afab3d4eb1b36a276922879ce6e44b7f5',1,'mlx::core::operator|(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#ab132729fa6912d22a8e402057eb4ba12',1,'mlx::core::operator|(uint16_t lhs, _MLX_Float16 rhs)']]],
-  ['operator_7c_3d_39',['operator|=',['../namespacemlx_1_1core.html#a8e1d21375ae4b89b3cbea3a46d262abd',1,'mlx::core::operator|=(_MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a28d6c2f89e73b7b874dd1f67f853a96f',1,'mlx::core::operator|=(_MLX_BFloat16 &amp;lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a2d8470b69cbbeefece08d3ffd46c0082',1,'mlx::core::operator|=(_MLX_Float16 &amp;lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a359c6257097a304c00d41d64296ef4c9',1,'mlx::core::operator|=(_MLX_Float16 &amp;lhs, uint16_t rhs)']]],
-  ['operator_7c_7c_40',['operator||',['../namespacemlx_1_1steel.html#a1bb3ac5061a04e407fc4cdcc9f6ea03f',1,'mlx::steel::operator||()'],['../group__ops.html#ga27af56a98270d4d76d139f0f9171b83a',1,'mlx::core::operator||()']]],
-  ['out_5fof_5fbounds_41',['out_of_bounds',['../struct_read_writer.html#a08e10626fbc789b6dff9172fd6c36f7c',1,'ReadWriter::out_of_bounds() const'],['../struct_read_writer.html#a6f946aea5452109dca7fc70ed39c6efe',1,'ReadWriter::out_of_bounds() const'],['../struct_read_writer.html#a8f40d7f343d32134fe27a694abfde6bf',1,'ReadWriter::out_of_bounds() const']]],
-  ['outer_42',['outer',['../group__ops.html#ga866af24e10db2797e1c5a5986dbf6c0d',1,'mlx::core']]],
-  ['output_5fshapes_43',['output_shapes',['../classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853',1,'mlx::core::Primitive::output_shapes()'],['../classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32',1,'mlx::core::Abs::output_shapes()'],['../classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594',1,'mlx::core::Add::output_shapes()'],['../classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974',1,'mlx::core::ArcCos::output_shapes()'],['../classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b',1,'mlx::core::ArcCosh::output_shapes()'],['../classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5',1,'mlx::core::ArcSin::output_shapes()'],['../classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed',1,'mlx::core::ArcSinh::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a',1,'mlx::core::ArcTan::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03',1,'mlx::core::ArcTan2::output_shapes()'],['../classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8',1,'mlx::core::ArcTanh::output_shapes()'],['../classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64',1,'mlx::core::ArgPartition::output_shapes()'],['../classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179',1,'mlx::core::ArgReduce::output_shapes()'],['../classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859',1,'mlx::core::ArgSort::output_shapes()'],['../classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4',1,'mlx::core::AsType::output_shapes()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599',1,'mlx::core::BitwiseBinary::output_shapes()'],['../classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea',1,'mlx::core::Ceil::output_shapes()'],['../classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c',1,'mlx::core::Compiled::output_
shapes()'],['../classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f',1,'mlx::core::Conjugate::output_shapes()'],['../classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3',1,'mlx::core::Copy::output_shapes()'],['../classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b',1,'mlx::core::Cos::output_shapes()'],['../classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962',1,'mlx::core::Cosh::output_shapes()'],['../classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994',1,'mlx::core::Divide::output_shapes()'],['../classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b',1,'mlx::core::DivMod::output_shapes()'],['../classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867',1,'mlx::core::Select::output_shapes()'],['../classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666',1,'mlx::core::Remainder::output_shapes()'],['../classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9',1,'mlx::core::Equal::output_shapes()'],['../classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187',1,'mlx::core::Erf::output_shapes()'],['../classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639',1,'mlx::core::ErfInv::output_shapes()'],['../classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670',1,'mlx::core::Exp::output_shapes()'],['../classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08',1,'mlx::core::Expm1::output_shapes()'],['../classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015',1,'mlx::core::Floor::output_shapes()'],['../classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46',1,'mlx::core::Greater::output_shapes()'],['../classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f',1,'mlx::core::GreaterEqual::output_shapes()'],['../classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb56d813b2cde',1,'mlx::core::Hadamard::output_shapes()'],['../classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d
48',1,'mlx::core::Imag::output_shapes()'],['../classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278',1,'mlx::core::Less::output_shapes()'],['../classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f',1,'mlx::core::LessEqual::output_shapes()'],['../classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d',1,'mlx::core::Log::output_shapes()'],['../classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df',1,'mlx::core::Log1p::output_shapes()'],['../classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c',1,'mlx::core::LogicalNot::output_shapes()'],['../classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617',1,'mlx::core::LogicalAnd::output_shapes()'],['../classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4',1,'mlx::core::LogicalOr::output_shapes()'],['../classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635',1,'mlx::core::LogAddExp::output_shapes()'],['../classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b',1,'mlx::core::Maximum::output_shapes()'],['../classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70',1,'mlx::core::Minimum::output_shapes()'],['../classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061',1,'mlx::core::Multiply::output_shapes()'],['../classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014',1,'mlx::core::Negative::output_shapes()'],['../classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a',1,'mlx::core::NotEqual::output_shapes()'],['../classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8',1,'mlx::core::NumberOfElements::output_shapes()'],['../classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf',1,'mlx::core::Partition::output_shapes()'],['../classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c1',1,'mlx::core::Power::output_shapes()'],['../classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5',1,'mlx::
core::Real::output_shapes()'],['../classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65',1,'mlx::core::Reduce::output_shapes()'],['../classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047',1,'mlx::core::Round::output_shapes()'],['../classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43',1,'mlx::core::Sigmoid::output_shapes()'],['../classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67',1,'mlx::core::Sign::output_shapes()'],['../classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a',1,'mlx::core::Sin::output_shapes()'],['../classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28',1,'mlx::core::Sinh::output_shapes()'],['../classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35',1,'mlx::core::Softmax::output_shapes()'],['../classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d',1,'mlx::core::Sort::output_shapes()'],['../classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02',1,'mlx::core::Square::output_shapes()'],['../classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5',1,'mlx::core::Sqrt::output_shapes()'],['../classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e',1,'mlx::core::StopGradient::output_shapes()'],['../classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc',1,'mlx::core::Subtract::output_shapes()'],['../classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37',1,'mlx::core::Tan::output_shapes()'],['../classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325',1,'mlx::core::Tanh::output_shapes()'],['../classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5',1,'mlx::core::Eigh::output_shapes()']]],
-  ['outputs_44',['outputs',['../classmlx_1_1core_1_1array.html#a2c186fd527f984f0589d4183b4976289',1,'mlx::core::array::outputs()'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f',1,'mlx::core::metal::CommandEncoder::outputs()']]],
-  ['overwrite_5fdescriptor_45',['overwrite_descriptor',['../classmlx_1_1core_1_1array.html#a95e6b156c8e05439f076b85c05079387',1,'mlx::core::array']]]
+  ['operator_2b_3d_21',['operator+=',['../structpocketfft_1_1detail_1_1cmplx.html#ad4e69dcd89bdb7764c9c5807168f911e',1,'pocketfft::detail::cmplx::operator+=(const cmplx &amp;other)'],['../structpocketfft_1_1detail_1_1cmplx.html#affa618d8850a7c232793b7c61db6d184',1,'pocketfft::detail::cmplx::operator+=(const cmplx&lt; T2 &gt; &amp;other)'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab04f480aea9fbba0895068c7558dd400',1,'operator+=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a251780ac4592cc2b1a543e417ff57770',1,'operator+=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24381d991c2d570aa953694f396a69b5',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7595740d4cc12924905d6bd1b99ee4da',1,'operator+=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac1498acb8c3623b5f412f70ab6a6528b',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abce5ab327110c164f054b43ed47f79a0',1,'operator+=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae0c70198e236ffe1a98f79987c686419',1,'operator+=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a13b3338935440ae51ecc4a356093efc5',1,'operator+=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a0cb8544b4ebd2906ba8e7f2868e8de',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7b134429ea0c8493800ff8b465410f9c',1,'operator+=(thread half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4154f90ab7857ca856f9e15fe1bf5acf',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab9ae6a51e2027b02cac9966e05f3ba68',1,'operator+=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab93ce536eb7998bee00de4af868e31a9',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad0ae9e2b4874f991a2c853e1c1fe735d',1,'operator+=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a194a6670cc25ade35a24b566f31af785',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3d0d689516c99003659c5d026847bd2e',1,'operator+=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a007f58508b98bb79e5c323ed0dec89b6',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa7198e580e2a83c1fd01a4b6fdf86a80',1,'operator+=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a15573fefd880adefbba079b1c1bd8082',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a104cf94cb9e359d1b6ef92ced2ce0c27',1,'operator+=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa04cfcb52191fd23205a1a3572b46ae0',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad684bc2ae1a2a627cd3e4a4c641e2d77',1,'operator+=(thread int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1e28448e35f4934075b397c34ba3d66',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8ad16afd7f1711de83c0cec5af868f76',1,'operator+=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac45e9ca0c7155caebe3d0f7261518077',1,'operator+=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3c62ac679d6aa515144d40ebafe4a188',1,'operator+=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ff5ab3aef1057fa083b53a65c8aba03',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae74bb0a3c12cd1a23f3d29ce307d6fb1',1,'operator+=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac188bd19f236b098d603b0d8acd08921',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef9fa600d107b509f2e3df7d6b080e01',1,'operator+=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af5713afb3a62967a02c3c20661951ee4',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7f1b84352a3ed6171444a43da1fc7e92',1,'operator+=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af1983edd26245e6e51c6e47354095e32',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8cd55d1a579540eb450e12a8a8a950be',1,'operator+=(thread uint16_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a588ef0f7e03f306758524d378278976f',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a74751abec7086f85f4f26ced44f1ca1f',1,'operator+=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4dd3cf0e5aa116ff330352a50c18cde7',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afb9a0e18c0e40c77e6143fb7d84ebfba',1,'operator+=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adf0cfd9a608a6fb3d57933e32e7d81d2',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4bd92db6c8b9b5dc96332c7ae3eff8c7',1,'operator+=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5d628a5bc4fa755610392f47a523a1f1',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7c790442f77f2437b482c4a55e224fc3',1,'operator+=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a77bab4481b41be50297b257e95058706',1,'operator+=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7816a97d16b1d2f8a90227bb1da2f6ac',1,'operator+=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac244d140c6149726ea44174d3e836ca3',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af802541c4c65ee4442acd495de4d27fe',1,'operator+=(thread uint64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac06eb2fea47a09a8a8abdaa1aa9b4603',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5263b2463fecdc97f9521d00bffea059',1,'operator+=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24ca436ab299a710263d65302532dd3b',1,'operator+=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee1bdf0ab2e445293708b476e8cfde3b',1,'operator+=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a190e27077f0fba642a86f5c8f488bcc2',1,'operator+=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a9f2c9d2f21fbf9fbbacd940c6967c9d1',1,'mlx::core::operator+=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a0b1b3c48afc0a785282e43435bba8418',1,'mlx::core::operator+=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7b763db8194e6fcb1b87eab143dfa47a',1,'mlx::core::operator+=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a827167f6a1ae55428fd218ddd51ec3b6',1,'mlx::core::operator+=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_2d_22',['operator-',['../structpocketfft_1_1detail_1_1cmplx.html#a460da5db36d1c72fb1ed3496fd3abde4',1,'pocketfft::detail::cmplx::operator-()'],['../backend_2metal_2kernels_2complex_8h.html#a226cfd54d49f02e35c5aab3139c7596b',1,'operator-(complex64_t x):&#160;complex.h'],['../backend_2metal_2kernels_2complex_8h.html#af5608264cf920688607059b4e8cd3117',1,'operator-(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6aedc8d6d0980134ac69b96f22d9a855',1,'operator-(_MLX_BFloat16 x):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a333f67614dbf8027439a7e124052cb85',1,'operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a891aa4bf46c20a26a55061736aba25f1',1,'operator-(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7ad7ff44a3200853711869f7a577d931',1,'operator-(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af69ef8f1d8ecae0e6f755bf1c46cf075',1,'operator-(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5bd875a54b79b2dcedf674807c3e53c5',1,'operator-(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab02f8646b47806e1d2038f248df03f06',1,'operator-(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab27b26182c7c6e08af37e6d511fd9253',1,'operator-(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5868c85c988ec3432cf86d7df40e464d',1,'operator-(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad03ef47e6cc7521bbfb45740dee20f88',1,'operator-(uint32_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab789f8a400512ff27e36b3373170f0c5',1,'operator-(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7f601b22ecc480132d82ad782e5363bf',1,'operator-(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a152366ab4e2ccc867e919af6c74ced91',1,'operator-(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a42bead8ef0beb9f3452128d64cd4df9d',1,'operator-(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aca8ef21c16984ccb329b3bd0c1e4be48',1,'mlx::steel::operator-()'],['../group__ops.html#gade2eea48989f4caaf36e89f7bd2a8816',1,'mlx::core::operator-(const array &amp;a)'],['../group__ops.html#ga0c7f3cb36d4ca516c7a33142f88b9181',1,'mlx::core::operator-(const array &amp;a, const array &amp;b)'],['../group__ops.html#gae68d3d0691ba951501218e98439f3465',1,'mlx::core::operator-(T a, const array &amp;b)'],['../group__ops.html#gaf5e5d882c51ad0a0ea315c274d5439b2',1,'mlx::core::operator-(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a622ce842fe44e4b6a95e03242341b459',1,'mlx::core::operator-(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#af32a99d930d49e9b178472d7a65531ab',1,'mlx::core::operator-(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3555a2b31fc0925850d3240e85e03ec5',1,'mlx::core::operator-(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a46080889fd9e5c3f9916508e97dff5ad',1,'mlx::core::operator-(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a9ca27fd1e512c8ed126342e565da12ae',1,'mlx::core::operator-(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3803f8d36558d32bb7dd6e580ea683b4',1,'mlx::core::operator-(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#af5d865528989ca66b3d357e5ce4e0300',1,'mlx::core::operator-(bool 
lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#afb784b960f55aeb4edd7f567fa74d443',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a29cbacf4b399c24728fb0808fad498f9',1,'mlx::core::operator-(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aececc0e451237aa6c0d1a2c3d828c86e',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a383a26cc2689c98fd6c4435ade8dc669',1,'mlx::core::operator-(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad6311ef8df59bdfb212b5cf8169246b2',1,'mlx::core::operator-(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a23b7329bc1c93c8ac0a1f576565fefb0',1,'mlx::core::operator-(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad8d650bf63998abd716ee0ca28e1cbb9',1,'mlx::core::operator-(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a7339b33201254e9119d99d3a728ded72',1,'mlx::core::operator-(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a064318b7a16e5cb6d0a6407501b5c7dc',1,'mlx::core::operator-(_MLX_BFloat16 lhs)'],['../namespacemlx_1_1core.html#a7bae3ff296d9a60ff3c7e448f7fbc6bd',1,'mlx::core::operator-(const complex64_t &amp;v)'],['../namespacemlx_1_1core.html#afb5069ecebdfd9d388c26f83df12c93c',1,'mlx::core::operator-(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d126e3f3fa9f8c1c1ae1b09f94df487',1,'mlx::core::operator-(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ad04f1ccd2cd7c487a2f2aaa055939f64',1,'mlx::core::operator-(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a15eb2ea76508ff823fa0591e811d0b7d',1,'mlx::core::operator-(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a96d9577db38d6809d022893e32feeda1',1,'mlx::core::operator-(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5d9c02765c1672930757416411567bf2',1,'mlx::core::operator-(_MLX_Float16 lhs, bool 
rhs)'],['../namespacemlx_1_1core.html#a6105d3b5266666b7c6bb9469285a9ec3',1,'mlx::core::operator-(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a777aa772dfb205b25d26f3180d98a2f6',1,'mlx::core::operator-(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a085eb092f4ada47f8169de62886cff90',1,'mlx::core::operator-(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab25e5d211e2c8785b45c3a81a6282e2b',1,'mlx::core::operator-(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#abf5d09561a81b0f0b32d59d77e32e16f',1,'mlx::core::operator-(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4ce6867dbb4d1631d1870dac14022dbb',1,'mlx::core::operator-(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a8a049e646e0442064cfe9e202d7047c5',1,'mlx::core::operator-(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a78e2a1cfc65453185bcca13bd4f523cf',1,'mlx::core::operator-(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af143cf68673e06390d4bb2ec2892bd22',1,'mlx::core::operator-(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a46d502dfe0b027955950d4e716c2eb26',1,'mlx::core::operator-(_MLX_Float16 lhs)'],['../namespacemlx_1_1core.html#a2631e78c6f0a602f6754ac577ec75f83',1,'mlx::core::operator-(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a73d79cbd75d543d0837b8a51bf103f9e',1,'mlx::core::operator-(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2d_3d_23',['operator-=',['../structpocketfft_1_1detail_1_1cmplx.html#a12441ff423274bd1b54245933d69ad7e',1,'pocketfft::detail::cmplx::operator-=()'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab225043bd02bb423930bc98aae9c2bca',1,'operator-=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac2f1e1f2365cfa531b1519aa9ff67695',1,'operator-=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a513501355a5912a1263fd8b10864142b',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab4f4ecd62c3d8b3363d02019573dc9f1',1,'operator-=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a92d1348f201d78fcd474f75d5b23ef68',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3eefe9a7f5fb226335ea687012f32d5c',1,'operator-=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef62c7e3e494b6a511a7833c0d942a60',1,'operator-=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad30726cc8b69fd300d33c2a46e123c28',1,'operator-=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8859b5b8dc241e4f58243c85d2630cc8',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7003e1e5881e3d106257f22b6a3e59fe',1,'operator-=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3165e37d393be50c2cfa9ddcba153684',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, half 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a76f5bd895b7214cbc3cea3440992718a',1,'operator-=(threadgroup half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7167343d90eb70e5a0d5fa9ec5398e94',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9b31c363ebc93d592b6fa0e27b00335a',1,'operator-=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a967a1d7b5664f616e5b6f2d257367f0c',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aff19193e1b2cee29a8737318e95cc74a',1,'operator-=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aede0cc4179507b739849948f1a2fed4b',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e1a6056f9c96f3c89fe204dbf103be5',1,'operator-=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9d06cceea5c179bcc608452188bd7d6a',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0aa9ffe056f49fda181bbacbd60556ea',1,'operator-=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ada5685d99c2d6708d1c4ef826d68e879',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a726cecf778b8584b6f7c37db1b064576',1,'operator-=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3816a35f8468156d59c239256c12dcf3',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa332fae098e7c6dc23b98bc0026f1070',1,'operator-=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afb3cd302e0b78902c62111dce4494fe8',1,'operator-=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abb884888f14086cc674657677cb4b8bc',1,'operator-=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a38bb89f925eca4f9c042f6ee7a2c0193',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac30c580713f354916088a7dc049ae4cd',1,'operator-=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a715c824ee8c87e0256114a85624d9949',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7bc91aaaf476a37063264d1d53d862cc',1,'operator-=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab155f418f15cabd86ff942c6f9472ddb',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aaa66dc6d7b2c5efbfaa97ca9c7872bd8',1,'operator-=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a696978d9401e09200045b2d8aad045c2',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae998d8f423a9fb73405cfbd4b836bc72',1,'operator-=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a279d09ab8542f1c1a8dc8173b65946b6',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a491dadfae957cd7cc0c36188d910f6f6',1,'operator-=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9a837c3b9c4e42f53d7cd1ed0d266e2f',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acf7af2284269544064b68e807064bba4',1,'operator-=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a28d297705e29009197418546ef435393',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a948579a4d9ba276523190b03b09578fb',1,'operator-=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a4b98a0a11db5b77cf9168df37c8bc7',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31a3d8f2ff8038f7e0d717845c039808',1,'operator-=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1dac193d9f1c8c0eb4473441895f8c58',1,'operator-=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad2817d53fdd4b112babfb6f0b38c8f39',1,'operator-=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa705d87cf4b78e9d7c6b07dd0c66cac6',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a542affc376726840647a6e93acf2c1a7',1,'operator-=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#add18cfe4c0d38e95c6dff6bab3e7a932',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab1de7e7e7304ff3598925d2e69134764',1,'operator-=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0d3fb52437c677c5d0f1a3642384b15c',1,'operator-=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adda64cae388baac1f138b06dc8595237',1,'operator-=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af20874a61c6c3f4c3fd045a96e806644',1,'operator-=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a8b8a55690df46d97fcfc2a60120783af',1,'mlx::core::operator-=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab03949b1f60fa035ce454a894cd73ae9',1,'mlx::core::operator-=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adaf70bbfb3667df0d08fd3c99896e20a',1,'mlx::core::operator-=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a321c98e5a78621d3c9a3895f707f2f1c',1,'mlx::core::operator-=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_2f_24',['operator/',['../backend_2metal_2kernels_2complex_8h.html#ae6a708f67d6fd9b0962aa8877cec6d35',1,'operator/(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f16a44e1c9836ca57edc1d7b93b5d7c',1,'operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aacaedf12f862c76457133336dd6fc446',1,'operator/(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a584a513596de20663dad951a5b81695e',1,'operator/(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad8f7b11669736fbd6ed2e28211d877d4',1,'operator/(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a59515695ebc48844345fa5120511aed1',1,'operator/(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a8c8ac6736440fdca366ebdefe2a12b9f',1,'operator/(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad6859b04680d0d26d75fd6c4dd74ee24',1,'operator/(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4720cc79ab2b8e39952ea9ef20e51250',1,'operator/(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a72d10ec0e62949247da129eb3a83fb9b',1,'operator/(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad6399ba2b8708899739b4cdbb44add8d',1,'operator/(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a998b1ba877a606aedf722ab46b290403',1,'operator/(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa3277ae33976c70f7bd937ddff027b72',1,'operator/(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa708a970a200822c99c0489f389469fa',1,'operator/(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a6bde717aca2051499f73a3eee199bfdd',1,'mlx::steel::operator/()'],['../group__ops.html#gaeedf77f722b394429f1a7f6c367883bf',1,'mlx::core::operator/(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga7366ec7f453be2a4dc449f0faa1bf554',1,'mlx::core::operator/(double a, const array &amp;b)'],['../group__ops.html#gadfb324ae9b4feb2c7ea0ac6ade639f38',1,'mlx::core::operator/(const array &amp;a, double b)'],['../namespacemlx_1_1core.html#a7573ac3b93ddecd69e9c88a26fc84ba9',1,'mlx::core::operator/(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a40e868dad70401d9aa9ee9c32235c315',1,'mlx::core::operator/(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a7587c28fbd2023b134e5fc12bb0dde23',1,'mlx::core::operator/(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a92cdd377c408becf4cf83c1ee9b7085d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef89566301cb133d98c8e7bdd2b7bec6',1,'mlx::core::operator/(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a94e7b51185590492b46916685641276f',1,'mlx::core::operator/(_MLX_BFloat16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a04584788c08180835219d0ea1e2b97b1',1,'mlx::core::operator/(bool lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad5af96e2ff09d207eb1e1980fe3e7c2d',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ac2217bf760038cd011781158923149ed',1,'mlx::core::operator/(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aea414c04bddc4b9b609262e97398f1b4',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a27fe23230cd082c0363b9451b731ce6b',1,'mlx::core::operator/(uint32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#abdd9bb8fb4411e5924f3eb7ef1bb52f8',1,'mlx::core::operator/(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a50bae338a7353f8b0ed3441071bb0cf6',1,'mlx::core::operator/(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aab26a3284dd3ac7d47c8b5b3a3290ce3',1,'mlx::core::operator/(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a749f48db01de38f259a0c6750a97fa77',1,'mlx::core::operator/(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a32a6a08a2a4652975b0a1bd1fcf3eafd',1,'mlx::core::operator/(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a4b66fb38ddc5cc0c2489583d5c499602',1,'mlx::core::operator/(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a45726f1905b709cf8253e6efa046027b',1,'mlx::core::operator/(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afd4170c1e364384f30e6bae341146fa6',1,'mlx::core::operator/(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aef85739d150b9d5609973da8a3f1086a',1,'mlx::core::operator/(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af52a941f8ed9b25eec91402c7b9e281f',1,'mlx::core::operator/(_MLX_Float16 lhs, bool rhs)'],['../namespacemlx_1_1core.html#a477cade78296bc85894170f62db68870',1,'mlx::core::operator/(bool lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a22f5a2257e11423fc2fe18e2dce91590',1,'mlx::core::operator/(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a640d3574dfe6ad934c720ae8bdd78bfa',1,'mlx::core::operator/(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a6f65d8fd0cdddc96fc01f6af95804873',1,'mlx::core::operator/(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a517019d42d4e426b7b98e1c719bb47ce',1,'mlx::core::operator/(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0beb7a223c542015a4eff4aed814a9dd',1,'mlx::core::operator/(_MLX_Float16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#abc9b1bd5018d46514bc19d23db2e5063',1,'mlx::core::operator/(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#af22937df654ddbd6e398ef12764d18c0',1,'mlx::core::operator/(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a775aed5f49b530c57e71cbac81404d45',1,'mlx::core::operator/(uint64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a97efcd96d6be666e5608034ae77289ef',1,'mlx::core::operator/(float16_t lhs, bfloat16_t rhs)'],['../namespacemlx_1_1core.html#a899851f85dbddd96f9d36319b82542a0',1,'mlx::core::operator/(bfloat16_t lhs, float16_t rhs)']]],
+  ['operator_2f_3d_25',['operator/=',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5aa3b8c68a2b58d41ea33eaabbf83095',1,'operator/=(device _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a90a1c5130db515db48624d8587edbb91',1,'operator/=(device float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a65f30a2dc199134e35bc7c5d431b2263',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7172d84db640e6c49dff0d08dd64b53e',1,'operator/=(thread float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acf7cb9927bf09022088401923f2e1916',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a86b2a001cbec0d3a8d762a3c7ff47b0b',1,'operator/=(threadgroup float &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a744f72ba83522fe3cc2a49a007b42543',1,'operator/=(device _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a77c678665b34df7652dcde053ca73185',1,'operator/=(device half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae0614b6b199d8a65ae95d4621b118b82',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa846fde89c7d2d18b18ef180a8a9c8a3',1,'operator/=(thread half &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08e778be18e4a291c108fcc528b981d3',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a6b9e49ad9ea256d2d0220c0d81552602',1,'operator/=(threadgroup half &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab933bc3cdf9adfea10ab9dba5292c812',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a25e7c5d2ecf3375756d59074f333858f',1,'operator/=(device int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ae4a80fde67eea9a0a37b2803946544',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a912393b7208fa45bd1e87f30b218b68b',1,'operator/=(thread int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a18963246f2b640874bef6dca7049f64d',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0e2c2c2cb50b3a55ff213f18978aca35',1,'operator/=(threadgroup int16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a64f1136b17006f168ef837e17240814f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae46d75b8046d557452d74513f1106710',1,'operator/=(device int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08d2460e259b9106d90d889481ad60d5',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f7fd418408806ef498745c6fdb2c062',1,'operator/=(thread int32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac703495cb370b52526a5a2d36ae26038',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ca11d43174baf0a729f93b35eabcbea',1,'operator/=(threadgroup int32_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9f835a0a80c411580c97b65fdc5bdfd3',1,'operator/=(device _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a17f47ec9cff60f8e1b3477a2793b7ac0',1,'operator/=(device int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5be23e296bbed3a885586a6424b1666e',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afba39221eb54e272aae79910b3cd7ef5',1,'operator/=(thread int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac057d95a2bf087575584aa6f9a2c6bf5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab986ae2cec780a1f494b7b4468b7ba11',1,'operator/=(threadgroup int64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a44522c2304c6396bbe6b9d32000f4b6f',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aef8e7e499ea9d432aa743d83c076f945',1,'operator/=(device uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3a0a3edbf1ba2314551454059c3f422b',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acb9f0aef9fbdfde8a4f46e33b0d6c52f',1,'operator/=(thread uint16_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a303dfcc81ffd355f866f863d7d9f0fa5',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint16_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a359edd4bcb8776861ceb26a3005624c0',1,'operator/=(threadgroup uint16_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#adc9f32cc6f40768df4285fba2e4783c7',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae71f66d814a03f6377c9d86cf0a2b5d7',1,'operator/=(device uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad0125b6baba3065a87a174ec27aa9a61',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5cc74ad3e522d7104e6e2117751151ad',1,'operator/=(thread uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab3b594321fb42b0c2da99954d1e0976c',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4a0023e2fd08875156cd6ef747fbb5cd',1,'operator/=(threadgroup uint32_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4358ee606e66ba2081fcf94f9c3b5915',1,'operator/=(device _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ad1e7ef6f065695d4b1d017547b60ef62',1,'operator/=(device uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a284dfc702f0f67b9c233b87162eeabdd',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab8f211ea896fc5190004f3ad6ad8932f',1,'operator/=(thread uint64_t &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e1bcf3bc06cbcbc304c0cdf729802bc',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abbe42648a46092137b303ccd08f7df86',1,'operator/=(threadgroup uint64_t &amp;lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af1a12a1efb618a57da6dd41ae18cb53c',1,'operator/=(device _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a94686039356dfa9aa45608a8b0562fdc',1,'operator/=(thread _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa251d6483d3b099d1b5311fbe6f0bce2',1,'operator/=(threadgroup _MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1core.html#a045ff27257cb6d8ab7a94771ba5a17e6',1,'mlx::core::operator/=(_MLX_BFloat16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#a58112951a56a0f9f8c90b60fe74f9508',1,'mlx::core::operator/=(float &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae736defc89a04fbaf7627ad2695bb838',1,'mlx::core::operator/=(_MLX_Float16 &amp;lhs, const float &amp;rhs)'],['../namespacemlx_1_1core.html#ab1f260710251256ef737dd59be9e143c',1,'mlx::core::operator/=(float &amp;lhs, _MLX_Float16 rhs)']]],
+  ['operator_3c_26',['operator&lt;',['../backend_2metal_2kernels_2complex_8h.html#a67674e32596a9dae2258bb8e0e6a2058',1,'operator&lt;(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9ef6a57b7185e9ca49e255fec1a44e25',1,'operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aab02c65bc38ea66335b2192ead4095a8',1,'operator&lt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae91686513e284bcc9635833744bbdda1',1,'operator&lt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2486f3b5de85b0d57f458d8f21f82b42',1,'operator&lt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a435a2aec4c777b4b184ff5d24992e8a1',1,'operator&lt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abdd04257e6a73883b5f56f1186d0e906',1,'operator&lt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a69984aaa05ae1d4fccccf7f57e8ecb4a',1,'operator&lt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a501cc01d5bf15d9f03aa28545f9624ea',1,'operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1b029e4ca72125a5f9471f582c819705',1,'operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0736a76f56578d26ba1422dc8b744a18',1,'operator&lt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a24b1fa8998c892f90f8dde7c34fb10a5',1,'operator&lt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af80ff2020ec2c4b406c5fdae3fe55e63',1,'operator&lt;(_MLX_BFloat16 lhs, 
uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac03f6eefb836373d37dc280b0d813d78',1,'operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#adb5f24b57d98214fc215a06475f21412',1,'mlx::steel::operator&lt;()'],['../group__ops.html#gaee41e2b8f61d563200ff03575ac1d6c3',1,'mlx::core::operator&lt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga1ef8ea11cf15ce628c54201fa42748ef',1,'mlx::core::operator&lt;(T a, const array &amp;b)'],['../group__ops.html#ga95e72226dc7a79c40b3d16f990922050',1,'mlx::core::operator&lt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a987d631e1508e8df55d98ddd57e4d086',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad3fb46370cd8f0992866fad9e2c64a3c',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a3026691bf7ee5095243a8611bf3411aa',1,'mlx::core::operator&lt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0d42d6c1d5f77a96e2f296b8ebd79ee6',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab5ce08a7de0a0ca00d61f7a7f8ea3ab4',1,'mlx::core::operator&lt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abce8b7f24b61e5ec0f9a3afe20845caf',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#aff97612627ae1ed260c43c0a7af0d306',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a9119e518234df7923cae2b3802d59bf2',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#aefb9b05ce8864ada99a920ab32017b89',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abc55f3676c2d112a6e9ab276bd6b1796',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#afe6581a2c45f24d7fab1e4006c1e3c70',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aca1d50cdd9506481dcc4cd1ad4a4f734',1,'mlx::core::operator&lt;(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a310720f513b6a2490e9df80c65f1bfb3',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a29e457a170b6cefb6ba1e394c96c6f7b',1,'mlx::core::operator&lt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#afd4519985b6b207ec41ad8530d1036df',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae1e41ca94022e43a00cdfc5845102daa',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac80f4022bffd95b57526685ce8e1cbc1',1,'mlx::core::operator&lt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3a8f6f0af477788c4f0aa98abfc5f1ab',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a3728ed9b6cbd152bf675251a0501b466',1,'mlx::core::operator&lt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a5b9ad811a5e1358100c5423dd70ea387',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a5c77e1db83995d3e06a8a26265bce5d6',1,'mlx::core::operator&lt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab8a0a3f70664049b35ce1887bd8ff5c2',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6652d93bfb2d426e261a1712a181a4d2',1,'mlx::core::operator&lt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03758b8d13da2de07cc4f4fc45d2854b',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a325161b81a9ff179fd37d949780a17ba',1,'mlx::core::operator&lt;(int64_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#a92eca79fce8233e4299343eee3996511',1,'mlx::core::operator&lt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#adb016662b8f7eb680abfe1a421eabe72',1,'mlx::core::operator&lt;(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3c_3c_27',['operator&lt;&lt;',['../group__ops.html#gad656c30f9fd7d9467e405657b325aa7e',1,'mlx::core::operator&lt;&lt;(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#a1e5c30e316afa30c14bc48b92afdb794',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Device &amp;d)'],['../namespacemlx_1_1core.html#a4ddd07021b36c848d6fb1dd9ac276822',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Stream &amp;s)'],['../namespacemlx_1_1core.html#a0023c267cf81345fad65e7a797954cd3',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Dtype &amp;d)'],['../namespacemlx_1_1core.html#a1fd58658474fb842d648dcf8f7d9f078',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const Dtype::Kind &amp;k)'],['../namespacemlx_1_1core.html#a123331f01188bd76e37623b63b6b4340',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, array a)'],['../namespacemlx_1_1core.html#a4e733bba89760abed32393e085812b22',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; int &gt; &amp;v)'],['../namespacemlx_1_1core.html#a6276bb9bad43ed4a27a1e2c3f5bfd990',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; size_t &gt; &amp;v)'],['../namespacemlx_1_1core.html#a5e5bd5c57b1cf19776bdb41e732861d9',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const std::vector&lt; int64_t &gt; &amp;v)'],['../namespacemlx_1_1core.html#a42a19c8442b173606e714364227e7d45',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const complex64_t &amp;v)'],['../namespacemlx_1_1core.html#a57eb97a5eba99a846ac429795e407574',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const float16_t &amp;v)'],['../namespacemlx_1_1core.html#a7db909d54cf07375e89424c32c07a29c',1,'mlx::core::operator&lt;&lt;(std::ostream &amp;os, const bfloat16_t &amp;v)']]],
+  ['operator_3c_3d_28',['operator&lt;=',['../backend_2metal_2kernels_2complex_8h.html#aee04c9a63c6716a99a027418354debb0',1,'operator&lt;=(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#af469c58cffeab488c681f4b33f02cd05',1,'operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a5a81eae168dfafd299c2b94e3e8558cf',1,'operator&lt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f486bf02c6ad5b9b6a96d3450f03e47',1,'operator&lt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#acba9efe192d22b7781b4622103c7a944',1,'operator&lt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aff100489cc40ad276c2d5d67a9df67db',1,'operator&lt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7eac96f64ca42991caf819c8e8c8d2bc',1,'operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a88c11cd37600de5480570da3d2ae5732',1,'operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a08c7d12a0d16565fbf052dba2db8b22d',1,'operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2b9de9624c0a507b4ead85f898ad9daf',1,'operator&lt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a28f8d21c5eef047c701cf690ce9c2ef0',1,'operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a14b56c687053ee2432398a25663c068f',1,'operator&lt;=(int64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0f360806708b95a3be400af0b8871b57',1,'operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a80d288f22cadfdf5e904410349e616a1',1,'operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a6cc3bab5e7f6e7c719c82afa90ad2827',1,'mlx::steel::operator&lt;=()'],['../group__ops.html#ga4c8b8a1632944acaae50f0de6c23ece6',1,'mlx::core::operator&lt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga150a9be467c9f91482a6d6fc13504bc4',1,'mlx::core::operator&lt;=(T a, const array &amp;b)'],['../group__ops.html#ga624eeccef0cc4b130e1325abfea057cb',1,'mlx::core::operator&lt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a0066a47cb21223ddebc77992ee874fb9',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2593dbace3ce50e7146d9514726a543f',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a88654bcf6c9728517a2933ca2e29a7c1',1,'mlx::core::operator&lt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a5d4f449e9c1699b99fcf894dd15e8af3',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a6b678bea8fdcda1f11c6691b56a15211',1,'mlx::core::operator&lt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae8aacc606ea16f018a90eae758830a35',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a25668dea4ffb51c7c00eeecb9530d1d8',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a084558b6a5487549799c49c37c9e9652',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ade2e2a0daa79d5c52f278f85f03dde2e',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#a750a2d2b4976ad94b08994d081f83445',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ade5a175ff45347689ac4c798d04c8ffc',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae25e0c01b46612f039313a4825ba6428',1,'mlx::core::operator&lt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a5c90f16d8f6edf4b75c96b945b9fa591',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a8cd6583fa0fc9957f993e00b2ec01d91',1,'mlx::core::operator&lt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a012130a0458cbc30b88365e0e0eab232',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae8c890bdcffadee8c5dab85c907f57eb',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a43cb070553c1f2fffb32ef6670e30980',1,'mlx::core::operator&lt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ac759b7798d668a99535e59e26d6ba192',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a70e528a789b5660d98e783b045aaa379',1,'mlx::core::operator&lt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a40bd8abb8a4d989ddabbb298518bd7f5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a4155d4b0c76f37ab5e0b54f9cd683f35',1,'mlx::core::operator&lt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad8bb648d0603a206e0392990c911ca0b',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ace72a5853f2afd6510dcb97d54fa650d',1,'mlx::core::operator&lt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ab38f7a0d3c0809071ff5d3af859018d6',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a7904b886d7b535a6af0a885d00597323',1,'mlx::core::operator&lt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a57952168bd0b54c2677204d4ab1cb6e5',1,'mlx::core::operator&lt;=(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a6235dc5f4db517618bb3449b08c96e8b',1,'mlx::core::operator&lt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3d_29',['operator=',['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a027b84cddc8d476f736ac1f1a9991fe4',1,'mlx::core::allocator::Allocator::operator=(const Allocator &amp;other)=delete'],['../classmlx_1_1core_1_1allocator_1_1_allocator.html#a2e971b47339b1d0849a334a902a9df3c',1,'mlx::core::allocator::Allocator::operator=(Allocator &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1array.html#a8acf2b4c75f9b7f79da6675dbc36cf36',1,'mlx::core::array::operator=(const array &amp;other) &amp;&amp;=delete'],['../classmlx_1_1core_1_1array.html#a5c89c2406a610b32943955f9a5060fbd',1,'mlx::core::array::operator=(array &amp;&amp;other) &amp;&amp;=delete'],['../classmlx_1_1core_1_1array.html#ad3277ff68f1336aa217f9cbe40181479',1,'mlx::core::array::operator=(array &amp;&amp;other) &amp;=default'],['../classmlx_1_1core_1_1array.html#a5da41aabecf4c8055b7515341bf57147',1,'mlx::core::array::operator=(const array &amp;other) &amp;'],['../structmlx_1_1core_1_1array_1_1_data.html#a68e9417954fe811b5e41e6317a526748',1,'mlx::core::array::Data::operator=()'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e',1,'mlx::core::metal::CommandEncoder::operator=()'],['../classmlx_1_1core_1_1metal_1_1_device.html#ad1d6382fd18a46b1906e1b43e0bd2e73',1,'mlx::core::metal::Device::operator=()'],['../classmlx_1_1core_1_1metal_1_1_residency_set.html#aef97dbbc755940789f99a26164591c45',1,'mlx::core::metal::ResidencySet::operator=()'],['../classmlx_1_1core_1_1_primitive.html#a6b1be7ea92f3a7bb19875c70259dad6b',1,'mlx::core::Primitive::operator=(const Primitive &amp;other)=delete'],['../classmlx_1_1core_1_1_primitive.html#a50bbddd43e1ba0cf5f127cd7aa756a9e',1,'mlx::core::Primitive::operator=(Primitive &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#a0a859309a4f192f2679e07f2e4ff4d22',1,'mlx::core::UnaryPrimitive::operator=(const UnaryPrimitive 
&amp;other)=delete'],['../classmlx_1_1core_1_1_unary_primitive.html#ab90b2ea80f1d914be03cf44def5db5a5',1,'mlx::core::UnaryPrimitive::operator=(UnaryPrimitive &amp;&amp;other)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#ab170dbd2ce34c51e2eeebf5d08e7e2db',1,'mlx::core::scheduler::Scheduler::operator=(const Scheduler &amp;)=delete'],['../classmlx_1_1core_1_1scheduler_1_1_scheduler.html#a035ea35f4dd8ee985973080f14029379',1,'mlx::core::scheduler::Scheduler::operator=(Scheduler &amp;&amp;)=delete'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#a0f65b0523b8ddd989f338da6cb2860e3',1,'mlx::core::_MLX_BFloat16::operator=(std::vector&lt; bool &gt;::reference x)'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#abb8cd44ee22b17c55333ff2eb4e13a14',1,'mlx::core::_MLX_BFloat16::operator=(const float &amp;x)'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a608a099bf7116ee608dcfd31ea3ade2c',1,'mlx::core::_MLX_Float16::operator=(std::vector&lt; bool &gt;::reference x)'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a35543c3653d477c46350697fb808373d',1,'mlx::core::_MLX_Float16::operator=(const float &amp;x)']]],
+  ['operator_3d_3d_30',['operator==',['../backend_2metal_2kernels_2complex_8h.html#abfc19f03616441245dfc7726b278f190',1,'operator==(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a49a13b06a325ed3cca4004b6a0cde065',1,'operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a0aa3bfcfab53700488e5f386e6de60d5',1,'operator==(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3936148781ab1c4f33f58d12c116f370',1,'operator==(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae753526b669fba27771089dc809abd66',1,'operator==(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a05a4f197a71d0f16879032f44492bb79',1,'operator==(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae86f5917847b1ec9f313996250f2e0be',1,'operator==(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aab74ec4d33a64b92b908717d500f1ecf',1,'operator==(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac30a2c1fa6f172af903fdeb6a8632606',1,'operator==(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab4e9ad547aa23daa351075e0ecc58fa2',1,'operator==(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa5fa1a8f2b39c3508fe38205469756d1',1,'operator==(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aeadc1f36c6bdc219294ce9341d80afa5',1,'operator==(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a3ae2091ada1e39e857fbc53c97bdb79f',1,'operator==(_MLX_BFloat16 lhs, uint64_t 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac7b4d295f3c7b1e09964f24f306422da',1,'operator==(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#abcc797f27e87e857b41c1a8d33ee2c78',1,'mlx::steel::operator==()'],['../namespacemlx_1_1core.html#a937503d72b66c661bf3f5fdcd98ef97c',1,'mlx::core::operator==(const Device &amp;lhs, const Device &amp;rhs)'],['../group__ops.html#gaa30cf69f3d22f65615f5e1696dd5703f',1,'mlx::core::operator==(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf115782d009ac2a547fcca395c9ec797',1,'mlx::core::operator==(T a, const array &amp;b)'],['../group__ops.html#ga3ad3ed7aece2650943a35082dbe3a0a5',1,'mlx::core::operator==(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#ac470f937a379d6356c8f567c97cd7481',1,'mlx::core::operator==(const Stream &amp;lhs, const Stream &amp;rhs)'],['../namespacemlx_1_1core.html#aec63a0472cb943fe39f31e7678555572',1,'mlx::core::operator==(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad05311ca8e2f19ffe5849e963837cec7',1,'mlx::core::operator==(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aaaf591cb2188381e6cbd857132d04eb7',1,'mlx::core::operator==(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7ef33c33509ccccf1ab217500e8b3c1a',1,'mlx::core::operator==(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#abec4200a718b7c5ed80b7abcc4447260',1,'mlx::core::operator==(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ad853981b1c5ba69b07d54c7b77055d22',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a752d6cb4172a9cb91e5da19582329c6d',1,'mlx::core::operator==(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0175beb3de139faa08479a88215b35ea',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a61da2851cb3beeef28049228346c28b5',1,'mlx::core::operator==(uint32_t lhs, 
_MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#aa24713cb9e39bacb516c992eb03d2b2b',1,'mlx::core::operator==(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a6d565dd93c46259f9486d9fdf0969589',1,'mlx::core::operator==(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a24e79a82557861de64dad66d36e6ff30',1,'mlx::core::operator==(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#af27d515ac390d62bd852b73ea759a947',1,'mlx::core::operator==(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ae3e1e8b7a5410e0edf35f31f74295e2f',1,'mlx::core::operator==(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aaa22230a66b15c3e774d8ce45783a746',1,'mlx::core::operator==(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ae2a0bcdc171d7e9745d33e1d9aac4f8a',1,'mlx::core::operator==(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a331ec62442a8d3eb8ccba7b4de5168d1',1,'mlx::core::operator==(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#acfcaefe0990eb3533e2b11a6f2657492',1,'mlx::core::operator==(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a8d48dbd49cccff07777affb2a412058c',1,'mlx::core::operator==(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a88eae27edd22fa4418776672023cb276',1,'mlx::core::operator==(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a188b363f633ea360407b3f9cf4e1f1a6',1,'mlx::core::operator==(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ae065fe5c42c1a333d7858d19f6434fa9',1,'mlx::core::operator==(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a2f98db199deb6d7a82551fa4afec655a',1,'mlx::core::operator==(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a85f83add412cb320b5cd1c3da6aadbd5',1,'mlx::core::operator==(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7e2cee66c3ca1b56f4f3d7fd1d6e0be1',1,'mlx::core::operator==(_MLX_Float16 lhs, 
uint64_t rhs)'],['../namespacemlx_1_1core.html#ad436557da5c7fea71fc58182a876cfe5',1,'mlx::core::operator==(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3e_31',['operator&gt;',['../backend_2metal_2kernels_2complex_8h.html#a032a8d3eec2384c9f03066f7fd945995',1,'operator&gt;(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae394c0a10e47d1d047854a888402eb57',1,'operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ab9cd098786d2f4c855c42e4a6f30ab3e',1,'operator&gt;(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a55600f3b9859e2891e0e0b5690867b72',1,'operator&gt;(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#afd7cdb8ed2a9820efe9cf322c06f188c',1,'operator&gt;(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a31bbdbe0b62b90a4d6ea4bb0a7db586b',1,'operator&gt;(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a68125e66f74eaffe5ea9267638ce870d',1,'operator&gt;(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac89eb6b29edad8cca63727ab97171c29',1,'operator&gt;(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a74e477567c9477c2cf0684f81ef4498f',1,'operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2d37130b6fd79b425f5ba92b65e36bed',1,'operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a41d55d167e9dc63bf29d15e0ff004869',1,'operator&gt;(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aa95f9ebfdab3c5f524775651362ce914',1,'operator&gt;(int64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2826bd301bb5393473ccd363f2052c0d',1,'operator&gt;(_MLX_BFloat16 lhs, 
uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a62a512d0edd894759c69f724b970fbdb',1,'operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#a7512eadda6160e4c9d9e6aa4049fac20',1,'mlx::steel::operator&gt;()'],['../group__ops.html#ga74fd2777adef10e6fe628a9cdadb01cb',1,'mlx::core::operator&gt;(const array &amp;a, const array &amp;b)'],['../group__ops.html#ga32e106e794e2c32e4e7decee2df2477f',1,'mlx::core::operator&gt;(T a, const array &amp;b)'],['../group__ops.html#ga96552b90e89923c5d2064cc427775ec5',1,'mlx::core::operator&gt;(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#aedc4e9df4bf71c0ac34fcfae60cdf550',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a14c188303d09b97867bcfd34519aa4a6',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#ac97736fadafa7efa201624d0e1128ee8',1,'mlx::core::operator&gt;(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a3c41a304126bc225bdc68062d1eb6e7e',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ab594f3ae1ee13227fae940fef0d00cb9',1,'mlx::core::operator&gt;(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a01dabc077a872c115a9a9ccd95f1acec',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#adabbd8768d216873617768249473a5c7',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#adae1b14669d27ce1fe0c214771c07b77',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#ab03a22961d99fa12d3e74b3116e94e8f',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a42011a27a3d23a60be5be44ee7cac87c',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a50f6a94bb36d89cf28817aff88ab89c8',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac173de50ee57b1b066d49363ba978c53',1,'mlx::core::operator&gt;(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#ab09f1b4879aa3190c2f66c9bd1224021',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a91eb6ca854217424129a55ae95a123b5',1,'mlx::core::operator&gt;(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a58d5795d8312599d101ae16f194e4a2a',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#aafa3bbeda78610c4285f3e57042268f3',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a8a928d76a6fbf3d336296401e14617a4',1,'mlx::core::operator&gt;(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ade2f9222fd433cd4d673c6182f256235',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#ae24c337810c841ff23e327efde7045e1',1,'mlx::core::operator&gt;(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acf401ede354fcc998b13ea6442994d7e',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#a2bb28a9a0894a73ae1b27e7f4da0841a',1,'mlx::core::operator&gt;(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a09d631e8a85fd7ae72e1a868b8f9b9cb',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a49421ea65b5a98df080d75b1636b2157',1,'mlx::core::operator&gt;(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a692ce931b660415e17f92d18a8e0d446',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#a579bb87b3ede5663d7cd68c7c0f6fb9e',1,'mlx::core::operator&gt;(int64_t lhs, _MLX_Float16 
rhs)'],['../namespacemlx_1_1core.html#af810587a17e692f4eec256d3c3cd27de',1,'mlx::core::operator&gt;(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a50f4177d3ca03a95fc2614e100c7391d',1,'mlx::core::operator&gt;(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3e_3d_32',['operator&gt;=',['../backend_2metal_2kernels_2complex_8h.html#aafbd686c180398c98b33d7643f893a46',1,'operator&gt;=(complex64_t a, complex64_t b):&#160;complex.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a430dd11fbf4c6f39bc1506ab43b2341f',1,'operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a64f6787a96386246f83a8981d274150e',1,'operator&gt;=(_MLX_BFloat16 lhs, float rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1a788f82212afad30e4c2ee40f1c313c',1,'operator&gt;=(float lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ae88617c4a012c5dc12781a349a28c886',1,'operator&gt;=(_MLX_BFloat16 lhs, half rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a467a88531150a4d9d30fce07c49c126e',1,'operator&gt;=(half lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a9e21c5ea9dd724dc2ca8c54ad908f09c',1,'operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2f6286d222e2176bcbdc824c5d598100',1,'operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#abec53064aa96265385ecc57de5fbc74c',1,'operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#ac766839f8f9e4863e8e18418c342c875',1,'operator&gt;=(uint32_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a2807fa6862b0f9689c81199b1e695ed8',1,'operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aee3ae0d0d1f941463b06eca0bf041b2b',1,'operator&gt;=(int64_t lhs, _MLX_BFloat16 
rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a523eda93c809733368e2b45382d2add6',1,'operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs):&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a1f4e90909ac1c7280f4c7d1977c55fb7',1,'operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs):&#160;bf16.h'],['../namespacemlx_1_1steel.html#aa3c95c60cf69603705bb4636de547bcb',1,'mlx::steel::operator&gt;=()'],['../group__ops.html#ga3a41895f25ed083a36994d95fa102546',1,'mlx::core::operator&gt;=(const array &amp;a, const array &amp;b)'],['../group__ops.html#gaf509f2cb3b18963232f20d6c3bd229b2',1,'mlx::core::operator&gt;=(T a, const array &amp;b)'],['../group__ops.html#gafa0eb25d5978674bfc9e59d4145ec590',1,'mlx::core::operator&gt;=(const array &amp;a, T b)'],['../namespacemlx_1_1core.html#a8494764f5c686743ede66dc76d85d955',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a019df48807b506d9995856684bf7797a',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, float rhs)'],['../namespacemlx_1_1core.html#a96ab6405430efb887cdb5c828cb67d6e',1,'mlx::core::operator&gt;=(float lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ac18be72269b1bcfb0249cc00a0600681',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, double rhs)'],['../namespacemlx_1_1core.html#aeb879815228efbd2c8f80986e1c8d41f',1,'mlx::core::operator&gt;=(double lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0051156f6a568f58cd54850f746fb507',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae93556906e115625ed1b62d36cf21b70',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#ab81ad16e3be591dfc9e42ac3c19b055f',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a6cfe9b03e7c5f1eb9374208a552c3cc9',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_BFloat16 
rhs)'],['../namespacemlx_1_1core.html#a2f5add83812fb137dd9226c6c01e45d5',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, int64_t rhs)'],['../namespacemlx_1_1core.html#ad1014a836e7ce9301de8588eef1e89ee',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a17791561434dc995de9f268d145c0ed1',1,'mlx::core::operator&gt;=(_MLX_BFloat16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a3755925b24a903045937464be117de2f',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a6262aeb513d27fc8313293b261e72abb',1,'mlx::core::operator&gt;=(const complex64_t &amp;a, const complex64_t &amp;b)'],['../namespacemlx_1_1core.html#a6feb4b3ea511b0eda4d1ec9725f3fb4c',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a03b3f7fcb755ec075985ab26336926f0',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, float rhs)'],['../namespacemlx_1_1core.html#aecfbf5ef4872ae447eb4a374e4db28e4',1,'mlx::core::operator&gt;=(float lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ae4690f349b2483f5d1a4b75aba67399f',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, double rhs)'],['../namespacemlx_1_1core.html#a667e95146dd5199e67bcb121b984b1f0',1,'mlx::core::operator&gt;=(double lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a3375f1562f148bdc07451f2b6e54e6df',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int32_t rhs)'],['../namespacemlx_1_1core.html#ae83df12368cb07ccb1c10c1117ff3922',1,'mlx::core::operator&gt;=(int32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#ad41251938cf852b5560c1180944ebb49',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint32_t rhs)'],['../namespacemlx_1_1core.html#a4ddb5ef0b88929086f9b09729fda0dde',1,'mlx::core::operator&gt;=(uint32_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a0908a61ab261aff726922b33fa6ed159',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, int64_t 
rhs)'],['../namespacemlx_1_1core.html#a0fdadf87edd8a0a57c63953fb0ebe053',1,'mlx::core::operator&gt;=(int64_t lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a47c82778e43032c0bbf5d59407e81dc9',1,'mlx::core::operator&gt;=(_MLX_Float16 lhs, uint64_t rhs)'],['../namespacemlx_1_1core.html#a14e6c43b924eacca1b2dac1d5d00ca2b',1,'mlx::core::operator&gt;=(uint64_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_3e_3e_33',['operator&gt;&gt;',['../group__ops.html#ga498b61f7e8f056ae00297fa0dc17303a',1,'mlx::core']]],
+  ['operator_5b_5d_34',['operator[]',['../classpocketfft_1_1detail_1_1arr.html#aea0bd899b19e03f54dfd6c188727061a',1,'pocketfft::detail::arr::operator[](size_t idx)'],['../classpocketfft_1_1detail_1_1arr.html#a99c54f96bc79c7cdd8925c1663462842',1,'pocketfft::detail::arr::operator[](size_t idx) const'],['../classpocketfft_1_1detail_1_1sincos__2pibyn.html#a71b02f67c47b24adb296eafd2c7a3598',1,'pocketfft::detail::sincos_2pibyn::operator[]()'],['../classpocketfft_1_1detail_1_1cndarr.html#ae4852d1fe936a5d61832b507816c7054',1,'pocketfft::detail::cndarr::operator[]()'],['../classpocketfft_1_1detail_1_1ndarr.html#a2b2c4e205e8b5c32c9fe55dfd7b8c8d8',1,'pocketfft::detail::ndarr::operator[]()']]],
+  ['operator_5e_35',['operator^',['../group__ops.html#gac3a6fe18694e84b3d63458e9553ac181',1,'mlx::core::operator^(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#ae36ea40b8477bfa12d41aae8245225c9',1,'mlx::core::operator^(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a03fc96696f5c6d9411841889d05f4670',1,'mlx::core::operator^(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a55130edf926366db0d6207989e609b7c',1,'mlx::core::operator^(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a0b75198f364d742a1c25dd13e398f2c2',1,'mlx::core::operator^(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a7f205f1b10b23180a23bf2be4bb726b1',1,'mlx::core::operator^(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a9edfe65f3c6da583c7b109290ec94b22',1,'mlx::core::operator^(uint16_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_5e_3d_36',['operator^=',['../namespacemlx_1_1core.html#a97cb7d3eac404a442e84656cefe7cfb4',1,'mlx::core::operator^=(_MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#abcfd2d9615c96561fd44dfb9c341cf8e',1,'mlx::core::operator^=(_MLX_BFloat16 &amp;lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#ae78083d766b9cf6f87cded341bbcd63e',1,'mlx::core::operator^=(_MLX_Float16 &amp;lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#acf36c10779fbf1efbe1e6a7fd41176cd',1,'mlx::core::operator^=(_MLX_Float16 &amp;lhs, uint16_t rhs)']]],
+  ['operator_7c_37',['operator|',['../group__ops.html#ga52392a2a98f09a80da8d338c4908bd02',1,'mlx::core::operator|(const array &amp;a, const array &amp;b)'],['../namespacemlx_1_1core.html#af84ed854132c1514dca5a524fdb7ed05',1,'mlx::core::operator|(_MLX_BFloat16 lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a7423aac70f9f2e3fb6a5c9a3fc96f703',1,'mlx::core::operator|(_MLX_BFloat16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a19805f505cb7ac72bfab66c339ea7900',1,'mlx::core::operator|(uint16_t lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a2d933573edf4ed305fddd8a0caef1ee8',1,'mlx::core::operator|(_MLX_Float16 lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#afab3d4eb1b36a276922879ce6e44b7f5',1,'mlx::core::operator|(_MLX_Float16 lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#ab132729fa6912d22a8e402057eb4ba12',1,'mlx::core::operator|(uint16_t lhs, _MLX_Float16 rhs)']]],
+  ['operator_7c_3d_38',['operator|=',['../namespacemlx_1_1core.html#a8e1d21375ae4b89b3cbea3a46d262abd',1,'mlx::core::operator|=(_MLX_BFloat16 &amp;lhs, _MLX_BFloat16 rhs)'],['../namespacemlx_1_1core.html#a28d6c2f89e73b7b874dd1f67f853a96f',1,'mlx::core::operator|=(_MLX_BFloat16 &amp;lhs, uint16_t rhs)'],['../namespacemlx_1_1core.html#a2d8470b69cbbeefece08d3ffd46c0082',1,'mlx::core::operator|=(_MLX_Float16 &amp;lhs, _MLX_Float16 rhs)'],['../namespacemlx_1_1core.html#a359c6257097a304c00d41d64296ef4c9',1,'mlx::core::operator|=(_MLX_Float16 &amp;lhs, uint16_t rhs)']]],
+  ['operator_7c_7c_39',['operator||',['../namespacemlx_1_1steel.html#a1bb3ac5061a04e407fc4cdcc9f6ea03f',1,'mlx::steel::operator||()'],['../group__ops.html#ga27af56a98270d4d76d139f0f9171b83a',1,'mlx::core::operator||()']]],
+  ['out_5fof_5fbounds_40',['out_of_bounds',['../struct_read_writer.html#a08e10626fbc789b6dff9172fd6c36f7c',1,'ReadWriter::out_of_bounds() const'],['../struct_read_writer.html#a6f946aea5452109dca7fc70ed39c6efe',1,'ReadWriter::out_of_bounds() const'],['../struct_read_writer.html#a8f40d7f343d32134fe27a694abfde6bf',1,'ReadWriter::out_of_bounds() const']]],
+  ['outer_41',['outer',['../group__ops.html#ga866af24e10db2797e1c5a5986dbf6c0d',1,'mlx::core']]],
+  ['output_5fshapes_42',['output_shapes',['../classmlx_1_1core_1_1_primitive.html#a8849dc20991398f6f9a24d6785673853',1,'mlx::core::Primitive::output_shapes()'],['../classmlx_1_1core_1_1_abs.html#ab6a2b147f58c83439ecefb9189c2da32',1,'mlx::core::Abs::output_shapes()'],['../classmlx_1_1core_1_1_add.html#a9884fece6ca4061a65241c985fcf1594',1,'mlx::core::Add::output_shapes()'],['../classmlx_1_1core_1_1_arc_cos.html#a8ecd5b9a8cc9cba841768a5b2b497974',1,'mlx::core::ArcCos::output_shapes()'],['../classmlx_1_1core_1_1_arc_cosh.html#ae5d6660121f7f5a55824b95e7fd3dc6b',1,'mlx::core::ArcCosh::output_shapes()'],['../classmlx_1_1core_1_1_arc_sin.html#a1c6e478804eb5d171e4859b872db29f5',1,'mlx::core::ArcSin::output_shapes()'],['../classmlx_1_1core_1_1_arc_sinh.html#a6e0319a3cee5f6b9d43a3ac256b2c2ed',1,'mlx::core::ArcSinh::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan.html#aa5d1e60d50ffa77b1d0e14af8d7e127a',1,'mlx::core::ArcTan::output_shapes()'],['../classmlx_1_1core_1_1_arc_tan2.html#a3f4ad806a0c59c2d1ae1c55c9889bf03',1,'mlx::core::ArcTan2::output_shapes()'],['../classmlx_1_1core_1_1_arc_tanh.html#a30246c56e1d75638c3696f84323861d8',1,'mlx::core::ArcTanh::output_shapes()'],['../classmlx_1_1core_1_1_arg_partition.html#a28608aee76a2db25f6455da561526c64',1,'mlx::core::ArgPartition::output_shapes()'],['../classmlx_1_1core_1_1_arg_reduce.html#a40a047cb3ed8d1445d42100b3fd85179',1,'mlx::core::ArgReduce::output_shapes()'],['../classmlx_1_1core_1_1_arg_sort.html#ac50e0b76c457aae944425b3a57c33859',1,'mlx::core::ArgSort::output_shapes()'],['../classmlx_1_1core_1_1_as_type.html#a18922e68006b5cf005355f5c9ac57ac4',1,'mlx::core::AsType::output_shapes()'],['../classmlx_1_1core_1_1_bitwise_binary.html#a7d2dfa8884832fc1a94ce6400d0ed599',1,'mlx::core::BitwiseBinary::output_shapes()'],['../classmlx_1_1core_1_1_ceil.html#a1eb263c04df810e212855a17af0658ea',1,'mlx::core::Ceil::output_shapes()'],['../classmlx_1_1core_1_1_compiled.html#a453a10c68b7825def5b53207bc04a71c',1,'mlx::core::Compiled::output_
shapes()'],['../classmlx_1_1core_1_1_conjugate.html#ada40413e9f210251476a37cc0d0ea37f',1,'mlx::core::Conjugate::output_shapes()'],['../classmlx_1_1core_1_1_contiguous.html#a1a53623d7c591ba6567ac1533fbc2b7c',1,'mlx::core::Contiguous::output_shapes()'],['../classmlx_1_1core_1_1_copy.html#ac3d13ebc6464403962fa1a9897fe6df3',1,'mlx::core::Copy::output_shapes()'],['../classmlx_1_1core_1_1_cos.html#a05b2d43942aa1d93a40c20ae8b90a25b',1,'mlx::core::Cos::output_shapes()'],['../classmlx_1_1core_1_1_cosh.html#a1bef7feac9a387ea80e7fc774f579962',1,'mlx::core::Cosh::output_shapes()'],['../classmlx_1_1core_1_1_divide.html#ad514bed77ad94742e26c93e446940994',1,'mlx::core::Divide::output_shapes()'],['../classmlx_1_1core_1_1_div_mod.html#a61d835d777c8063089dc708898ff314b',1,'mlx::core::DivMod::output_shapes()'],['../classmlx_1_1core_1_1_select.html#a295cd22e2284f2216bc93fdcf0b54867',1,'mlx::core::Select::output_shapes()'],['../classmlx_1_1core_1_1_remainder.html#a74bf3a9723b59200573ff8bac9a0b666',1,'mlx::core::Remainder::output_shapes()'],['../classmlx_1_1core_1_1_equal.html#a2f91e9603f63ad539837356b1ff3e7a9',1,'mlx::core::Equal::output_shapes()'],['../classmlx_1_1core_1_1_erf.html#afa4abbd7786b474c44c336a95481d187',1,'mlx::core::Erf::output_shapes()'],['../classmlx_1_1core_1_1_erf_inv.html#a22a95594e68b43b50c05355c82779639',1,'mlx::core::ErfInv::output_shapes()'],['../classmlx_1_1core_1_1_exp.html#aee7ba8d5be4a11f4b8f359b0338ab670',1,'mlx::core::Exp::output_shapes()'],['../classmlx_1_1core_1_1_expm1.html#ab9dbf34806eb43b928722ed9e8feed08',1,'mlx::core::Expm1::output_shapes()'],['../classmlx_1_1core_1_1_floor.html#aaf86becc7bfba6ee2af0d1f6d8e25015',1,'mlx::core::Floor::output_shapes()'],['../classmlx_1_1core_1_1_greater.html#ab2167a38c3baff99f527f17eb4c71d46',1,'mlx::core::Greater::output_shapes()'],['../classmlx_1_1core_1_1_greater_equal.html#a636a9cc00b0333e49978f39814af640f',1,'mlx::core::GreaterEqual::output_shapes()'],['../classmlx_1_1core_1_1_hadamard.html#a458614bc7820ae56493eb5
6d813b2cde',1,'mlx::core::Hadamard::output_shapes()'],['../classmlx_1_1core_1_1_imag.html#ab396ef74748abd3d4121ffee33a08d48',1,'mlx::core::Imag::output_shapes()'],['../classmlx_1_1core_1_1_less.html#a5e8b56574ccb91c065548f4bda40e278',1,'mlx::core::Less::output_shapes()'],['../classmlx_1_1core_1_1_less_equal.html#a2e259f3de11f97f3bd38a2e65667d78f',1,'mlx::core::LessEqual::output_shapes()'],['../classmlx_1_1core_1_1_log.html#a113dcc95e2a1a052238b1f5c8935a63d',1,'mlx::core::Log::output_shapes()'],['../classmlx_1_1core_1_1_log1p.html#aebf8f5b6670f55fa24283a934f4b25df',1,'mlx::core::Log1p::output_shapes()'],['../classmlx_1_1core_1_1_logical_not.html#a4a40511a052a6627085be378bbebe69c',1,'mlx::core::LogicalNot::output_shapes()'],['../classmlx_1_1core_1_1_logical_and.html#a191d69d92c01ed5ad82d4688f1de2617',1,'mlx::core::LogicalAnd::output_shapes()'],['../classmlx_1_1core_1_1_logical_or.html#a26259843be2de75d5e07cb7ea94fcfe4',1,'mlx::core::LogicalOr::output_shapes()'],['../classmlx_1_1core_1_1_log_add_exp.html#ac35cf432ecdd141d957b55fc4bff6635',1,'mlx::core::LogAddExp::output_shapes()'],['../classmlx_1_1core_1_1_maximum.html#a7bb80360ba4b74d0b0f3f74a5ff90d1b',1,'mlx::core::Maximum::output_shapes()'],['../classmlx_1_1core_1_1_minimum.html#ab4a85741dffaa64d8ead028f11539d70',1,'mlx::core::Minimum::output_shapes()'],['../classmlx_1_1core_1_1_multiply.html#a072de3911113247c95c28d3b52400061',1,'mlx::core::Multiply::output_shapes()'],['../classmlx_1_1core_1_1_negative.html#a253c08c7461bf2dce05f555c8dbf0014',1,'mlx::core::Negative::output_shapes()'],['../classmlx_1_1core_1_1_not_equal.html#a5b10e99bc564197e7b16dccb0577d89a',1,'mlx::core::NotEqual::output_shapes()'],['../classmlx_1_1core_1_1_number_of_elements.html#aae36bb1e125c0a2d7cd54e78be0f2af8',1,'mlx::core::NumberOfElements::output_shapes()'],['../classmlx_1_1core_1_1_partition.html#ae5b792df683bc14dde89f75ac6bcbeaf',1,'mlx::core::Partition::output_shapes()'],['../classmlx_1_1core_1_1_power.html#a1c17867ea1bad8899adb38185c9423c
1',1,'mlx::core::Power::output_shapes()'],['../classmlx_1_1core_1_1_real.html#a75d7b85e68a7a03ec911c7acc09ddde5',1,'mlx::core::Real::output_shapes()'],['../classmlx_1_1core_1_1_reduce.html#a0f73c2a55dc324145e11020c9b4d9a65',1,'mlx::core::Reduce::output_shapes()'],['../classmlx_1_1core_1_1_round.html#ad9a26817864dfc94b56e66bc6d80b047',1,'mlx::core::Round::output_shapes()'],['../classmlx_1_1core_1_1_sigmoid.html#a34572023c8748971289c2cb109ff9a43',1,'mlx::core::Sigmoid::output_shapes()'],['../classmlx_1_1core_1_1_sign.html#a719709b3c5d6b15a75614bdadd185f67',1,'mlx::core::Sign::output_shapes()'],['../classmlx_1_1core_1_1_sin.html#a46f059f04fd540f175f6031d28dc9f3a',1,'mlx::core::Sin::output_shapes()'],['../classmlx_1_1core_1_1_sinh.html#a4f10e7e6daf500575d97e077901e7d28',1,'mlx::core::Sinh::output_shapes()'],['../classmlx_1_1core_1_1_softmax.html#afea757ba328b9d8f35058793eae73e35',1,'mlx::core::Softmax::output_shapes()'],['../classmlx_1_1core_1_1_sort.html#a271545b66607b22e5f06a0fefe69f22d',1,'mlx::core::Sort::output_shapes()'],['../classmlx_1_1core_1_1_square.html#ac4c4927639cab1c5b91a074e7f68da02',1,'mlx::core::Square::output_shapes()'],['../classmlx_1_1core_1_1_sqrt.html#ae3d4f99729a7e72be7decf5a56d095d5',1,'mlx::core::Sqrt::output_shapes()'],['../classmlx_1_1core_1_1_stop_gradient.html#a12e7f55e087aea58b2a56f239c69bb4e',1,'mlx::core::StopGradient::output_shapes()'],['../classmlx_1_1core_1_1_subtract.html#a0fbf4bc9a0c76edc37ebb4083d98f3fc',1,'mlx::core::Subtract::output_shapes()'],['../classmlx_1_1core_1_1_tan.html#a7be9fd77491a48b07b6e126ab68bdf37',1,'mlx::core::Tan::output_shapes()'],['../classmlx_1_1core_1_1_tanh.html#a0392f51a9e51915d4691615757ba4325',1,'mlx::core::Tanh::output_shapes()'],['../classmlx_1_1core_1_1_eigh.html#a68c890a4172711fbab8baef8da40a9c5',1,'mlx::core::Eigh::output_shapes()']]],
+  ['outputs_43',['outputs',['../classmlx_1_1core_1_1array.html#a2c186fd527f984f0589d4183b4976289',1,'mlx::core::array::outputs()'],['../structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f',1,'mlx::core::metal::CommandEncoder::outputs()']]],
+  ['overwrite_5fdescriptor_44',['overwrite_descriptor',['../classmlx_1_1core_1_1array.html#a95e6b156c8e05439f076b85c05079387',1,'mlx::core::array']]]
 ];
diff --git a/docs/build/html/search/namespaces_0.js b/docs/build/html/search/namespaces_0.js
index b24736856..8c537988d 100644
--- a/docs/build/html/search/namespaces_0.js
+++ b/docs/build/html/search/namespaces_0.js
@@ -9,12 +9,13 @@ var searchData=
   ['mlx_3a_3acore_3a_3adetail_6',['detail',['../namespacemlx_1_1core_1_1detail.html',1,'mlx::core']]],
   ['mlx_3a_3acore_3a_3adistributed_7',['distributed',['../namespacemlx_1_1core_1_1distributed.html',1,'mlx::core']]],
   ['mlx_3a_3acore_3a_3adistributed_3a_3adetail_8',['detail',['../namespacemlx_1_1core_1_1distributed_1_1detail.html',1,'mlx::core::distributed']]],
-  ['mlx_3a_3acore_3a_3afast_9',['fast',['../namespacemlx_1_1core_1_1fast.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3afft_10',['fft',['../namespacemlx_1_1core_1_1fft.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3aio_11',['io',['../namespacemlx_1_1core_1_1io.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3alinalg_12',['linalg',['../namespacemlx_1_1core_1_1linalg.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3ametal_13',['metal',['../namespacemlx_1_1core_1_1metal.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3arandom_14',['random',['../namespacemlx_1_1core_1_1random.html',1,'mlx::core']]],
-  ['mlx_3a_3acore_3a_3ascheduler_15',['scheduler',['../namespacemlx_1_1core_1_1scheduler.html',1,'mlx::core']]],
-  ['mlx_3a_3asteel_16',['steel',['../namespacemlx_1_1steel.html',1,'mlx']]]
+  ['mlx_3a_3acore_3a_3aenv_9',['env',['../namespacemlx_1_1core_1_1env.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3afast_10',['fast',['../namespacemlx_1_1core_1_1fast.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3afft_11',['fft',['../namespacemlx_1_1core_1_1fft.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3aio_12',['io',['../namespacemlx_1_1core_1_1io.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3alinalg_13',['linalg',['../namespacemlx_1_1core_1_1linalg.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3ametal_14',['metal',['../namespacemlx_1_1core_1_1metal.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3arandom_15',['random',['../namespacemlx_1_1core_1_1random.html',1,'mlx::core']]],
+  ['mlx_3a_3acore_3a_3ascheduler_16',['scheduler',['../namespacemlx_1_1core_1_1scheduler.html',1,'mlx::core']]],
+  ['mlx_3a_3asteel_17',['steel',['../namespacemlx_1_1steel.html',1,'mlx']]]
 ];
diff --git a/docs/build/html/search/searchdata.js b/docs/build/html/search/searchdata.js
index 454a00fcb..89f749d22 100644
--- a/docs/build/html/search/searchdata.js
+++ b/docs/build/html/search/searchdata.js
@@ -6,11 +6,11 @@ var indexSectionsWithContent =
   3: "abcdefghiklmopqrstu",
   4: "_abcdefghijklmnopqrstuvwz~",
   5: "abcdefghijklmnopqrstuvwxz",
-  6: "abdefgilmnprstv",
+  6: "abcdefgilmnprstv",
   7: "bcdkorsv",
   8: "abcdefgilmnoprstuvx",
   9: "ao",
-  10: "_abcdfhimprsu",
+  10: "_abcdfhijmprs",
   11: "aco"
 };
 
diff --git a/docs/build/html/search/typedefs_0.js b/docs/build/html/search/typedefs_0.js
index 55f2dcb7d..085c54578 100644
--- a/docs/build/html/search/typedefs_0.js
+++ b/docs/build/html/search/typedefs_0.js
@@ -1,5 +1,5 @@
 var searchData=
 [
-  ['accum_5ftype_0',['accum_type',['../structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da',1,'mlx::steel::AccumHelper']]],
+  ['accum_5ftype_0',['accum_type',['../structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26',1,'mlx::steel::AccumHelper']]],
   ['add_5fvec_5ft_1',['add_vec_t',['../namespacepocketfft_1_1detail.html#a421aa74fbee775a96463246f72b144d6',1,'pocketfft::detail']]]
 ];
diff --git a/docs/build/html/search/typedefs_1.js b/docs/build/html/search/typedefs_1.js
index f95e553f0..98974c15a 100644
--- a/docs/build/html/search/typedefs_1.js
+++ b/docs/build/html/search/typedefs_1.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['bfloat16_5ft_0',['bfloat16_t',['../backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82',1,'bf16.h']]],
+  ['bfloat16_5ft_0',['bfloat16_t',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82',1,'bfloat16_t:&#160;bf16.h'],['../backend_2metal_2kernels_2metal__3__1_2bf16_8h.html#a58e15a77da988b9104fee00cdf8b280e',1,'bfloat16_t:&#160;bf16.h']]],
   ['block_5fmerge_5fsort_5ft_1',['block_merge_sort_t',['../struct_kernel_merge_sort.html#adae7850e057fc30d5328c7b3dcc998fa',1,'KernelMergeSort::block_merge_sort_t'],['../struct_kernel_multi_block_merge_sort.html#af27e9af4b58640c0aa620bc4efc68dff',1,'KernelMultiBlockMergeSort::block_merge_sort_t']]],
   ['bool_5fconstant_2',['bool_constant',['../namespacemlx_1_1steel.html#adbb34bcf0d2dca6b9fb803d591d00da9',1,'mlx::steel']]]
 ];
diff --git a/docs/build/html/search/typedefs_2.js b/docs/build/html/search/typedefs_2.js
index 8bf519014..ed86072e8 100644
--- a/docs/build/html/search/typedefs_2.js
+++ b/docs/build/html/search/typedefs_2.js
@@ -1,5 +1,4 @@
 var searchData=
 [
-  ['deleter_5ft_0',['deleter_t',['../namespacemlx_1_1core.html#a1e6cec03ebd80fd2d6b12b288367bfa8',1,'mlx::core']]],
-  ['difference_5ftype_1',['difference_type',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#adcee44c77980fc2370a2c31e203aead5',1,'mlx::core::array::ArrayIterator']]]
+  ['col_5ffrag_5ftype_0',['col_frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]]
 ];
diff --git a/docs/build/html/search/typedefs_3.js b/docs/build/html/search/typedefs_3.js
index 60de4b6f5..8bf519014 100644
--- a/docs/build/html/search/typedefs_3.js
+++ b/docs/build/html/search/typedefs_3.js
@@ -1,5 +1,5 @@
 var searchData=
 [
-  ['elem_5ftype_0',['elem_type',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c',1,'mlx::steel::MMATile']]],
-  ['enable_5ffor_5farrays_5ft_1',['enable_for_arrays_t',['../namespacemlx_1_1core.html#af89751d79339f3e4d9318ea97d64d114',1,'mlx::core']]]
+  ['deleter_5ft_0',['deleter_t',['../namespacemlx_1_1core.html#a1e6cec03ebd80fd2d6b12b288367bfa8',1,'mlx::core']]],
+  ['difference_5ftype_1',['difference_type',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#adcee44c77980fc2370a2c31e203aead5',1,'mlx::core::array::ArrayIterator']]]
 ];
diff --git a/docs/build/html/search/typedefs_4.js b/docs/build/html/search/typedefs_4.js
index b89c3c5a3..e1ab7d1f1 100644
--- a/docs/build/html/search/typedefs_4.js
+++ b/docs/build/html/search/typedefs_4.js
@@ -1,6 +1,5 @@
 var searchData=
 [
-  ['false_5ftype_0',['false_type',['../namespacemlx_1_1steel.html#ab0ef721cedc2b5a97f60d76b765aff2e',1,'mlx::steel']]],
-  ['float16_5ft_1',['float16_t',['../backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773',1,'float16_t:&#160;utils.h'],['../namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52',1,'mlx::core::float16_t']]],
-  ['frag_5ftype_2',['frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::frag_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef',1,'mlx::steel::MMATile::frag_type']]]
+  ['elem_5ftype_0',['elem_type',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628',1,'mlx::steel::MMATile']]],
+  ['enable_5ffor_5farrays_5ft_1',['enable_for_arrays_t',['../namespacemlx_1_1core.html#af89751d79339f3e4d9318ea97d64d114',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/typedefs_5.js b/docs/build/html/search/typedefs_5.js
index e5f6b91fb..68428d4ab 100644
--- a/docs/build/html/search/typedefs_5.js
+++ b/docs/build/html/search/typedefs_5.js
@@ -1,5 +1,6 @@
 var searchData=
 [
-  ['ggufload_0',['GGUFLoad',['../namespacemlx_1_1core.html#aa5b0f7f13a941e1f41c411194e9033c7',1,'mlx::core']]],
-  ['ggufmetadata_1',['GGUFMetaData',['../namespacemlx_1_1core.html#a8c2c1b9a37aadfb48f4c3a7e806e32e3',1,'mlx::core']]]
+  ['false_5ftype_0',['false_type',['../namespacemlx_1_1steel.html#ab0ef721cedc2b5a97f60d76b765aff2e',1,'mlx::steel']]],
+  ['float16_5ft_1',['float16_t',['../backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773',1,'float16_t:&#160;utils.h'],['../namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52',1,'mlx::core::float16_t']]],
+  ['frag_5ftype_2',['frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::frag_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171',1,'mlx::steel::MMATile::frag_type']]]
 ];
diff --git a/docs/build/html/search/typedefs_6.js b/docs/build/html/search/typedefs_6.js
index 80c07e679..e5f6b91fb 100644
--- a/docs/build/html/search/typedefs_6.js
+++ b/docs/build/html/search/typedefs_6.js
@@ -1,6 +1,5 @@
 var searchData=
 [
-  ['idx_5ft_0',['idx_t',['../struct_kernel_merge_sort.html#a0df65b709ae7f153a2bf381179d55e00',1,'KernelMergeSort']]],
-  ['int_1',['Int',['../namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891',1,'mlx::steel']]],
-  ['iterator_5fcategory_2',['iterator_category',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a2cbf481e39164245668b3be6cbcc614d',1,'mlx::core::array::ArrayIterator']]]
+  ['ggufload_0',['GGUFLoad',['../namespacemlx_1_1core.html#aa5b0f7f13a941e1f41c411194e9033c7',1,'mlx::core']]],
+  ['ggufmetadata_1',['GGUFMetaData',['../namespacemlx_1_1core.html#a8c2c1b9a37aadfb48f4c3a7e806e32e3',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/typedefs_7.js b/docs/build/html/search/typedefs_7.js
index d546c2d24..80c07e679 100644
--- a/docs/build/html/search/typedefs_7.js
+++ b/docs/build/html/search/typedefs_7.js
@@ -1,5 +1,6 @@
 var searchData=
 [
-  ['loader_5fa_5ft_0',['loader_a_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa8a04ed74d2259f99b337d4662c64d83',1,'mlx::steel::GEMMKernel']]],
-  ['loader_5fb_5ft_1',['loader_b_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa98f32278b5fd98c93ae5483c3596395',1,'mlx::steel::GEMMKernel']]]
+  ['idx_5ft_0',['idx_t',['../struct_kernel_merge_sort.html#a0df65b709ae7f153a2bf381179d55e00',1,'KernelMergeSort']]],
+  ['int_1',['Int',['../namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891',1,'mlx::steel']]],
+  ['iterator_5fcategory_2',['iterator_category',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a2cbf481e39164245668b3be6cbcc614d',1,'mlx::core::array::ArrayIterator']]]
 ];
diff --git a/docs/build/html/search/typedefs_8.js b/docs/build/html/search/typedefs_8.js
index 9f1e64bc3..507c562e7 100644
--- a/docs/build/html/search/typedefs_8.js
+++ b/docs/build/html/search/typedefs_8.js
@@ -1,10 +1,5 @@
 var searchData=
 [
-  ['mask_5ft_0',['mask_t',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a270ab3da7c98a12525a59952742cc97d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
-  ['mat_5ftype_1',['mat_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mat_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190',1,'mlx::steel::MMATile::mat_type']]],
-  ['metalkernelfunction_2',['MetalKernelFunction',['../namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0',1,'mlx::core::fast']]],
-  ['mma_5ft_3',['mma_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#add8c6a31011a4895667c2a94a5af3782',1,'mlx::steel::GEMMKernel']]],
-  ['mmafrag_5facc_5ft_4',['MMAFrag_acc_t',['../structmlx_1_1steel_1_1_block_m_m_a.html#ae2c42cb6d0dde785859164c195f4d13c',1,'mlx::steel::BlockMMA']]],
-  ['mmafrag_5ft_5',['MMAFrag_t',['../structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382',1,'mlx::steel::MMATile']]],
-  ['mtlfclist_6',['MTLFCList',['../namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54',1,'mlx::core::metal']]]
+  ['loader_5fa_5ft_0',['loader_a_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a98b6ec692580510081e2aa887a61944b',1,'mlx::steel::GEMMKernel']]],
+  ['loader_5fb_5ft_1',['loader_b_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1a115d5af0fb6e260165adba2e377635',1,'mlx::steel::GEMMKernel']]]
 ];
diff --git a/docs/build/html/search/typedefs_9.js b/docs/build/html/search/typedefs_9.js
index 335d411ea..412c684b5 100644
--- a/docs/build/html/search/typedefs_9.js
+++ b/docs/build/html/search/typedefs_9.js
@@ -1,4 +1,10 @@
 var searchData=
 [
-  ['nomask_5ft_0',['nomask_t',['../kernels_2gemv__masked_8h.html#a1480c8cdff1cae1462a5a71632969bca',1,'gemv_masked.h']]]
+  ['mask_5ft_0',['mask_t',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a270ab3da7c98a12525a59952742cc97d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
+  ['mat_5ftype_1',['mat_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::mat_type'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a',1,'mlx::steel::MMATile::mat_type']]],
+  ['metalkernelfunction_2',['MetalKernelFunction',['../namespacemlx_1_1core_1_1fast.html#a0e8c2c4ea7a946568c8fe5b4810417e0',1,'mlx::core::fast']]],
+  ['mma_5ft_3',['mma_t',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ae52eb09c9478cd4f199662346ac0c83e',1,'mlx::steel::GEMMKernel']]],
+  ['mmafrag_5facc_5ft_4',['MMAFrag_acc_t',['../structmlx_1_1steel_1_1_block_m_m_a.html#a8231b0e3475077c1381eb8f5daf62e35',1,'mlx::steel::BlockMMA']]],
+  ['mmafrag_5ft_5',['MMAFrag_t',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4',1,'mlx::steel::MMATile']]],
+  ['mtlfclist_6',['MTLFCList',['../namespacemlx_1_1core_1_1metal.html#a616e09a1ef321d527770721cef264c54',1,'mlx::core::metal']]]
 ];
diff --git a/docs/build/html/search/typedefs_a.js b/docs/build/html/search/typedefs_a.js
index d34992426..335d411ea 100644
--- a/docs/build/html/search/typedefs_a.js
+++ b/docs/build/html/search/typedefs_a.js
@@ -1,4 +1,4 @@
 var searchData=
 [
-  ['pointer_5felement_5ft_0',['pointer_element_t',['../namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9',1,'metal']]]
+  ['nomask_5ft_0',['nomask_t',['../kernels_2gemv__masked_8h.html#a1480c8cdff1cae1462a5a71632969bca',1,'gemv_masked.h']]]
 ];
diff --git a/docs/build/html/search/typedefs_b.js b/docs/build/html/search/typedefs_b.js
index ba03c67d8..d34992426 100644
--- a/docs/build/html/search/typedefs_b.js
+++ b/docs/build/html/search/typedefs_b.js
@@ -1,5 +1,4 @@
 var searchData=
 [
-  ['radixfunc_0',['RadixFunc',['../backend_2metal_2kernels_2fft_8h.html#a6ba62eabfd5428644aabf89ddaa0128d',1,'fft.h']]],
-  ['reference_1',['reference',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a44e2e1f29191c20ec4390de4fa0bd59f',1,'mlx::core::array::ArrayIterator']]]
+  ['pointer_5felement_5ft_0',['pointer_element_t',['../namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9',1,'metal']]]
 ];
diff --git a/docs/build/html/search/typedefs_c.js b/docs/build/html/search/typedefs_c.js
index d427dcde6..c7ffdc21a 100644
--- a/docs/build/html/search/typedefs_c.js
+++ b/docs/build/html/search/typedefs_c.js
@@ -1,8 +1,6 @@
 var searchData=
 [
-  ['safetensorsload_0',['SafetensorsLoad',['../namespacemlx_1_1core.html#a688cd7917b1365065e8059e9964c3d45',1,'mlx::core']]],
-  ['shape_5ft_1',['shape_t',['../namespacepocketfft_1_1detail.html#a885ee37fcf564a268a5c8ca9ea8603e1',1,'pocketfft::detail']]],
-  ['simplevalueandgradfn_2',['SimpleValueAndGradFn',['../namespacemlx_1_1core.html#a2689b8f1181648cb1685204fea9f3066',1,'mlx::core']]],
-  ['streamordevice_3',['StreamOrDevice',['../namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58',1,'mlx::core']]],
-  ['stride_5ft_4',['stride_t',['../namespacepocketfft_1_1detail.html#afb987c919e9424a996d0fc8b3c23cc84',1,'pocketfft::detail']]]
+  ['radixfunc_0',['RadixFunc',['../backend_2metal_2kernels_2fft_8h.html#a6ba62eabfd5428644aabf89ddaa0128d',1,'fft.h']]],
+  ['reference_1',['reference',['../structmlx_1_1core_1_1array_1_1_array_iterator.html#a44e2e1f29191c20ec4390de4fa0bd59f',1,'mlx::core::array::ArrayIterator']]],
+  ['row_5ffrag_5ftype_2',['row_frag_type',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]]
 ];
diff --git a/docs/build/html/search/typedefs_d.js b/docs/build/html/search/typedefs_d.js
index fd5deb26c..d427dcde6 100644
--- a/docs/build/html/search/typedefs_d.js
+++ b/docs/build/html/search/typedefs_d.js
@@ -1,7 +1,8 @@
 var searchData=
 [
-  ['templatearg_0',['TemplateArg',['../namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9',1,'mlx::core::fast']]],
-  ['thread_5fsort_5ft_1',['thread_sort_t',['../struct_block_merge_sort.html#ad2474d16721f4ceb954125728a0e2ea2',1,'BlockMergeSort']]],
-  ['true_5ftype_2',['true_type',['../namespacemlx_1_1steel.html#a594a6ccb75b38b5ae4ddd0d9ad047b3a',1,'mlx::steel']]],
-  ['type_3',['type',['../structpocketfft_1_1detail_1_1add__vec.html#a7568dc83136c1b41eb71dcb78527227e',1,'pocketfft::detail::add_vec::type'],['../structpocketfft_1_1detail_1_1add__vec_3_01cmplx_3_01_t_01_4_01_4.html#a257b1c81fb9f559c48ee90497013494e',1,'pocketfft::detail::add_vec&lt; cmplx&lt; T &gt; &gt;::type'],['../structmlx_1_1steel_1_1integral__constant.html#a6492c15b37d160d3a33e1cbe770aa3f1',1,'mlx::steel::integral_constant::type'],['../structmetal_1_1make__void.html#aee74916713465374928c5379ab0d9b75',1,'metal::make_void::type'],['../structmetal_1_1pointer__element_3_01thread_01_t_01_5_01_4.html#a98fbc2aa99dd26bb35aa9cd1826318d8',1,'metal::pointer_element&lt; thread T * &gt;::type'],['../structmetal_1_1pointer__element_3_01device_01_t_01_5_01_4.html#ab36a7c5a64c0693dd3d8ccb322c163d4',1,'metal::pointer_element&lt; device T * &gt;::type'],['../structmetal_1_1pointer__element_3_01constant_01_t_01_5_01_4.html#ad154b55b9e450a6376016488c8e68c53',1,'metal::pointer_element&lt; constant T * &gt;::type'],['../structmetal_1_1pointer__element_3_01threadgroup_01_t_01_5_01_4.html#a78c718d6da9d393c139a385f42472362',1,'metal::pointer_element&lt; threadgroup T * &gt;::type']]]
+  ['safetensorsload_0',['SafetensorsLoad',['../namespacemlx_1_1core.html#a688cd7917b1365065e8059e9964c3d45',1,'mlx::core']]],
+  ['shape_5ft_1',['shape_t',['../namespacepocketfft_1_1detail.html#a885ee37fcf564a268a5c8ca9ea8603e1',1,'pocketfft::detail']]],
+  ['simplevalueandgradfn_2',['SimpleValueAndGradFn',['../namespacemlx_1_1core.html#a2689b8f1181648cb1685204fea9f3066',1,'mlx::core']]],
+  ['streamordevice_3',['StreamOrDevice',['../namespacemlx_1_1core.html#a95fc1013cc48fbfee0c54310711a5e58',1,'mlx::core']]],
+  ['stride_5ft_4',['stride_t',['../namespacepocketfft_1_1detail.html#afb987c919e9424a996d0fc8b3c23cc84',1,'pocketfft::detail']]]
 ];
diff --git a/docs/build/html/search/typedefs_e.js b/docs/build/html/search/typedefs_e.js
index cfd1d3816..fd5deb26c 100644
--- a/docs/build/html/search/typedefs_e.js
+++ b/docs/build/html/search/typedefs_e.js
@@ -1,8 +1,7 @@
 var searchData=
 [
-  ['val_5ft_0',['val_t',['../struct_kernel_merge_sort.html#a4e3f09896275956fc4c23e1f157dca3b',1,'KernelMergeSort']]],
-  ['value_5ftype_1',['value_type',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#ab86a2740ed9ce3199135372ff1d88c76',1,'pocketfft::detail::threading::aligned_allocator::value_type'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae24fe304397e961687d0d4c7012b8ae4',1,'mlx::core::array::ArrayIterator::value_type'],['../structmlx_1_1steel_1_1integral__constant.html#a0569cc1334e0bc4f474304b33d365759',1,'mlx::steel::integral_constant::value_type']]],
-  ['valueandgradfn_2',['ValueAndGradFn',['../namespacemlx_1_1core.html#ab79d66ddf1ec38b2f2c01234892a2230',1,'mlx::core']]],
-  ['void_5ft_3',['void_t',['../namespacemetal.html#a192322c772aa8b168d59edc55fb806f1',1,'metal']]],
-  ['vtype_5ft_4',['vtype_t',['../namespacepocketfft_1_1detail.html#a3edfb93aeed2f8258183d463ea291d62',1,'pocketfft::detail']]]
+  ['templatearg_0',['TemplateArg',['../namespacemlx_1_1core_1_1fast.html#a9390693ff7be931f3ef3428e2ea4c3f9',1,'mlx::core::fast']]],
+  ['thread_5fsort_5ft_1',['thread_sort_t',['../struct_block_merge_sort.html#ad2474d16721f4ceb954125728a0e2ea2',1,'BlockMergeSort']]],
+  ['true_5ftype_2',['true_type',['../namespacemlx_1_1steel.html#a594a6ccb75b38b5ae4ddd0d9ad047b3a',1,'mlx::steel']]],
+  ['type_3',['type',['../structpocketfft_1_1detail_1_1add__vec.html#a7568dc83136c1b41eb71dcb78527227e',1,'pocketfft::detail::add_vec::type'],['../structpocketfft_1_1detail_1_1add__vec_3_01cmplx_3_01_t_01_4_01_4.html#a257b1c81fb9f559c48ee90497013494e',1,'pocketfft::detail::add_vec&lt; cmplx&lt; T &gt; &gt;::type'],['../structmlx_1_1steel_1_1integral__constant.html#a6492c15b37d160d3a33e1cbe770aa3f1',1,'mlx::steel::integral_constant::type'],['../structmetal_1_1make__void.html#aee74916713465374928c5379ab0d9b75',1,'metal::make_void::type'],['../structmetal_1_1pointer__element_3_01thread_01_t_01_5_01_4.html#a98fbc2aa99dd26bb35aa9cd1826318d8',1,'metal::pointer_element&lt; thread T * &gt;::type'],['../structmetal_1_1pointer__element_3_01device_01_t_01_5_01_4.html#ab36a7c5a64c0693dd3d8ccb322c163d4',1,'metal::pointer_element&lt; device T * &gt;::type'],['../structmetal_1_1pointer__element_3_01constant_01_t_01_5_01_4.html#ad154b55b9e450a6376016488c8e68c53',1,'metal::pointer_element&lt; constant T * &gt;::type'],['../structmetal_1_1pointer__element_3_01threadgroup_01_t_01_5_01_4.html#a78c718d6da9d393c139a385f42472362',1,'metal::pointer_element&lt; threadgroup T * &gt;::type']]]
 ];
diff --git a/docs/build/html/search/typedefs_f.js b/docs/build/html/search/typedefs_f.js
new file mode 100644
index 000000000..cfd1d3816
--- /dev/null
+++ b/docs/build/html/search/typedefs_f.js
@@ -0,0 +1,8 @@
+var searchData=
+[
+  ['val_5ft_0',['val_t',['../struct_kernel_merge_sort.html#a4e3f09896275956fc4c23e1f157dca3b',1,'KernelMergeSort']]],
+  ['value_5ftype_1',['value_type',['../structpocketfft_1_1detail_1_1threading_1_1aligned__allocator.html#ab86a2740ed9ce3199135372ff1d88c76',1,'pocketfft::detail::threading::aligned_allocator::value_type'],['../structmlx_1_1core_1_1array_1_1_array_iterator.html#ae24fe304397e961687d0d4c7012b8ae4',1,'mlx::core::array::ArrayIterator::value_type'],['../structmlx_1_1steel_1_1integral__constant.html#a0569cc1334e0bc4f474304b33d365759',1,'mlx::steel::integral_constant::value_type']]],
+  ['valueandgradfn_2',['ValueAndGradFn',['../namespacemlx_1_1core.html#ab79d66ddf1ec38b2f2c01234892a2230',1,'mlx::core']]],
+  ['void_5ft_3',['void_t',['../namespacemetal.html#a192322c772aa8b168d59edc55fb806f1',1,'metal']]],
+  ['vtype_5ft_4',['vtype_t',['../namespacepocketfft_1_1detail.html#a3edfb93aeed2f8258183d463ea291d62',1,'pocketfft::detail']]]
+];
diff --git a/docs/build/html/search/variables_0.js b/docs/build/html/search/variables_0.js
index 50c42d58b..475304b92 100644
--- a/docs/build/html/search/variables_0.js
+++ b/docs/build/html/search/variables_0.js
@@ -7,11 +7,12 @@ var searchData=
   ['adj_5fout_5fh_4',['adj_out_h',['../structmlx_1_1steel_1_1_conv2_d_general_jump_params.html#a879cc9757f59605a87d936ec4156040d',1,'mlx::steel::Conv2DGeneralJumpParams']]],
   ['adj_5fout_5fhw_5',['adj_out_hw',['../structmlx_1_1steel_1_1_conv2_d_general_jump_params.html#aed0ffd63fbc85fd5d5c4cc7b43f68363',1,'mlx::steel::Conv2DGeneralJumpParams']]],
   ['adj_5fout_5fw_6',['adj_out_w',['../structmlx_1_1steel_1_1_conv2_d_general_jump_params.html#ab971bf879079895189331fbeaf33c211',1,'mlx::steel::Conv2DGeneralJumpParams']]],
-  ['align_5fk_7',['align_K',['../steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416',1,'steel_gemm_fused.h']]],
+  ['align_5fk_7',['align_K',['../steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416',1,'align_K:&#160;steel_attention.h'],['../steel__gemm__fused_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416',1,'align_K:&#160;steel_gemm_fused.h']]],
   ['align_5fm_8',['align_M',['../steel__gemm__fused_8h.html#a55af226dc74b0026b7d4b865142a6d21',1,'steel_gemm_fused.h']]],
   ['align_5fn_9',['align_N',['../steel__gemm__fused_8h.html#aa3b267252df2dcbfdde8c5f174d27036',1,'steel_gemm_fused.h']]],
-  ['alpha_10',['alpha',['../struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477',1,'MLXFastAttentionParams::alpha'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710',1,'mlx::steel::GEMMAddMMParams::alpha'],['../structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff',1,'mlx::steel::TransformAxpby::alpha']]],
-  ['arange_5fkernels_11',['arange_kernels',['../metal_2jit_2arange_8h.html#a2f49fb7bdc0a90230077fe2023e6e5c0',1,'arange.h']]],
-  ['as_5foffset_12',['As_offset',['../structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562',1,'mlx::steel::BlockMMA']]],
-  ['atile_13',['Atile',['../structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c',1,'mlx::steel::BlockMMA']]]
+  ['align_5fq_10',['align_Q',['../steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982',1,'steel_attention.h']]],
+  ['alpha_11',['alpha',['../structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff',1,'mlx::steel::TransformAxpby::alpha'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#af8693d96512eff3e125d33d203920710',1,'mlx::steel::GEMMAddMMParams::alpha']]],
+  ['arange_5fkernels_12',['arange_kernels',['../metal_2jit_2arange_8h.html#a2f49fb7bdc0a90230077fe2023e6e5c0',1,'arange.h']]],
+  ['as_5foffset_13',['As_offset',['../structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562',1,'mlx::steel::BlockMMA']]],
+  ['atile_14',['Atile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586',1,'mlx::steel::BlockMMA']]]
 ];
diff --git a/docs/build/html/search/variables_1.js b/docs/build/html/search/variables_1.js
index f430517e8..ee78d0f3d 100644
--- a/docs/build/html/search/variables_1.js
+++ b/docs/build/html/search/variables_1.js
@@ -1,37 +1,35 @@
 var searchData=
 [
-  ['b_0',['b',['../unionbool4__or__uint.html#a47d77eac47598fe420f8f04a615f76ca',1,'bool4_or_uint']]],
-  ['b_5fstr_5fk_1',['B_str_k',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211',1,'mlx::steel::BlockMMA']]],
-  ['b_5fstr_5fn_2',['B_str_n',['../structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17',1,'mlx::steel::BlockMMA']]],
-  ['backward_3',['BACKWARD',['../namespacepocketfft_1_1detail.html#a9d1eaa7469c018c39e745733eab9a9c3',1,'pocketfft::detail']]],
-  ['base_5fwh_4',['base_wh',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aca37adba6f148579eb1cd0a7800a5cfe',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_wh'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6c46564bf1a96a02791dd432cc9c883e',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_wh']]],
-  ['base_5fww_5',['base_ww',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32d020c6715d06f7de360877fcb7b6e4',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_ww'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a230f0e581f9b8227b9ee68760b3b1503',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_ww']]],
-  ['batch_5fndim_6',['batch_ndim',['../struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3',1,'MLXFastAttentionParams::batch_ndim'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f',1,'mlx::steel::GEMMParams::batch_ndim']]],
-  ['batch_5fsize_7',['batch_size',['../struct_read_writer.html#a689f4890c1d2ce33fc6da7550beec735',1,'ReadWriter']]],
-  ['batch_5fstride_5fa_8',['batch_stride_a',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a76f55783a8e2ee203cf8507eee4b000c',1,'mlx::steel::GEMMParams']]],
-  ['batch_5fstride_5fb_9',['batch_stride_b',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a99b959b12d12da657648fa24d43e49e8',1,'mlx::steel::GEMMParams']]],
-  ['batch_5fstride_5fc_10',['batch_stride_c',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a77c065db228e9654a0a75a6ffe47c15a',1,'mlx::steel::GEMMAddMMParams']]],
-  ['batch_5fstride_5fd_11',['batch_stride_d',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad98006dd509a455864e6aa7c52743a41',1,'mlx::steel::GEMMParams']]],
-  ['batch_5fstride_5fk_12',['batch_stride_k',['../struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b',1,'MLXFastAttentionParams']]],
-  ['batch_5fstride_5fo_13',['batch_stride_o',['../struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7',1,'MLXFastAttentionParams']]],
-  ['batch_5fstride_5fq_14',['batch_stride_q',['../struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1',1,'MLXFastAttentionParams']]],
-  ['batch_5fstride_5fv_15',['batch_stride_v',['../struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21',1,'MLXFastAttentionParams']]],
-  ['bcols_16',['BCOLS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a29fbeeacdf5b6feeb74815ced255fa5a',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac3b40db720055350bba59d614ea1dd79',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a86519729ef0561686bb86e474c95b93d',1,'mlx::steel::Conv2DWeightBlockLoader::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9229d22e0a02d96825eb5a57c8cb95ac',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b6cf53a10514310d01f4d6459053a57',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3d6272d000f8ea79d9b3b5228bdca20f',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a1843921cd67926002bb0dcccf3048eb6',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BCOLS']]],
-  ['bcols_5fpacked_17',['BCOLS_PACKED',['../struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb',1,'QuantizedBlockLoader']]],
-  ['beta_18',['beta',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#ac0ce4d8a6014f8adb29fd0a0bb23139f',1,'mlx::steel::GEMMAddMMParams::beta'],['../structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6',1,'mlx::steel::TransformAxpby::beta']]],
-  ['bfloat16_19',['bfloat16',['../namespacemlx_1_1core.html#a514cf8b4e6f0a6af3a867e752f4338f7',1,'mlx::core']]],
-  ['bi_20',['bi',['../struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906',1,'QuantizedBlockLoader::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026',1,'mlx::steel::Conv2DWeightBlockLoader::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bi'],['../structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af',1,'mlx::steel::BlockLoader::bi']]],
-  ['biases_21',['biases',['../struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd',1,'QuantizedBlockLoader']]],
-  ['bits_5f_22',['bits_',['../struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8',1,'_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#aca48963f820065c3d8ecab24265ab3fc',1,'mlx::core::_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a5203fe52424fd32bce6eb7917dd9288b',1,'mlx::core::_MLX_Float16::bits_']]],
-  ['bj_23',['bj',['../struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00',1,'QuantizedBlockLoader::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37',1,'mlx::steel::Conv2DWeightBlockLoader::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bj'],['../structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4',1,'mlx::steel::BlockLoader::bj']]],
-  ['blockm_24',['blockM',['../struct_g_e_m_v_kernel.html#a7281520100658811076400060663903c',1,'GEMVKernel::blockM'],['../struct_g_e_m_v_t_kernel.html#a2ae8ce535d59cccf453381b4485a77f0',1,'GEMVTKernel::blockM']]],
-  ['blockn_25',['blockN',['../struct_g_e_m_v_kernel.html#a2fef17f9c9aa0bdf530ad3554fb0988b',1,'GEMVKernel::blockN'],['../struct_g_e_m_v_t_kernel.html#a60be87666006ba0bf88bc8e6902da42a',1,'GEMVTKernel::blockN']]],
-  ['bool_5f_26',['bool_',['../namespacemlx_1_1core.html#a113d2bac7e4aa6a4cb4a5c3242527b82',1,'mlx::core']]],
-  ['brows_27',['BROWS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ac070c6bd5be85b1ae805e18890db4fd4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a10591ea957605a9c662f93d59ff3410d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae9b86b05b23153ea1abaeead456c491c',1,'mlx::steel::Conv2DWeightBlockLoader::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a343984fb74ec579a4404278dbbc7e7b5',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acc8140aae84694f62e6324dbb6a614a4',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aba1e1c8012e4e50f0e9bcfb9486c1781',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a015a0c56de74a0c4d51953a7e94fbba8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BROWS']]],
-  ['bs_5foffset_28',['Bs_offset',['../structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca',1,'mlx::steel::BlockMMA']]],
-  ['btile_29',['Btile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26',1,'mlx::steel::BlockMMA']]],
-  ['buf_30',['buf',['../struct_read_writer.html#a23bac3c96dd0265ddbee1f256be45ff5',1,'ReadWriter::buf'],['../backend_2metal_2allocator_8h.html#a15aa5cc1baf29be08d55cca88509e697',1,'buf:&#160;allocator.h']]],
-  ['buffer_31',['buffer',['../structmlx_1_1core_1_1array_1_1_data.html#a9a51e2d12ba505027cc0fca86bdd39ad',1,'mlx::core::array::Data::buffer'],['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb',1,'mlx::core::metal::DeviceStream::buffer']]],
-  ['buffer_5fops_32',['buffer_ops',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782',1,'mlx::core::metal::DeviceStream']]],
-  ['buffers_33',['buffers',['../struct_indices.html#ad705070a740579c07d109ae4f3d86e76',1,'Indices']]]
+  ['b_0',['B',['../structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f',1,'mlx::steel::AttnParams']]],
+  ['b_1',['b',['../unionbool4__or__uint.html#a47d77eac47598fe420f8f04a615f76ca',1,'bool4_or_uint']]],
+  ['b_5fstr_5fk_2',['B_str_k',['../structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211',1,'mlx::steel::BlockMMA']]],
+  ['b_5fstr_5fn_3',['B_str_n',['../structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17',1,'mlx::steel::BlockMMA']]],
+  ['backward_4',['BACKWARD',['../namespacepocketfft_1_1detail.html#a9d1eaa7469c018c39e745733eab9a9c3',1,'pocketfft::detail']]],
+  ['base_5fwh_5',['base_wh',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aca37adba6f148579eb1cd0a7800a5cfe',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_wh'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6c46564bf1a96a02791dd432cc9c883e',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_wh']]],
+  ['base_5fww_6',['base_ww',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32d020c6715d06f7de360877fcb7b6e4',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::base_ww'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a230f0e581f9b8227b9ee68760b3b1503',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::base_ww']]],
+  ['batch_5fndim_7',['batch_ndim',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a640dc138a8bf7b2b5bed6a436b429c2f',1,'mlx::steel::GEMMParams']]],
+  ['batch_5fsize_8',['batch_size',['../struct_read_writer.html#a689f4890c1d2ce33fc6da7550beec735',1,'ReadWriter']]],
+  ['batch_5fstride_5fa_9',['batch_stride_a',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a76f55783a8e2ee203cf8507eee4b000c',1,'mlx::steel::GEMMParams']]],
+  ['batch_5fstride_5fb_10',['batch_stride_b',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a99b959b12d12da657648fa24d43e49e8',1,'mlx::steel::GEMMParams']]],
+  ['batch_5fstride_5fc_11',['batch_stride_c',['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a77c065db228e9654a0a75a6ffe47c15a',1,'mlx::steel::GEMMAddMMParams']]],
+  ['batch_5fstride_5fd_12',['batch_stride_d',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad98006dd509a455864e6aa7c52743a41',1,'mlx::steel::GEMMParams']]],
+  ['bcols_13',['BCOLS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a29fbeeacdf5b6feeb74815ced255fa5a',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac3b40db720055350bba59d614ea1dd79',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a86519729ef0561686bb86e474c95b93d',1,'mlx::steel::Conv2DWeightBlockLoader::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9229d22e0a02d96825eb5a57c8cb95ac',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b6cf53a10514310d01f4d6459053a57',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a3d6272d000f8ea79d9b3b5228bdca20f',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BCOLS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a1843921cd67926002bb0dcccf3048eb6',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BCOLS']]],
+  ['bcols_5fpacked_14',['BCOLS_PACKED',['../struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb',1,'QuantizedBlockLoader']]],
+  ['beta_15',['beta',['../structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6',1,'mlx::steel::TransformAxpby::beta'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#ac0ce4d8a6014f8adb29fd0a0bb23139f',1,'mlx::steel::GEMMAddMMParams::beta']]],
+  ['bfloat16_16',['bfloat16',['../namespacemlx_1_1core.html#a514cf8b4e6f0a6af3a867e752f4338f7',1,'mlx::core']]],
+  ['bi_17',['bi',['../struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906',1,'QuantizedBlockLoader::bi'],['../structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af',1,'mlx::steel::BlockLoader::bi'],['../structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35',1,'mlx::steel::BlockLoaderT::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a8e53b0a9951cb840d922cc285b257ee3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ae3af75287f279d2cdeef189126740d4c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a8c5e74003600132954cb953616e1a026',1,'mlx::steel::Conv2DWeightBlockLoader::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9eb024e2fc6f07345f87fbf7141c0d16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae3b9f21f72e5e6c541c9978f55d354c7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bi'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a32a3a91fa715b82f36e05ceb10933d09',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bi'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a4c91f848856ab0872bdfd37c62d4b0ba',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bi']]],
+  ['biases_18',['biases',['../struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd',1,'QuantizedBlockLoader']]],
+  ['bits_5f_19',['bits_',['../struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8',1,'_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___b_float16.html#aca48963f820065c3d8ecab24265ab3fc',1,'mlx::core::_MLX_BFloat16::bits_'],['../structmlx_1_1core_1_1___m_l_x___float16.html#a5203fe52424fd32bce6eb7917dd9288b',1,'mlx::core::_MLX_Float16::bits_']]],
+  ['bj_20',['bj',['../struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00',1,'QuantizedBlockLoader::bj'],['../structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4',1,'mlx::steel::BlockLoader::bj'],['../structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957',1,'mlx::steel::BlockLoaderT::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a7ae9e41f50c0c63c35b63086a1c22cc3',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a6fd3dd7b74d91609fa9dd61c657a0e32',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a6f2fdcaf5a67567cca38ae3d8120ab37',1,'mlx::steel::Conv2DWeightBlockLoader::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a7cf448573d41fbc67f8dfc65b7aef2b2',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#adaa261fc2e8e694aedab4ebd60b52e5e',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::bj'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#ace16704025bc6e6204c306a357f3a8b8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::bj'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#acec010e10d5733654963407af38d4f67',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::bj']]],
+  ['blockm_21',['blockM',['../struct_g_e_m_v_kernel.html#a7281520100658811076400060663903c',1,'GEMVKernel::blockM'],['../struct_g_e_m_v_t_kernel.html#a2ae8ce535d59cccf453381b4485a77f0',1,'GEMVTKernel::blockM']]],
+  ['blockn_22',['blockN',['../struct_g_e_m_v_kernel.html#a2fef17f9c9aa0bdf530ad3554fb0988b',1,'GEMVKernel::blockN'],['../struct_g_e_m_v_t_kernel.html#a60be87666006ba0bf88bc8e6902da42a',1,'GEMVTKernel::blockN']]],
+  ['bool_5f_23',['bool_',['../namespacemlx_1_1core.html#a113d2bac7e4aa6a4cb4a5c3242527b82',1,'mlx::core']]],
+  ['brows_24',['BROWS',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ac070c6bd5be85b1ae805e18890db4fd4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a10591ea957605a9c662f93d59ff3410d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ae9b86b05b23153ea1abaeead456c491c',1,'mlx::steel::Conv2DWeightBlockLoader::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a343984fb74ec579a4404278dbbc7e7b5',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acc8140aae84694f62e6324dbb6a614a4',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aba1e1c8012e4e50f0e9bcfb9486c1781',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::BROWS'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a015a0c56de74a0c4d51953a7e94fbba8',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::BROWS']]],
+  ['bs_5foffset_25',['Bs_offset',['../structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca',1,'mlx::steel::BlockMMA']]],
+  ['btile_26',['Btile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0',1,'mlx::steel::BlockMMA']]],
+  ['buf_27',['buf',['../struct_read_writer.html#a23bac3c96dd0265ddbee1f256be45ff5',1,'ReadWriter::buf'],['../backend_2metal_2allocator_8h.html#a15aa5cc1baf29be08d55cca88509e697',1,'buf:&#160;allocator.h']]],
+  ['buffer_28',['buffer',['../structmlx_1_1core_1_1array_1_1_data.html#a9a51e2d12ba505027cc0fca86bdd39ad',1,'mlx::core::array::Data::buffer'],['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a99183c92599edfeb75f7fa0f37e1d9eb',1,'mlx::core::metal::DeviceStream::buffer']]],
+  ['buffer_5fops_29',['buffer_ops',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#ab6048b329e65a59033834f3bdd351782',1,'mlx::core::metal::DeviceStream']]],
+  ['buffers_30',['buffers',['../struct_indices.html#ad705070a740579c07d109ae4f3d86e76',1,'Indices']]],
+  ['bytes_5fper_5fpack_31',['bytes_per_pack',['../struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db',1,'QuantizedBlockLoader']]]
 ];
diff --git a/docs/build/html/search/variables_10.js b/docs/build/html/search/variables_10.js
index fcebac343..8befeac4c 100644
--- a/docs/build/html/search/variables_10.js
+++ b/docs/build/html/search/variables_10.js
@@ -1,7 +1,8 @@
 var searchData=
 [
   ['q_0',['q',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#adf608e22d0c0397217472408aab52631',1,'mlx::core::scheduler::StreamThread']]],
-  ['quad_5fsize_1',['QUAD_SIZE',['../quantized_8h.html#a803e4d5a1459844ba647aea5b004e133',1,'quantized.h']]],
-  ['query_5fsequence_5flength_2',['QUERY_SEQUENCE_LENGTH',['../struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c',1,'MLXScaledDotProductAttentionParams']]],
-  ['queue_3',['queue',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d',1,'mlx::core::metal::DeviceStream']]]
+  ['q_5fstrides_1',['Q_strides',['../structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563',1,'mlx::steel::AttnParams']]],
+  ['ql_2',['qL',['../structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f',1,'mlx::steel::AttnParams']]],
+  ['quad_5fsize_3',['QUAD_SIZE',['../quantized_8h.html#a803e4d5a1459844ba647aea5b004e133',1,'quantized.h']]],
+  ['queue_4',['queue',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a77c75a63c51ea56815a86bd882ed190d',1,'mlx::core::metal::DeviceStream']]]
 ];
diff --git a/docs/build/html/search/variables_11.js b/docs/build/html/search/variables_11.js
index 13d67e5d4..570e8bc57 100644
--- a/docs/build/html/search/variables_11.js
+++ b/docs/build/html/search/variables_11.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['r_0',['r',['../structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692',1,'pocketfft::detail::cmplx']]],
+  ['r_0',['r',['../structpocketfft_1_1detail_1_1cmplx.html#afc51cdf222d77690953a8cb8ce3ee692',1,'pocketfft::detail::cmplx::r'],['../structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe',1,'mlx::steel::Shape2D::r']]],
   ['r2h_1',['r2h',['../structpocketfft_1_1detail_1_1_exec_r2_r.html#a925b398c8e1868614ce9eaf381d02b7e',1,'pocketfft::detail::ExecR2R']]],
   ['rader_5f11_5fsteps_5f_2',['rader_11_steps_',['../backend_2metal_2kernels_2fft_8h.html#a1f3c377d05da52429172e64132dba750',1,'fft.h']]],
   ['rader_5f13_5fsteps_5f_3',['rader_13_steps_',['../backend_2metal_2kernels_2fft_8h.html#a20d24f3e040d3d226a70d4dd7c9ac6a9',1,'fft.h']]],
diff --git a/docs/build/html/search/variables_12.js b/docs/build/html/search/variables_12.js
index 4bbcefb02..3d3df5564 100644
--- a/docs/build/html/search/variables_12.js
+++ b/docs/build/html/search/variables_12.js
@@ -1,9 +1,9 @@
 var searchData=
 [
-  ['scale_0',['scale',['../struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b',1,'ScaleOp']]],
+  ['scale_0',['scale',['../struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b',1,'ScaleOp::scale'],['../struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6',1,'TransformScale::scale'],['../structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826',1,'mlx::steel::AttnParams::scale']]],
   ['scales_1',['scales',['../struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf',1,'QuantizedBlockLoader']]],
   ['scatter_5fkernels_2',['scatter_kernels',['../jit_2indexing_8h.html#a768c949cd650a44c6b402fc1440c1a56',1,'indexing.h']]],
-  ['shape_3',['shape',['../structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae',1,'mlx::core::ReductionPlan::shape'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a',1,'mlx::core::fast::CustomKernelShapeInfo::shape']]],
+  ['shape_3',['shape',['../structmlx_1_1core_1_1_reduction_plan.html#a6cfa8771fa9caf6fdcc3d74c9fca83ae',1,'mlx::core::ReductionPlan::shape'],['../structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd',1,'mlx::steel::Layout2D::shape'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63db720fe0c2abc4b71e22a58a015f8a',1,'mlx::core::fast::CustomKernelShapeInfo::shape']]],
   ['shapes_4',['shapes',['../struct_indices.html#a5ab170f1a77636180889ddfffd4f7d2f',1,'Indices']]],
   ['shp_5',['shp',['../classpocketfft_1_1detail_1_1arr__info.html#a2467e9e01de1ba4d7cd28c1af783da8d',1,'pocketfft::detail::arr_info']]],
   ['signedinteger_6',['signedinteger',['../namespacemlx_1_1core.html#a24e1618af591d737d73729665e868001',1,'mlx::core']]],
@@ -16,8 +16,8 @@ var searchData=
   ['split_5fk_5fpartition_5fsize_13',['split_k_partition_size',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a9f5a67b2343645b570e109c3837d4042',1,'mlx::steel::GEMMSpiltKParams']]],
   ['split_5fk_5fpartition_5fstride_14',['split_k_partition_stride',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a12144ce89d404812cd862611d770b9fb',1,'mlx::steel::GEMMSpiltKParams']]],
   ['split_5fk_5fpartitions_15',['split_k_partitions',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#ae06c27116905d4ff3b9b436e588a93fd',1,'mlx::steel::GEMMSpiltKParams']]],
-  ['src_16',['src',['../struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b',1,'QuantizedBlockLoader::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f',1,'mlx::steel::Conv2DWeightBlockLoader::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src'],['../structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd',1,'mlx::steel::BlockLoader::src']]],
-  ['src_5fld_17',['src_ld',['../struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e',1,'QuantizedBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f',1,'mlx::steel::Conv2DWeightBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src_ld'],['../structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d',1,'mlx::steel::BlockLoader::src_ld']]],
+  ['src_16',['src',['../struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76',1,'QuantizedBlockLoader::src'],['../structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa',1,'mlx::steel::BlockLoader::src'],['../structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777',1,'mlx::steel::BlockLoaderT::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a656a46ee27486482b45ff90b3d626255',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a57552e9cfbafad71d47b2f3a8e027bdf',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7bfbcc4a1e3eef7aef5dd8e8c374a95f',1,'mlx::steel::Conv2DWeightBlockLoader::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#afe21e46e08523232830c25eb1b4ade16',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a8b04a69952404a04029dacc424df6e8f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1ee2922961b5fcb1db577928c4d9d731',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::src'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a50f458dbb74d61be2ed24727d8d43614',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src']]],
+  ['src_5fld_17',['src_ld',['../struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e',1,'QuantizedBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d',1,'mlx::steel::BlockLoader::src_ld'],['../structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321',1,'mlx::steel::BlockLoaderT::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a7464ec687323fa79050702952ed9084f',1,'mlx::steel::Conv2DWeightBlockLoader::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#aa6bedc0cbb447eaf70c03f2e26df2cb2',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::src_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a6918c1df7712c4e408e2871467ea7987',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::src_ld']]],
   ['start_5frow_18',['start_row',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a220e033b689c8d6a6f319dae02b38334',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral']]],
   ['steel_5fconv_5fgeneral_5fkernels_19',['steel_conv_general_kernels',['../jit_2steel__conv_8h.html#ae4ca1720029316b08ea92b7662347d47',1,'steel_conv.h']]],
   ['steel_5fconv_5fkernels_20',['steel_conv_kernels',['../jit_2steel__conv_8h.html#a386d79077465df56659416fd84adb513',1,'steel_conv.h']]],
@@ -32,5 +32,5 @@ var searchData=
   ['strided_5fdevice_5fidx_29',['strided_device_idx',['../struct_read_writer.html#a4c0b12484aac4fd6759d67c190391989',1,'ReadWriter']]],
   ['strided_5fshared_5fidx_30',['strided_shared_idx',['../struct_read_writer.html#ace40adb02cfb33d89c98353327c251fc',1,'ReadWriter']]],
   ['strides_31',['strides',['../structmlx_1_1core_1_1_reduction_plan.html#a9bf7cae845ab633247c1811613ece8bd',1,'mlx::core::ReductionPlan::strides'],['../struct_indices.html#a7f73d7652f0f751e6a06c2663e329a4a',1,'Indices::strides'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#a63954de7da62942ec69afcaaa19d46f2',1,'mlx::core::fast::CustomKernelShapeInfo::strides']]],
-  ['swizzle_5flog_32',['swizzle_log',['../struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2',1,'MLXFastAttentionParams::swizzle_log'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840',1,'mlx::steel::ImplicitGemmConv2DParams::swizzle_log'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7',1,'mlx::steel::GEMMParams::swizzle_log']]]
+  ['swizzle_5flog_32',['swizzle_log',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ad0713159d4f710cd9a066596593d8840',1,'mlx::steel::ImplicitGemmConv2DParams::swizzle_log'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#af9ff2c06dd8994126634531440325be7',1,'mlx::steel::GEMMParams::swizzle_log']]]
 ];
diff --git a/docs/build/html/search/variables_13.js b/docs/build/html/search/variables_13.js
index 364d08b90..2de578ca0 100644
--- a/docs/build/html/search/variables_13.js
+++ b/docs/build/html/search/variables_13.js
@@ -9,15 +9,15 @@ var searchData=
   ['tgp_5fpadding_5fb_6',['tgp_padding_b',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ad1b03941e869017558423c08b08bc094',1,'mlx::steel::GEMMKernel']]],
   ['tgp_5fsize_7',['tgp_size',['../structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a9058ddb73e30e83fb9c548ba22817d64',1,'mlx::steel::GEMMKernel']]],
   ['thread_8',['thread',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a449de02bf2ac80d8fe2f208fa7eac359',1,'mlx::core::scheduler::StreamThread']]],
-  ['thread_5fidx_9',['thread_idx',['../struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475',1,'QuantizedBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11',1,'mlx::steel::Conv2DWeightBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::thread_idx'],['../structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b',1,'mlx::steel::BlockLoader::thread_idx']]],
+  ['thread_5fidx_9',['thread_idx',['../struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475',1,'QuantizedBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b',1,'mlx::steel::BlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da',1,'mlx::steel::BlockLoaderT::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a70da26a715135d973f88371a70255be9',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#ac18de37cde1459595bfe18b0d5ef146d',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#ab1cb2ade639787243e0325dcd3dc0a11',1,'mlx::steel::Conv2DWeightBlockLoader::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a9642399b8066e29123524f36ebc7b482',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#acacdac168004c87fee27c8554ac905a7',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a401f0c7cf1588552556603c7ffba2316',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::thread_idx'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08a517bc50caf41155b98be0690bfe44',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::thread_idx']]],
   ['threads_5fper_5ftg_10',['threads_per_tg',['../struct_read_writer.html#a64c58e358da22358df3075448ea23893',1,'ReadWriter']]],
   ['threadsm_11',['threadsM',['../struct_g_e_m_v_kernel.html#a1dd943fcbf5e7be435fc36bed589a641',1,'GEMVKernel::threadsM'],['../struct_g_e_m_v_t_kernel.html#a4a53e73a581aa8881b1f86ce653519e6',1,'GEMVTKernel::threadsM']]],
   ['threadsn_12',['threadsN',['../struct_g_e_m_v_kernel.html#a47bfab7d21dd18760d3e0937ad36b19d',1,'GEMVKernel::threadsN'],['../struct_g_e_m_v_t_kernel.html#ade6f15a9744616de9dd71498ad7e758d',1,'GEMVTKernel::threadsN']]],
-  ['tile_5fstride_13',['tile_stride',['../struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320',1,'QuantizedBlockLoader::tile_stride'],['../structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d',1,'mlx::steel::BlockLoader::tile_stride']]],
+  ['tile_5fstride_13',['tile_stride',['../struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320',1,'QuantizedBlockLoader::tile_stride'],['../structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d',1,'mlx::steel::BlockLoader::tile_stride'],['../structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f',1,'mlx::steel::BlockLoaderT::tile_stride']]],
   ['tile_5fstride_5fa_14',['tile_stride_a',['../structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330',1,'mlx::steel::BlockMMA']]],
   ['tile_5fstride_5fb_15',['tile_stride_b',['../structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4',1,'mlx::steel::BlockMMA']]],
-  ['tiles_5fm_16',['tiles_m',['../struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad',1,'MLXFastAttentionParams::tiles_m'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca',1,'mlx::steel::GEMMParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b',1,'mlx::steel::GEMMSpiltKParams::tiles_m']]],
-  ['tiles_5fn_17',['tiles_n',['../struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029',1,'MLXFastAttentionParams::tiles_n'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed',1,'mlx::steel::GEMMParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6',1,'mlx::steel::GEMMSpiltKParams::tiles_n']]],
+  ['tiles_5fm_16',['tiles_m',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a4c5e33edf70be99cf93ac5723c12eb24',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#ad23a5a7f74cd5859741a36e4bc7823ca',1,'mlx::steel::GEMMParams::tiles_m'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a0970989624e17088d5326c2e198cb95b',1,'mlx::steel::GEMMSpiltKParams::tiles_m']]],
+  ['tiles_5fn_17',['tiles_n',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a76f9f381e7187a993d65128b9b681b2d',1,'mlx::steel::ImplicitGemmConv2DParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0e6b8b629232f1b43fbce9a395174bed',1,'mlx::steel::GEMMParams::tiles_n'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a5b46dfb9cee3606efa05d217349a20a6',1,'mlx::steel::GEMMSpiltKParams::tiles_n']]],
   ['tm_18',['TM',['../structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591',1,'mlx::steel::BlockMMA']]],
   ['tm_5fstride_19',['TM_stride',['../structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307',1,'mlx::steel::BlockMMA']]],
   ['tn_20',['TN',['../structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424',1,'mlx::steel::BlockMMA']]],
diff --git a/docs/build/html/search/variables_15.js b/docs/build/html/search/variables_15.js
index 0910291ae..46e04f590 100644
--- a/docs/build/html/search/variables_15.js
+++ b/docs/build/html/search/variables_15.js
@@ -1,8 +1,9 @@
 var searchData=
 [
-  ['v_0',['v',['../structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347',1,'mlx::steel::BlockLoader::ReadVector']]],
-  ['val_1',['val',['../structpocketfft_1_1detail_1_1_v_l_e_n.html#ab1fdc340dedde723e636746c828a4534',1,'pocketfft::detail::VLEN::val'],['../structmlx__atomic.html#a6f6651b8dd8149917c50cd99b13c6747',1,'mlx_atomic::val'],['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html#a8dbf729fcd8c4a16e41b546c7405543d',1,'mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;::val']]],
-  ['val_5ffrags_2',['val_frags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44',1,'mlx::steel::MMATile']]],
-  ['value_3',['value',['../structmlx_1_1steel_1_1integral__constant.html#a4efa69cb3fd42ac0dcad46578600d637',1,'mlx::steel::integral_constant']]],
-  ['vec_5fsize_4',['vec_size',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2',1,'mlx::steel::Conv2DWeightBlockLoader::vec_size'],['../structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925',1,'mlx::steel::ChannelHelper::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75',1,'mlx::steel::ChannelHelper&lt; 1 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af',1,'mlx::steel::ChannelHelper&lt; 2 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f',1,'mlx::steel::ChannelHelper&lt; 3 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca',1,'mlx::steel::ChannelHelper&lt; 4 
&gt;::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::vec_size'],['../structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092',1,'mlx::steel::BlockLoader::vec_size']]]
+  ['v_0',['v',['../structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d',1,'mlx::steel::BlockLoader::ReadVector']]],
+  ['v_5fstrides_1',['V_strides',['../structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c',1,'mlx::steel::AttnParams']]],
+  ['val_2',['val',['../structpocketfft_1_1detail_1_1_v_l_e_n.html#ab1fdc340dedde723e636746c828a4534',1,'pocketfft::detail::VLEN::val'],['../structmlx__atomic.html#a6f6651b8dd8149917c50cd99b13c6747',1,'mlx_atomic::val'],['../structmlx__atomic_3_01_t_00_01enable__if__t_3_01is__metal__atomic_3_01_t_01_4_01_4_01_4.html#a8dbf729fcd8c4a16e41b546c7405543d',1,'mlx_atomic&lt; T, enable_if_t&lt; is_metal_atomic&lt; T &gt; &gt; &gt;::val']]],
+  ['val_5ffrags_3',['val_frags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62',1,'mlx::steel::MMATile']]],
+  ['value_4',['value',['../structmlx_1_1steel_1_1integral__constant.html#a4efa69cb3fd42ac0dcad46578600d637',1,'mlx::steel::integral_constant']]],
+  ['vec_5fsize_5',['vec_size',['../structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092',1,'mlx::steel::BlockLoader::vec_size'],['../structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5',1,'mlx::steel::BlockLoaderT::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#adcc83bf6c02391cc2375e55c06a1c9a4',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a71c313e1597a2bb99f7b07d434e119d2',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a10109dc9553207f5a365799e4969c6d2',1,'mlx::steel::Conv2DWeightBlockLoader::vec_size'],['../structmlx_1_1steel_1_1_channel_helper.html#a2b24f991a9380fdad6b51a038770b925',1,'mlx::steel::ChannelHelper::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a71449551bbfe56058440755dfd50fc75',1,'mlx::steel::ChannelHelper&lt; 1 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#acfb18991a77a9d1d4a79918ac5f387af',1,'mlx::steel::ChannelHelper&lt; 2 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a5cb83774601c29564a6bbc010fc0bf7f',1,'mlx::steel::ChannelHelper&lt; 3 &gt;::vec_size'],['../structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#af28cdbe2a3c027d95832de07f60448ca',1,'mlx::steel::ChannelHelper&lt; 4 
&gt;::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a6b0b18428516d1d6dcae3beb3faee81c',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a006153d274aa13d5fd4448b4607fff3a',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a1587047caa339cf5b2c06adc4b332ab8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::vec_size'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a08dba753ec7c8ea2892775746933b3e7',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::vec_size']]]
 ];
diff --git a/docs/build/html/search/variables_2.js b/docs/build/html/search/variables_2.js
index dda638a30..3f19611c8 100644
--- a/docs/build/html/search/variables_2.js
+++ b/docs/build/html/search/variables_2.js
@@ -1,18 +1,19 @@
 var searchData=
 [
   ['c_0',['C',['../struct_m_l_x_conv_params.html#a0953063962ac3b5a027243289e72fbb2',1,'MLXConvParams']]],
-  ['can_5fconvert_5ffrom_5fbfloat_1',['can_convert_from_bfloat',['../backend_2metal_2kernels_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a',1,'bf16.h']]],
-  ['can_5fconvert_5ffrom_5fcomplex64_2',['can_convert_from_complex64',['../backend_2metal_2kernels_2complex_8h.html#ab149db78f6f19b8da6297dac4c36d893',1,'complex.h']]],
-  ['can_5fconvert_5fto_5fbfloat_3',['can_convert_to_bfloat',['../backend_2metal_2kernels_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e',1,'bf16.h']]],
-  ['can_5fconvert_5fto_5fcomplex128_4',['can_convert_to_complex128',['../namespacemlx_1_1core.html#a2822d2a4d346c826d3cfebbcf89c3057',1,'mlx::core']]],
-  ['can_5fconvert_5fto_5fcomplex64_5',['can_convert_to_complex64',['../backend_2metal_2kernels_2complex_8h.html#a4f90ad54f4fae363e8d3cc41d539557b',1,'can_convert_to_complex64:&#160;complex.h'],['../namespacemlx_1_1core.html#a0b3c76fd03f4df39ec8f9aefdced0861',1,'mlx::core::can_convert_to_complex64']]],
-  ['capitalize_5fbool_6',['capitalize_bool',['../structmlx_1_1core_1_1_print_formatter.html#adf49a949db36f0ba076842a6d675d79a',1,'mlx::core::PrintFormatter']]],
-  ['col_5fcontiguous_7',['col_contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#ae24709026598d635e6b5c24a15f8a802',1,'mlx::core::array::Flags']]],
-  ['complex64_8',['complex64',['../namespacemlx_1_1core.html#af99db87e0078bfcdb383f5689bc874d4',1,'mlx::core']]],
-  ['complexfloating_9',['complexfloating',['../namespacemlx_1_1core.html#a70b8e88c9df750af984757105af33423',1,'mlx::core']]],
-  ['cond_10',['cond',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a4ffd524d6a5bedd1a303b63bdde6701c',1,'mlx::core::scheduler::StreamThread']]],
-  ['contiguous_11',['contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#afd0ab11e7a486a2a8e50ee84b971ac8a',1,'mlx::core::array::Flags']]],
-  ['cosine_12',['cosine',['../structpocketfft_1_1detail_1_1_exec_dcst.html#a185023fc1e386cc8f233b79c49c1fd8a',1,'pocketfft::detail::ExecDcst']]],
-  ['cpu_13',['cpu',['../structmlx_1_1core_1_1_device.html#a69ee81924251dec96f1945c9d91506fd',1,'mlx::core::Device']]],
-  ['ctile_14',['Ctile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88',1,'mlx::steel::BlockMMA']]]
+  ['c_1',['c',['../structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e',1,'mlx::steel::Shape2D']]],
+  ['can_5fconvert_5ffrom_5fbfloat_2',['can_convert_from_bfloat',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7e5992f7fcd8f2cdadcc1d7f6aefbb5a',1,'bf16.h']]],
+  ['can_5fconvert_5ffrom_5fcomplex64_3',['can_convert_from_complex64',['../backend_2metal_2kernels_2complex_8h.html#ab149db78f6f19b8da6297dac4c36d893',1,'complex.h']]],
+  ['can_5fconvert_5fto_5fbfloat_4',['can_convert_to_bfloat',['../backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#aae77817d261452b2f001f4d947a3e04e',1,'bf16.h']]],
+  ['can_5fconvert_5fto_5fcomplex128_5',['can_convert_to_complex128',['../namespacemlx_1_1core.html#a2822d2a4d346c826d3cfebbcf89c3057',1,'mlx::core']]],
+  ['can_5fconvert_5fto_5fcomplex64_6',['can_convert_to_complex64',['../backend_2metal_2kernels_2complex_8h.html#a4f90ad54f4fae363e8d3cc41d539557b',1,'can_convert_to_complex64:&#160;complex.h'],['../namespacemlx_1_1core.html#a0b3c76fd03f4df39ec8f9aefdced0861',1,'mlx::core::can_convert_to_complex64']]],
+  ['capitalize_5fbool_7',['capitalize_bool',['../structmlx_1_1core_1_1_print_formatter.html#adf49a949db36f0ba076842a6d675d79a',1,'mlx::core::PrintFormatter']]],
+  ['col_5fcontiguous_8',['col_contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#ae24709026598d635e6b5c24a15f8a802',1,'mlx::core::array::Flags']]],
+  ['complex64_9',['complex64',['../namespacemlx_1_1core.html#af99db87e0078bfcdb383f5689bc874d4',1,'mlx::core']]],
+  ['complexfloating_10',['complexfloating',['../namespacemlx_1_1core.html#a70b8e88c9df750af984757105af33423',1,'mlx::core']]],
+  ['cond_11',['cond',['../structmlx_1_1core_1_1scheduler_1_1_stream_thread.html#a4ffd524d6a5bedd1a303b63bdde6701c',1,'mlx::core::scheduler::StreamThread']]],
+  ['contiguous_12',['contiguous',['../structmlx_1_1core_1_1array_1_1_flags.html#afd0ab11e7a486a2a8e50ee84b971ac8a',1,'mlx::core::array::Flags']]],
+  ['cosine_13',['cosine',['../structpocketfft_1_1detail_1_1_exec_dcst.html#a185023fc1e386cc8f233b79c49c1fd8a',1,'pocketfft::detail::ExecDcst']]],
+  ['cpu_14',['cpu',['../structmlx_1_1core_1_1_device.html#a69ee81924251dec96f1945c9d91506fd',1,'mlx::core::Device']]],
+  ['ctile_15',['Ctile',['../structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6',1,'mlx::steel::BlockMMA']]]
 ];
diff --git a/docs/build/html/search/variables_3.js b/docs/build/html/search/variables_3.js
index c6190bec3..1a58473d7 100644
--- a/docs/build/html/search/variables_3.js
+++ b/docs/build/html/search/variables_3.js
@@ -1,12 +1,14 @@
 var searchData=
 [
-  ['d_0',['d',['../classpocketfft_1_1detail_1_1cndarr.html#ac29c769aebb03f81fbcf16ba6e766af2',1,'pocketfft::detail::cndarr::d'],['../structmlx_1_1core_1_1array_1_1_data.html#a25f52ac67912a49bb6e2b6715aa65311',1,'mlx::core::array::Data::d']]],
-  ['device_1',['device',['../structmlx_1_1core_1_1_stream.html#a406b1b0162287a4162fab1f70e2ff3bb',1,'mlx::core::Stream']]],
-  ['digits_2',['digits',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['digits10_3',['digits10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
-  ['do_5faxpby_4',['do_axpby',['../steel__gemm__fused_8h.html#a703f06c849c89c37af7b1d27b0804a29',1,'steel_gemm_fused.h']]],
-  ['do_5fgather_5',['do_gather',['../steel__gemm__fused_8h.html#a60efac3ac3b7cd64d096bbae38a3ac69',1,'steel_gemm_fused.h']]],
-  ['do_5fread_6',['do_read',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a13eb86acf6abe288c19645935a47d2ad',1,'mlx::steel::Conv2DWeightBlockLoader::do_read'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a640155880483e1042ec5f647b9adaac6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::do_read']]],
-  ['dst_7',['dst',['../struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83',1,'QuantizedBlockLoader::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7',1,'mlx::steel::Conv2DWeightBlockLoader::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst'],['../structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec',1,'mlx::steel::BlockLoader::dst']]],
-  ['dst_5fld_8',['dst_ld',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a91192d512e7a18c2d16a139065000959',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a9e59da7e4436e61b2d3c3f982355910b',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a0ff5a6d503e0bbac4634030a75ab818d',1,'mlx::steel::Conv2DWeightBlockLoader::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ae71570942c7b0ad8e67c62662b336c4a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ac18eeebea26cc6da434ead6eb4397350',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a07c85eab8cbf7b02c60df29cf32031ef',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aae121ca6016fc6c7255027b3641f3a09',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst_ld']]]
+  ['d_0',['D',['../structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3',1,'mlx::steel::AttnParams']]],
+  ['d_1',['d',['../classpocketfft_1_1detail_1_1cndarr.html#ac29c769aebb03f81fbcf16ba6e766af2',1,'pocketfft::detail::cndarr::d'],['../structmlx_1_1core_1_1array_1_1_data.html#a25f52ac67912a49bb6e2b6715aa65311',1,'mlx::core::array::Data::d']]],
+  ['device_2',['device',['../structmlx_1_1core_1_1_stream.html#a406b1b0162287a4162fab1f70e2ff3bb',1,'mlx::core::Stream']]],
+  ['digits_3',['digits',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#af6a681edff230c8d734a1feefb8d1879',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['digits10_4',['digits10',['../structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html#a0f48dd0c8a2d2dfa825067fb212b2e6b',1,'metal::_numeric_limits_impl&lt; bfloat16_t &gt;']]],
+  ['dim_5',['dim',['../struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364',1,'LoopedElemToLoc::dim'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::dim']]],
+  ['do_5faxpby_6',['do_axpby',['../steel__gemm__fused_8h.html#a703f06c849c89c37af7b1d27b0804a29',1,'steel_gemm_fused.h']]],
+  ['do_5fgather_7',['do_gather',['../steel__gemm__fused_8h.html#a60efac3ac3b7cd64d096bbae38a3ac69',1,'steel_gemm_fused.h']]],
+  ['do_5fread_8',['do_read',['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a13eb86acf6abe288c19645935a47d2ad',1,'mlx::steel::Conv2DWeightBlockLoader::do_read'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a640155880483e1042ec5f647b9adaac6',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::do_read']]],
+  ['dst_9',['dst',['../struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83',1,'QuantizedBlockLoader::dst'],['../structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2',1,'mlx::steel::BlockLoader::dst'],['../structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db',1,'mlx::steel::BlockLoaderT::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ae048eb79f8b8d98f0fe8805c30fbb09f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a8598bf23a2bce6af13c876cbfa76449f',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#aea6494838175225d02cbc7768a646ec7',1,'mlx::steel::Conv2DWeightBlockLoader::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a59a4fffc1dc2f3fadfb3fdd1b886da70',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#a24e20e4c1dd1ebf9534bfa2b3e050ed3',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#aa84c4ad43a5defb83ba1a5f49a7adb2a',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#a8474daf268013e138a84fc1c4bff7352',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst']]],
+  ['dst_5fld_10',['dst_ld',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a91192d512e7a18c2d16a139065000959',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a9e59da7e4436e61b2d3c3f982355910b',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a0ff5a6d503e0bbac4634030a75ab818d',1,'mlx::steel::Conv2DWeightBlockLoader::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#ae71570942c7b0ad8e67c62662b336c4a',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ac18eeebea26cc6da434ead6eb4397350',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#a07c85eab8cbf7b02c60df29cf32031ef',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::dst_ld'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aae121ca6016fc6c7255027b3641f3a09',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::dst_ld']]]
 ];
diff --git a/docs/build/html/search/variables_6.js b/docs/build/html/search/variables_6.js
index f61b85531..24d74db52 100644
--- a/docs/build/html/search/variables_6.js
+++ b/docs/build/html/search/variables_6.js
@@ -3,17 +3,16 @@ var searchData=
   ['gather_5fbias_0',['gather_bias',['../steel__gemm__fused_8h.html#aaaf17233201156be684f858bfd0f1b67',1,'steel_gemm_fused.h']]],
   ['gather_5fkernels_1',['gather_kernels',['../jit_2indexing_8h.html#a1a03318128191891a84707602b57b3cf',1,'indexing.h']]],
   ['gemm_5fk_5fiterations_2',['gemm_k_iterations',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a8b50863e4e2d3481c154be6c3629bf51',1,'mlx::steel::ImplicitGemmConv2DParams']]],
-  ['gemm_5fk_5fiterations_5faligned_3',['gemm_k_iterations_aligned',['../struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2',1,'MLXFastAttentionParams::gemm_k_iterations_aligned'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9',1,'mlx::steel::GEMMParams::gemm_k_iterations_aligned'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998',1,'mlx::steel::GEMMSpiltKParams::gemm_k_iterations_aligned']]],
-  ['gemm_5fn_5fiterations_5faligned_4',['gemm_n_iterations_aligned',['../struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803',1,'MLXFastAttentionParams']]],
-  ['gemm_5fparams_5',['gemm_params',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ab0724eb3ef52ee773b6607f6433b9f2c',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#acc778b3c0b7ec38a43e8ea943df8704c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af59f9d356c4c3ec5627dc5a263d239d4',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::gemm_params']]],
-  ['gemm_5fsv_5fm_5fblock_5fiterations_6',['gemm_sv_m_block_iterations',['../struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c',1,'MLXFastAttentionParams']]],
-  ['gemv_5fmasked_5fkernel_7',['gemv_masked_kernel',['../jit_2gemv__masked_8h.html#a933f06c211f86c37673dee329ed6901f',1,'gemv_masked.h']]],
-  ['generic_8',['generic',['../namespacemlx_1_1core.html#a34d69c4d46aa9b2a4a79dba7aba093d2',1,'mlx::core']]],
-  ['global_5fformatter_9',['global_formatter',['../namespacemlx_1_1core.html#af5a408a78cc934717dd711ddfda58ea6',1,'mlx::core']]],
-  ['gpu_10',['gpu',['../structmlx_1_1core_1_1_device.html#a45ed081b56ae5d4ddd39c83a5d8a1616',1,'mlx::core::Device']]],
-  ['grid_11',['grid',['../struct_read_writer.html#ac7a957f99873d3797081f5d620f3b2c8',1,'ReadWriter']]],
-  ['group_5fstep_5fcnt_12',['group_step_cnt',['../struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6',1,'QuantizedBlockLoader']]],
-  ['group_5fsteps_13',['group_steps',['../struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba',1,'QuantizedBlockLoader']]],
-  ['group_5fstride_14',['group_stride',['../struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab',1,'QuantizedBlockLoader']]],
-  ['groups_15',['groups',['../struct_m_l_x_conv_params.html#af7a5590ac0974c7841c7f8b9fda0cbed',1,'MLXConvParams']]]
+  ['gemm_5fk_5fiterations_5faligned_3',['gemm_k_iterations_aligned',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a0d7f419ba265805b418e93ce1ca2e0f9',1,'mlx::steel::GEMMParams::gemm_k_iterations_aligned'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#aa37e05a03ac8b34ec7dc31ca42f68998',1,'mlx::steel::GEMMSpiltKParams::gemm_k_iterations_aligned']]],
+  ['gemm_5fparams_4',['gemm_params',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#ab0724eb3ef52ee773b6607f6433b9f2c',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#acc778b3c0b7ec38a43e8ea943df8704c',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::gemm_params'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#af59f9d356c4c3ec5627dc5a263d239d4',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::gemm_params']]],
+  ['gemv_5fmasked_5fkernel_5',['gemv_masked_kernel',['../jit_2gemv__masked_8h.html#a933f06c211f86c37673dee329ed6901f',1,'gemv_masked.h']]],
+  ['generic_6',['generic',['../namespacemlx_1_1core.html#a34d69c4d46aa9b2a4a79dba7aba093d2',1,'mlx::core']]],
+  ['global_5fformatter_7',['global_formatter',['../namespacemlx_1_1core.html#af5a408a78cc934717dd711ddfda58ea6',1,'mlx::core']]],
+  ['gpu_8',['gpu',['../structmlx_1_1core_1_1_device.html#a45ed081b56ae5d4ddd39c83a5d8a1616',1,'mlx::core::Device']]],
+  ['gqa_5ffactor_9',['gqa_factor',['../structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841',1,'mlx::steel::AttnParams']]],
+  ['grid_10',['grid',['../struct_read_writer.html#ac7a957f99873d3797081f5d620f3b2c8',1,'ReadWriter']]],
+  ['group_5fstep_5fcnt_11',['group_step_cnt',['../struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6',1,'QuantizedBlockLoader']]],
+  ['group_5fsteps_12',['group_steps',['../struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba',1,'QuantizedBlockLoader']]],
+  ['group_5fstride_13',['group_stride',['../struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab',1,'QuantizedBlockLoader']]],
+  ['groups_14',['groups',['../struct_m_l_x_conv_params.html#af7a5590ac0974c7841c7f8b9fda0cbed',1,'MLXConvParams']]]
 ];
diff --git a/docs/build/html/search/variables_7.js b/docs/build/html/search/variables_7.js
index 33bcc018d..7d6caf6df 100644
--- a/docs/build/html/search/variables_7.js
+++ b/docs/build/html/search/variables_7.js
@@ -1,11 +1,12 @@
 var searchData=
 [
-  ['h12_0',['h12',['../namespacemlx_1_1core.html#a4beeeec4413be7adcfb14feaa9cf0e2e',1,'mlx::core']]],
-  ['h20_1',['h20',['../namespacemlx_1_1core.html#a862c6b94fec384c34a699ced64d01404',1,'mlx::core']]],
-  ['h28_2',['h28',['../namespacemlx_1_1core.html#ac447ad59592dd06435adca7df37e33ad',1,'mlx::core']]],
-  ['has_5fbatch_3',['has_batch',['../steel__gemm__fused_8h.html#adffcdc900c19ff97f1523e43f1a5a6cc',1,'steel_gemm_fused.h']]],
-  ['has_5fmul_5foperand_5fmask_4',['has_mul_operand_mask',['../struct_g_e_m_v_kernel.html#ad47223ee49b3cb7bf3746a2cec45f883',1,'GEMVKernel::has_mul_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a8db6f01f96a36b216acd801c34a96ef5',1,'GEMVTKernel::has_mul_operand_mask']]],
-  ['has_5fmul_5foutput_5fmask_5',['has_mul_output_mask',['../struct_g_e_m_v_kernel.html#a0edbf2dd6a6563e7afa6dab6b670615c',1,'GEMVKernel::has_mul_output_mask'],['../struct_g_e_m_v_t_kernel.html#a8eb06f6569e4042e24fee220b11fa10d',1,'GEMVTKernel::has_mul_output_mask']]],
-  ['has_5foperand_5fmask_6',['has_operand_mask',['../struct_g_e_m_v_kernel.html#ab00784dff1512a7b0919fcb4cfa5d50e',1,'GEMVKernel::has_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a6729d6e63e76a1e9c7c8e78d9aac4869',1,'GEMVTKernel::has_operand_mask']]],
-  ['has_5foutput_5fmask_7',['has_output_mask',['../struct_g_e_m_v_kernel.html#ab8b64c94f4c8f6f09c0777415589b487',1,'GEMVKernel::has_output_mask'],['../struct_g_e_m_v_t_kernel.html#aaefdf8f023da255bbb70a0c3e3408626',1,'GEMVTKernel::has_output_mask']]]
+  ['h_0',['H',['../structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7',1,'mlx::steel::AttnParams']]],
+  ['h12_1',['h12',['../namespacemlx_1_1core.html#a4beeeec4413be7adcfb14feaa9cf0e2e',1,'mlx::core']]],
+  ['h20_2',['h20',['../namespacemlx_1_1core.html#a862c6b94fec384c34a699ced64d01404',1,'mlx::core']]],
+  ['h28_3',['h28',['../namespacemlx_1_1core.html#ac447ad59592dd06435adca7df37e33ad',1,'mlx::core']]],
+  ['has_5fbatch_4',['has_batch',['../steel__gemm__fused_8h.html#adffcdc900c19ff97f1523e43f1a5a6cc',1,'steel_gemm_fused.h']]],
+  ['has_5fmul_5foperand_5fmask_5',['has_mul_operand_mask',['../struct_g_e_m_v_kernel.html#ad47223ee49b3cb7bf3746a2cec45f883',1,'GEMVKernel::has_mul_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a8db6f01f96a36b216acd801c34a96ef5',1,'GEMVTKernel::has_mul_operand_mask']]],
+  ['has_5fmul_5foutput_5fmask_6',['has_mul_output_mask',['../struct_g_e_m_v_kernel.html#a0edbf2dd6a6563e7afa6dab6b670615c',1,'GEMVKernel::has_mul_output_mask'],['../struct_g_e_m_v_t_kernel.html#a8eb06f6569e4042e24fee220b11fa10d',1,'GEMVTKernel::has_mul_output_mask']]],
+  ['has_5foperand_5fmask_7',['has_operand_mask',['../struct_g_e_m_v_kernel.html#ab00784dff1512a7b0919fcb4cfa5d50e',1,'GEMVKernel::has_operand_mask'],['../struct_g_e_m_v_t_kernel.html#a6729d6e63e76a1e9c7c8e78d9aac4869',1,'GEMVTKernel::has_operand_mask']]],
+  ['has_5foutput_5fmask_8',['has_output_mask',['../struct_g_e_m_v_kernel.html#ab8b64c94f4c8f6f09c0777415589b487',1,'GEMVKernel::has_output_mask'],['../struct_g_e_m_v_t_kernel.html#aaefdf8f023da255bbb70a0c3e3408626',1,'GEMVTKernel::has_output_mask']]]
 ];
diff --git a/docs/build/html/search/variables_8.js b/docs/build/html/search/variables_8.js
index aa38b6cb3..7cd417b8e 100644
--- a/docs/build/html/search/variables_8.js
+++ b/docs/build/html/search/variables_8.js
@@ -5,10 +5,10 @@ var searchData=
   ['imag_2',['imag',['../structcomplex64__t.html#a94037c0cf8451aaff7cb4d154a8426de',1,'complex64_t']]],
   ['in_3',['in',['../struct_read_writer.html#ab6057215920138f28fd00f0e7ea8afa4',1,'ReadWriter']]],
   ['in_5fstrides_4',['in_strides',['../struct_m_l_x_conv_params.html#ab25eade6573784985dbea1216f9068cf',1,'MLXConvParams']]],
-  ['index_5',['index',['../structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a',1,'looped_elem_to_loc::index'],['../structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac',1,'mlx::core::Device::index'],['../structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626',1,'mlx::core::Stream::index']]],
+  ['index_5',['index',['../struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333',1,'LoopedElemToLoc::index'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a2cd3b616739b3d5b41e5b46ae335957d',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::index'],['../structmlx_1_1core_1_1_device.html#a5e345748fe318a267833ab7398b364ac',1,'mlx::core::Device::index'],['../structmlx_1_1core_1_1_stream.html#a9d0dafc1899333e1176eb2bbc0a8b626',1,'mlx::core::Stream::index']]],
   ['inexact_6',['inexact',['../namespacemlx_1_1core.html#a54c6fae21b7f2fea8e6f80011ef38534',1,'mlx::core']]],
   ['init_7',['init',['../struct_cum_prod_3_01bool_01_4.html#ae7a8b0ba9e6898356b87b18766e76d2c',1,'CumProd&lt; bool &gt;::init'],['../struct_cum_max.html#a16480052a2eeb4340e546838aab59cc4',1,'CumMax::init'],['../struct_cum_min.html#a8b67f739c620d0cc194b533190990ab9',1,'CumMin::init'],['../struct_less_than.html#abf97a6b0163048e4ba96460939dbd3a3',1,'LessThan::init']]],
-  ['inner_5flooper_8',['inner_looper',['../structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189',1,'looped_elem_to_loc']]],
+  ['inner_5flooper_8',['inner_looper',['../struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40',1,'LoopedElemToLoc']]],
   ['inp_5fjump_5fc_9',['inp_jump_c',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a78d30e843d65d1829623afb0b607f0a5',1,'mlx::steel::ImplicitGemmConv2DParams']]],
   ['inp_5fjump_5fh_10',['inp_jump_h',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a03685a4066cdb11ffb647408e2c5b122',1,'mlx::steel::ImplicitGemmConv2DParams']]],
   ['inp_5fjump_5fw_11',['inp_jump_w',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#acf168c72f4a86b72b8f5f386f07c9d8c',1,'mlx::steel::ImplicitGemmConv2DParams']]],
@@ -19,11 +19,10 @@ var searchData=
   ['integer_16',['integer',['../namespacemlx_1_1core.html#a074d000f25ae3ed77450e6a5fec4b38b',1,'mlx::core']]],
   ['inv_17',['inv',['../struct_read_writer.html#a773fa8524515bdc2ff8b0e2060741813',1,'ReadWriter']]],
   ['inv_5f_18',['inv_',['../backend_2metal_2kernels_2fft_8h.html#a7a83318497519ff3ff0141b7d511ed38',1,'fft.h']]],
-  ['inv_5falpha_19',['INV_ALPHA',['../struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644',1,'MLXScaledDotProductAttentionParams']]],
-  ['is_20',['iS',['../struct_m_l_x_conv_params.html#a72e1c3b4da0f70622cf18036bbf97fe6',1,'MLXConvParams']]],
-  ['is_5farray_5fv_21',['is_array_v',['../namespacemlx_1_1core.html#a01b0d64a75dfa2e95d6c7b5c53d708af',1,'mlx::core']]],
-  ['is_5farrays_5fv_22',['is_arrays_v',['../namespacemlx_1_1core.html#a94c1057929b390e5613304afa16dfbda',1,'mlx::core']]],
-  ['is_5fintegral_5fv_23',['is_integral_v',['../namespacemlx_1_1steel.html#a92a3465716ea7fd682d22cecc08d45fd',1,'mlx::steel']]],
-  ['is_5fmetal_5fatomic_24',['is_metal_atomic',['../atomic_8h.html#a91a8bdcae647947a83c6689d7f252d24',1,'atomic.h']]],
-  ['is_5fpower_5fof_5f2_5f_25',['is_power_of_2_',['../backend_2metal_2kernels_2fft_8h.html#a2a4df90e329b84ee6c1890ba7c265c9c',1,'fft.h']]]
+  ['is_19',['iS',['../struct_m_l_x_conv_params.html#a72e1c3b4da0f70622cf18036bbf97fe6',1,'MLXConvParams']]],
+  ['is_5farray_5fv_20',['is_array_v',['../namespacemlx_1_1core.html#a01b0d64a75dfa2e95d6c7b5c53d708af',1,'mlx::core']]],
+  ['is_5farrays_5fv_21',['is_arrays_v',['../namespacemlx_1_1core.html#a94c1057929b390e5613304afa16dfbda',1,'mlx::core']]],
+  ['is_5fintegral_5fv_22',['is_integral_v',['../namespacemlx_1_1steel.html#a92a3465716ea7fd682d22cecc08d45fd',1,'mlx::steel']]],
+  ['is_5fmetal_5fatomic_23',['is_metal_atomic',['../atomic_8h.html#a91a8bdcae647947a83c6689d7f252d24',1,'atomic.h']]],
+  ['is_5fpower_5fof_5f2_5f_24',['is_power_of_2_',['../backend_2metal_2kernels_2fft_8h.html#a2a4df90e329b84ee6c1890ba7c265c9c',1,'fft.h']]]
 ];
diff --git a/docs/build/html/search/variables_a.js b/docs/build/html/search/variables_a.js
index ec7e0d5fb..4f98f1ced 100644
--- a/docs/build/html/search/variables_a.js
+++ b/docs/build/html/search/variables_a.js
@@ -1,18 +1,21 @@
 var searchData=
 [
-  ['k_0',['K',['../struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23',1,'MLXFastAttentionParams::K'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba',1,'mlx::steel::ImplicitGemmConv2DParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff',1,'mlx::steel::GEMMParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7',1,'mlx::steel::GEMMSpiltKParams::K']]],
-  ['kcols_1',['kCols',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257',1,'mlx::steel::MMATile']]],
-  ['kdil_2',['kdil',['../struct_m_l_x_conv_params.html#a7611db8f1621c7e09fc685ed44073b14',1,'MLXConvParams']]],
-  ['kelemcols_3',['kElemCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
-  ['kelemrows_4',['kElemRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
-  ['kelemsperfrag_5',['kElemsPerFrag',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a3c34dfdc944db110f4735f1b25307cf0',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kElemsPerFrag'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6',1,'mlx::steel::MMATile::kElemsPerFrag']]],
-  ['kelemspertile_6',['kElemsPerTile',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f',1,'mlx::steel::MMATile']]],
-  ['kfragcols_7',['kFragCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragCols'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906',1,'mlx::steel::MMATile::kFragCols']]],
-  ['kfragrows_8',['kFragRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragRows'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7',1,'mlx::steel::MMATile::kFragRows']]],
-  ['kfragsize_9',['kFragSize',['../structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d',1,'mlx::steel::BlockMMA']]],
-  ['knumfrags_10',['kNumFrags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3',1,'mlx::steel::MMATile']]],
-  ['krows_11',['kRows',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323',1,'mlx::steel::MMATile']]],
-  ['ktilecols_12',['kTileCols',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4',1,'mlx::steel::MMATile']]],
-  ['ktilerows_13',['kTileRows',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a',1,'mlx::steel::MMATile']]],
-  ['kv_5ftiles_14',['KV_TILES',['../struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0',1,'MLXScaledDotProductAttentionParams']]]
+  ['k_0',['K',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#ae1b0386e4cd1a7018f4b654c4e9493ba',1,'mlx::steel::ImplicitGemmConv2DParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#aa0851af4da8df820bdad9589ff517cff',1,'mlx::steel::GEMMParams::K'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a3733f9031e82e761ec44e72ed5c6d0e7',1,'mlx::steel::GEMMSpiltKParams::K']]],
+  ['k_5fstrides_1',['K_strides',['../structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974',1,'mlx::steel::AttnParams']]],
+  ['kcols_2',['kCols',['../structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901',1,'mlx::steel::CShape::kCols'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257',1,'mlx::steel::MMATile::kCols']]],
+  ['kcolsperthread_3',['kColsPerThread',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c',1,'mlx::steel::MMATile']]],
+  ['kdil_4',['kdil',['../struct_m_l_x_conv_params.html#a7611db8f1621c7e09fc685ed44073b14',1,'MLXConvParams']]],
+  ['kelemcols_5',['kElemCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
+  ['kelemrows_6',['kElemRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;']]],
+  ['kelemsperfrag_7',['kElemsPerFrag',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a3c34dfdc944db110f4735f1b25307cf0',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kElemsPerFrag'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6',1,'mlx::steel::MMATile::kElemsPerFrag']]],
+  ['kelemspertile_8',['kElemsPerTile',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f',1,'mlx::steel::MMATile']]],
+  ['kfragcols_9',['kFragCols',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragCols'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906',1,'mlx::steel::MMATile::kFragCols']]],
+  ['kfragrows_10',['kFragRows',['../structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4',1,'mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;::kFragRows'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7',1,'mlx::steel::MMATile::kFragRows']]],
+  ['kfragsize_11',['kFragSize',['../structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d',1,'mlx::steel::BlockMMA']]],
+  ['kl_12',['kL',['../structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63',1,'mlx::steel::AttnParams']]],
+  ['knumfrags_13',['kNumFrags',['../structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3',1,'mlx::steel::MMATile']]],
+  ['krows_14',['kRows',['../structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993',1,'mlx::steel::CShape::kRows'],['../structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323',1,'mlx::steel::MMATile::kRows']]],
+  ['krowsperthread_15',['kRowsPerThread',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e',1,'mlx::steel::MMATile']]],
+  ['ktilecols_16',['kTileCols',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4',1,'mlx::steel::MMATile']]],
+  ['ktilerows_17',['kTileRows',['../structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a',1,'mlx::steel::MMATile']]]
 ];
diff --git a/docs/build/html/search/variables_b.js b/docs/build/html/search/variables_b.js
index 9b85264b1..56c02572c 100644
--- a/docs/build/html/search/variables_b.js
+++ b/docs/build/html/search/variables_b.js
@@ -1,13 +1,9 @@
 var searchData=
 [
-  ['lda_0',['lda',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#afec42b532ffcad32bbffd494526bef03',1,'mlx::steel::GEMMParams::lda'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a6fac3c4a7c35af7b46b53f9662f882c6',1,'mlx::steel::GEMMSpiltKParams::lda']]],
-  ['ldb_1',['ldb',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6032a081ab707c14b5f28069faa7cf62',1,'mlx::steel::GEMMParams::ldb'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a7f6f511854ccc98fa573bb560776ebed',1,'mlx::steel::GEMMSpiltKParams::ldb']]],
-  ['ldc_2',['ldc',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a888730efa5c5c8ae7ed771c3084d583c',1,'mlx::steel::GEMMSpiltKParams::ldc'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a801e2245a36632160975a784b762a4e6',1,'mlx::steel::GEMMAddMMParams::ldc']]],
-  ['ldd_3',['ldd',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6e8ae14e3f97c499ad9c39358a1855ab',1,'mlx::steel::GEMMParams']]],
-  ['ldk_4',['ldk',['../struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5',1,'MLXFastAttentionParams']]],
-  ['ldo_5',['ldo',['../struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c',1,'MLXFastAttentionParams']]],
-  ['ldq_6',['ldq',['../struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58',1,'MLXFastAttentionParams']]],
-  ['lds_7',['lds',['../struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a',1,'MLXFastAttentionParams']]],
-  ['ldv_8',['ldv',['../struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b',1,'MLXFastAttentionParams']]],
-  ['loc_9',['loc',['../structmlx_1_1core_1_1_contiguous_iterator.html#a027b29e06d5cb467d961c019699514b1',1,'mlx::core::ContiguousIterator']]]
+  ['layout_0',['layout',['../structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1',1,'mlx::steel::Layout2D']]],
+  ['lda_1',['lda',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#afec42b532ffcad32bbffd494526bef03',1,'mlx::steel::GEMMParams::lda'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a6fac3c4a7c35af7b46b53f9662f882c6',1,'mlx::steel::GEMMSpiltKParams::lda']]],
+  ['ldb_2',['ldb',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6032a081ab707c14b5f28069faa7cf62',1,'mlx::steel::GEMMParams::ldb'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a7f6f511854ccc98fa573bb560776ebed',1,'mlx::steel::GEMMSpiltKParams::ldb']]],
+  ['ldc_3',['ldc',['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a888730efa5c5c8ae7ed771c3084d583c',1,'mlx::steel::GEMMSpiltKParams::ldc'],['../structmlx_1_1steel_1_1_g_e_m_m_add_m_m_params.html#a801e2245a36632160975a784b762a4e6',1,'mlx::steel::GEMMAddMMParams::ldc']]],
+  ['ldd_4',['ldd',['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a6e8ae14e3f97c499ad9c39358a1855ab',1,'mlx::steel::GEMMParams']]],
+  ['loc_5',['loc',['../structmlx_1_1core_1_1_contiguous_iterator.html#a027b29e06d5cb467d961c019699514b1',1,'mlx::core::ContiguousIterator']]]
 ];
diff --git a/docs/build/html/search/variables_c.js b/docs/build/html/search/variables_c.js
index 1d872a1e7..c2e221058 100644
--- a/docs/build/html/search/variables_c.js
+++ b/docs/build/html/search/variables_c.js
@@ -1,6 +1,6 @@
 var searchData=
 [
-  ['m_0',['M',['../struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8',1,'MLXFastAttentionParams::M'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db',1,'mlx::steel::ImplicitGemmConv2DParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694',1,'mlx::steel::GEMMParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e',1,'mlx::steel::GEMMSpiltKParams::M']]],
+  ['m_0',['M',['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a2117fc93662d5177c8f3e7c2dbb9e2db',1,'mlx::steel::ImplicitGemmConv2DParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a85b20a4c4558cc78d76fcbd045a9c694',1,'mlx::steel::GEMMParams::M'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a8bab0cf8a20d2abefe294a7505917e7e',1,'mlx::steel::GEMMSpiltKParams::M']]],
   ['mask_5fh_1',['mask_h',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a0b892c1a7edb9ed20c076d8945855c19',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
   ['mask_5fw_2',['mask_w',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a19ddba7259c3c2c02ed90f3f635557be',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter']]],
   ['max_3',['max',['../struct_limits.html#a2f0673b6f9da89ce1d64f9f3d74f50a8',1,'Limits::max'],['../struct_limits_3_01uint8__t_01_4.html#a1570fb640e2e41f96776db5ca08d500c',1,'Limits&lt; uint8_t &gt;::max'],['../struct_limits_3_01uint16__t_01_4.html#a228b33556ba4cb7e6137ab6258628488',1,'Limits&lt; uint16_t &gt;::max'],['../struct_limits_3_01uint32__t_01_4.html#a91fa8f7214ec936976a8324c7431c651',1,'Limits&lt; uint32_t &gt;::max'],['../struct_limits_3_01uint64__t_01_4.html#aa8c2257881a4e1fa8596fa07dba5e107',1,'Limits&lt; uint64_t &gt;::max'],['../struct_limits_3_01int8__t_01_4.html#a96fed01fa9249226be69760652643289',1,'Limits&lt; int8_t &gt;::max'],['../struct_limits_3_01int16__t_01_4.html#a12d64c398ca7609b7c906f3cf1a6f678',1,'Limits&lt; int16_t &gt;::max'],['../struct_limits_3_01int32__t_01_4.html#af756344b31e84222dd73d3445dcd5640',1,'Limits&lt; int32_t &gt;::max'],['../struct_limits_3_01int64__t_01_4.html#ac9c420604c0f3d237ddfb2b8a2439224',1,'Limits&lt; int64_t &gt;::max'],['../struct_limits_3_01half_01_4.html#a4f9515dbf2a622074f121bea39a7b175',1,'Limits&lt; half &gt;::max'],['../struct_limits_3_01float_01_4.html#aba172b22b388190aa3969ef16885d8a6',1,'Limits&lt; float &gt;::max'],['../struct_limits_3_01bfloat16__t_01_4.html#a0ead3618da6718629ea9fa4670b5005f',1,'Limits&lt; bfloat16_t &gt;::max'],['../struct_limits_3_01bool_01_4.html#acbd2132145888d51220558a101ffcff4',1,'Limits&lt; bool &gt;::max'],['../struct_limits_3_01complex64__t_01_4.html#ac01c274b224b90f5210b675a484f4607',1,'Limits&lt; complex64_t &gt;::max']]],
diff --git a/docs/build/html/search/variables_d.js b/docs/build/html/search/variables_d.js
index 733ac0ae1..d7f442308 100644
--- a/docs/build/html/search/variables_d.js
+++ b/docs/build/html/search/variables_d.js
@@ -1,16 +1,18 @@
 var searchData=
 [
-  ['n_0',['N',['../struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167',1,'MLXFastAttentionParams::N'],['../struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932',1,'MLXConvParams::N'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83',1,'mlx::steel::ImplicitGemmConv2DParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1',1,'mlx::steel::GEMMParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86',1,'mlx::steel::GEMMSpiltKParams::N']]],
+  ['n_0',['N',['../struct_m_l_x_conv_params.html#ae6b7054dc3cffa8e6aedeb29fa7da932',1,'MLXConvParams::N'],['../structmlx_1_1steel_1_1_implicit_gemm_conv2_d_params.html#a213f5ea4018120d8b61ab82754aaba83',1,'mlx::steel::ImplicitGemmConv2DParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_params.html#a174626ab98515d89923b2841a664b9a1',1,'mlx::steel::GEMMParams::N'],['../structmlx_1_1steel_1_1_g_e_m_m_spilt_k_params.html#a1103e79fb8962812b9a3c9d5c902ff86',1,'mlx::steel::GEMMSpiltKParams::N']]],
   ['n_1',['n',['../struct_read_writer.html#a655346c9ebfc33a69da3f1c1d4238dfb',1,'ReadWriter']]],
   ['n_5fchannels_2',['n_channels',['../structmlx_1_1steel_1_1_channel_helper.html#aa476bd0fcb38494c268547fc9820fc0a',1,'mlx::steel::ChannelHelper::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_011_01_4.html#a06c2fb9c93660e8f6916228cd77f9494',1,'mlx::steel::ChannelHelper&lt; 1 &gt;::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_012_01_4.html#ac66ff37bc2cf78d96667192a6cca73b5',1,'mlx::steel::ChannelHelper&lt; 2 &gt;::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_013_01_4.html#a071c015713b7bab09930661165517eff',1,'mlx::steel::ChannelHelper&lt; 3 &gt;::n_channels'],['../structmlx_1_1steel_1_1_channel_helper_3_014_01_4.html#a167b00a84adf93b60e3d7a943d5eb977',1,'mlx::steel::ChannelHelper&lt; 4 &gt;::n_channels']]],
-  ['n_5fkv_5fheads_3',['N_KV_HEADS',['../struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7',1,'MLXScaledDotProductAttentionParams']]],
-  ['n_5fper_5fblock_4',['N_PER_BLOCK',['../struct_kernel_merge_sort.html#a959aaf5bfb70796a525fed318f7ae8ab',1,'KernelMergeSort::N_PER_BLOCK'],['../struct_kernel_multi_block_merge_sort.html#ae5113ca5852d11999ae932439af95a5c',1,'KernelMultiBlockMergeSort::N_PER_BLOCK']]],
-  ['n_5fq_5fheads_5',['N_Q_HEADS',['../struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177',1,'MLXScaledDotProductAttentionParams']]],
-  ['n_5freads_6',['n_reads',['../struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb',1,'QuantizedBlockLoader']]],
-  ['n_5frows_7',['n_rows',['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b',1,'mlx::steel::Conv2DWeightBlockLoader::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::n_rows'],['../structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5',1,'mlx::steel::BlockLoader::n_rows']]],
-  ['names_8',['names',['../structmlx_1_1core_1_1_node_namer.html#a57823f9a2cdc60b2f06f857b36019277',1,'mlx::core::NodeNamer']]],
-  ['ndim_9',['ndim',['../struct_indices.html#a7dec359e91d0eb2b64e5461b54308313',1,'Indices::ndim'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051',1,'mlx::core::fast::CustomKernelShapeInfo::ndim']]],
-  ['needs_5ftgp_5freduction_10',['needs_tgp_reduction',['../struct_g_e_m_v_kernel.html#ae8113fddf6fb637acfd12efd978b704c',1,'GEMVKernel::needs_tgp_reduction'],['../struct_g_e_m_v_t_kernel.html#a67be7ec69c3791f02e97ccdb00ae0e03',1,'GEMVTKernel::needs_tgp_reduction']]],
-  ['next_11',['next',['../backend_2metal_2allocator_8h.html#ae704ab07eac590091daa5fc4aec7bddb',1,'allocator.h']]],
-  ['number_12',['number',['../namespacemlx_1_1core.html#a069c0aab6b36aef34419534ec4a4310d',1,'mlx::core']]]
+  ['n_5fper_5fblock_3',['N_PER_BLOCK',['../struct_kernel_merge_sort.html#a959aaf5bfb70796a525fed318f7ae8ab',1,'KernelMergeSort::N_PER_BLOCK'],['../struct_kernel_multi_block_merge_sort.html#ae5113ca5852d11999ae932439af95a5c',1,'KernelMultiBlockMergeSort::N_PER_BLOCK']]],
+  ['n_5freads_4',['n_reads',['../struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb',1,'QuantizedBlockLoader']]],
+  ['n_5frows_5',['n_rows',['../structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5',1,'mlx::steel::BlockLoader::n_rows'],['../structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc',1,'mlx::steel::BlockLoaderT::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_large_filter.html#a097c48a23e1bd7d8cf3e9d531397602f',1,'mlx::steel::Conv2DInputBlockLoaderLargeFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_filter.html#a3ec8a92c9e6643c1d5bf8af278026fe8',1,'mlx::steel::Conv2DInputBlockLoaderSmallFilter::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader.html#a593ec140370d53f8c968f6240116d38b',1,'mlx::steel::Conv2DWeightBlockLoader::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_small_channels.html#a8b6c0936c9ad2766242664f034d1115f',1,'mlx::steel::Conv2DInputBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_small_channels.html#ae905e56c1129606e93dbbcd7baed8f0f',1,'mlx::steel::Conv2DWeightBlockLoaderSmallChannels::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_input_block_loader_general.html#abff29c5d96645d9113314c9a997dd7a8',1,'mlx::steel::Conv2DInputBlockLoaderGeneral::n_rows'],['../structmlx_1_1steel_1_1_conv2_d_weight_block_loader_general.html#aaebb6da2cac9961f5edf52d16c18de7d',1,'mlx::steel::Conv2DWeightBlockLoaderGeneral::n_rows']]],
+  ['names_6',['names',['../structmlx_1_1core_1_1_node_namer.html#a57823f9a2cdc60b2f06f857b36019277',1,'mlx::core::NodeNamer']]],
+  ['ndim_7',['ndim',['../struct_indices.html#a7dec359e91d0eb2b64e5461b54308313',1,'Indices::ndim'],['../structmlx_1_1core_1_1fast_1_1_custom_kernel_shape_info.html#ae605df33f449872e3da9777d97008051',1,'mlx::core::fast::CustomKernelShapeInfo::ndim']]],
+  ['needs_5ftgp_5freduction_8',['needs_tgp_reduction',['../struct_g_e_m_v_kernel.html#ae8113fddf6fb637acfd12efd978b704c',1,'GEMVKernel::needs_tgp_reduction'],['../struct_g_e_m_v_t_kernel.html#a67be7ec69c3791f02e97ccdb00ae0e03',1,'GEMVTKernel::needs_tgp_reduction']]],
+  ['next_9',['next',['../backend_2metal_2allocator_8h.html#ae704ab07eac590091daa5fc4aec7bddb',1,'allocator.h']]],
+  ['nk_10',['NK',['../structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e',1,'mlx::steel::AttnParams']]],
+  ['nk_5faligned_11',['NK_aligned',['../structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58',1,'mlx::steel::AttnParams']]],
+  ['nq_12',['NQ',['../structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1',1,'mlx::steel::AttnParams']]],
+  ['nq_5faligned_13',['NQ_aligned',['../structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe',1,'mlx::steel::AttnParams']]],
+  ['number_14',['number',['../namespacemlx_1_1core.html#a069c0aab6b36aef34419534ec4a4310d',1,'mlx::core']]]
 ];
diff --git a/docs/build/html/search/variables_e.js b/docs/build/html/search/variables_e.js
index ed7d09cf6..4d4ca8fb6 100644
--- a/docs/build/html/search/variables_e.js
+++ b/docs/build/html/search/variables_e.js
@@ -1,11 +1,12 @@
 var searchData=
 [
   ['o_0',['O',['../struct_m_l_x_conv_params.html#ad55ff586d30072d8154865f9dfe92d97',1,'MLXConvParams']]],
-  ['offset_1',['offset',['../structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0',1,'looped_elem_to_loc::offset'],['../structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a7aebc0b0656e3a55d0dbca27a57d600e',1,'looped_elem_to_loc&lt; 1, offset_t &gt;::offset']]],
-  ['op_2',['op',['../structmlx_1_1core_1_1_default_strided_reduce.html#ac871f55a7ddd205574974cb4492a240b',1,'mlx::core::DefaultStridedReduce::op'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#a1928f07db988715cc177999e386f4830',1,'mlx::core::DefaultContiguousReduce::op'],['../common_2binary_8h.html#a70228731d29946574b238d21fb4b360c',1,'op:&#160;binary.h']]],
-  ['ortho_3',['ortho',['../structpocketfft_1_1detail_1_1_exec_dcst.html#aea17551a49acaca5e7808dc181d38b7f',1,'pocketfft::detail::ExecDcst']]],
-  ['os_4',['oS',['../struct_m_l_x_conv_params.html#a19ccb9fecfccdc18b6a7f0cc43adbc6e',1,'MLXConvParams']]],
-  ['out_5',['out',['../struct_read_writer.html#abea3b913c952c505d0ca4e529c7316ef',1,'ReadWriter']]],
-  ['out_5fstrides_6',['out_strides',['../struct_m_l_x_conv_params.html#a0c8b2cfc26859a2af9d39a2cfcc3aea6',1,'MLXConvParams']]],
-  ['outputs_7',['outputs',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9',1,'mlx::core::metal::DeviceStream']]]
+  ['o_5fstrides_1',['O_strides',['../structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4',1,'mlx::steel::AttnParams']]],
+  ['offset_2',['offset',['../struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791',1,'LoopedElemToLoc::offset'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a3a18944c158e2747a6ddebb420299a3b',1,'LoopedElemToLoc&lt; 1, OffsetT, true &gt;::offset'],['../struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af792b1fd4e8286f97b9b863c127a2d9a',1,'LoopedElemToLoc&lt; 1, OffsetT, false &gt;::offset']]],
+  ['op_3',['op',['../structmlx_1_1core_1_1_default_strided_reduce.html#ac871f55a7ddd205574974cb4492a240b',1,'mlx::core::DefaultStridedReduce::op'],['../structmlx_1_1core_1_1_default_contiguous_reduce.html#a1928f07db988715cc177999e386f4830',1,'mlx::core::DefaultContiguousReduce::op'],['../common_2binary_8h.html#a70228731d29946574b238d21fb4b360c',1,'op:&#160;binary.h']]],
+  ['ortho_4',['ortho',['../structpocketfft_1_1detail_1_1_exec_dcst.html#aea17551a49acaca5e7808dc181d38b7f',1,'pocketfft::detail::ExecDcst']]],
+  ['os_5',['oS',['../struct_m_l_x_conv_params.html#a19ccb9fecfccdc18b6a7f0cc43adbc6e',1,'MLXConvParams']]],
+  ['out_6',['out',['../struct_read_writer.html#abea3b913c952c505d0ca4e529c7316ef',1,'ReadWriter']]],
+  ['out_5fstrides_7',['out_strides',['../struct_m_l_x_conv_params.html#a0c8b2cfc26859a2af9d39a2cfcc3aea6',1,'MLXConvParams']]],
+  ['outputs_8',['outputs',['../structmlx_1_1core_1_1metal_1_1_device_stream.html#a55a7a92c6abad369c99a5ede7a2521b9',1,'mlx::core::metal::DeviceStream']]]
 ];
diff --git a/docs/build/html/searchindex.js b/docs/build/html/searchindex.js
index 09019b7d5..a53200a55 100644
--- a/docs/build/html/searchindex.js
+++ b/docs/build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"A Simple Example": [[487, "a-simple-example"]], "Array": [[316, null]], "Attention layer": [[5, "attention-layer"]], "Automatic Differentiation": [[481, "automatic-differentiation"]], "Automatic Vectorization": [[481, "automatic-vectorization"]], "Basics": [[485, "basics"]], "Basics of Compile": [[479, "basics-of-compile"]], "Binary Size Minimization": [[8, "binary-size-minimization"]], "Binding to Python": [[2, "binding-to-python"]], "Build Options": [[8, "id3"]], "Build Requirements": [[8, "build-requirements"]], "Build and Install": [[8, null]], "Build from source": [[8, "build-from-source"]], "Building and Binding": [[2, "building-and-binding"]], "Building with CMake": [[2, "building-with-cmake"]], "Building with setuptools": [[2, "building-with-setuptools"]], "C++ API": [[8, "c-api"]], "C++ API Reference": [[7, null]], "Common Optimizers": [[473, null]], "Compilation": [[479, null]], "Compiling Training Graphs": [[479, "compiling-training-graphs"]], "Complex Example": [[1, "complex-example"]], "Conversion to NumPy and Other Frameworks": [[484, null]], "Converting the weights": [[5, "converting-the-weights"]], "Custom Extensions in MLX": [[2, null]], "Custom Metal Kernels": [[1, null]], "Data Types": [[317, null]], "Debugging": [[479, "debugging"]], "Devices and Streams": [[318, null]], "Differences from NumPy": [[482, "differences-from-numpy"]], "Distributed Communication": [[319, null], [480, null]], "Download the code": [[2, null], [5, null]], "Encoder layer": [[5, "encoder-layer"]], "Example Speedup": [[479, "example-speedup"]], "Examples": [[7, null]], "FFT": [[321, null]], "Fast": [[320, null]], "Full model": [[5, "full-model"]], "Function Transforms": [[481, null]], "Function and Graph Transformations": [[485, "function-and-graph-transformations"]], "Functions": [[448, null]], "Further Reading": [[7, null]], "Generation": [[5, "generation"]], "Getting Started": [[480, "getting-started"]], "Grid Sample VJP": [[1, 
"grid-sample-vjp"]], "Implementing the CPU Back-end": [[2, "implementing-the-cpu-back-end"]], "Implementing the GPU Back-end": [[2, "implementing-the-gpu-back-end"]], "Implementing the Primitive": [[2, "implementing-the-primitive"]], "Implementing the model": [[5, "implementing-the-model"]], "In Place Updates": [[482, "in-place-updates"]], "Indexing Arrays": [[482, null]], "Initializers": [[449, null]], "Inspecting Modules": [[324, "inspecting-modules"]], "Install": [[7, null]], "Installing MPI": [[480, "installing-mpi"]], "Introducing the Example": [[2, "introducing-the-example"]], "JAX": [[484, "jax"]], "LLM inference": [[5, null]], "Layers": [[450, null]], "Lazy Evaluation": [[483, null]], "Linear Algebra": [[322, null]], "Linear Regression": [[4, null]], "Loss Functions": [[451, null]], "MLX": [[7, null]], "Metal": [[323, null]], "Metal Debugger": [[3, null]], "Metal not found": [[8, "metal-not-found"]], "Module": [[452, null]], "Multi-Layer Perceptron": [[6, null]], "Neural Networks": [[324, null]], "Only Compute What You Use": [[483, "only-compute-what-you-use"]], "Operations": [[0, null], [2, "operations"], [453, null]], "Operations and Primitives": [[2, "operations-and-primitives"]], "Optimizer": [[474, null]], "Optimizers": [[454, null]], "Parameters": [[324, "parameters"]], "Primitive Transforms": [[2, "primitive-transforms"]], "Primitives": [[2, "primitives"]], "Pure Functions": [[479, "pure-functions"]], "Putting it all together": [[5, "putting-it-all-together"]], "PyTorch": [[484, "pytorch"]], "Python API": [[8, "python-api"]], "Python API Reference": [[7, null]], "Python Installation": [[8, "python-installation"]], "Quick Start Guide": [[485, null]], "Quick Start with Neural Networks": [[324, "quick-start-with-neural-networks"]], "Random": [[476, null]], "Results": [[2, "results"]], "Saving and Loading": [[454, "saving-and-loading"]], "Saving and Loading Arrays": [[486, null]], "Schedulers": [[475, null]], "Scripts": [[2, "scripts"], [5, "scripts"]], 
"Serialization Formats": [[486, "id1"]], "Setting up Remote Hosts": [[480, "setting-up-remote-hosts"]], "Simple Example": [[1, "simple-example"]], "Specifying the Stream": [[488, "specifying-the-stream"]], "Supported Data Types": [[317, "id2"]], "TensorFlow": [[484, "tensorflow"]], "The Module Class": [[324, "the-module-class"]], "Training Example": [[480, "training-example"]], "Transformations with Compile": [[479, "transformations-with-compile"]], "Transforming Compute Graphs": [[483, "transforming-compute-graphs"]], "Transforms": [[477, null]], "Tree Utils": [[478, null]], "Troubleshooting": [[8, "troubleshooting"], [8, "id2"]], "Tuning All Reduce": [[480, "tuning-all-reduce"]], "Unified Memory": [[487, null]], "Updating the Parameters": [[324, "updating-the-parameters"]], "Usage": [[2, "usage"], [7, null]], "Using Shape/Strides": [[1, "using-shape-strides"]], "Using Streams": [[488, null]], "Using the Primitive": [[2, "using-the-primitive"]], "Value and Grad": [[324, "value-and-grad"]], "Weight loading and benchmarking": [[5, "weight-loading-and-benchmarking"]], "When to Evaluate": [[483, "when-to-evaluate"]], "Why Lazy Evaluation": [[483, "why-lazy-evaluation"]], "Xcode Workflow": [[3, "xcode-workflow"]], "mlx.core.Device": [[9, null]], "mlx.core.Dtype": [[10, null]], "mlx.core.DtypeCategory": [[11, null]], "mlx.core.Stream": [[315, null]], "mlx.core.abs": [[12, null]], "mlx.core.add": [[13, null]], "mlx.core.addmm": [[14, null]], "mlx.core.all": [[15, null]], "mlx.core.allclose": [[16, null]], "mlx.core.any": [[17, null]], "mlx.core.arange": [[18, null]], "mlx.core.arccos": [[19, null]], "mlx.core.arccosh": [[20, null]], "mlx.core.arcsin": [[21, null]], "mlx.core.arcsinh": [[22, null]], "mlx.core.arctan": [[23, null]], "mlx.core.arctan2": [[24, null]], "mlx.core.arctanh": [[25, null]], "mlx.core.argmax": [[26, null]], "mlx.core.argmin": [[27, null]], "mlx.core.argpartition": [[28, null]], "mlx.core.argsort": [[29, null]], "mlx.core.array": [[30, null]], 
"mlx.core.array.T": [[31, null]], "mlx.core.array.abs": [[32, null]], "mlx.core.array.all": [[33, null]], "mlx.core.array.any": [[34, null]], "mlx.core.array.argmax": [[35, null]], "mlx.core.array.argmin": [[36, null]], "mlx.core.array.astype": [[37, null]], "mlx.core.array.at": [[38, null]], "mlx.core.array.conj": [[39, null]], "mlx.core.array.cos": [[40, null]], "mlx.core.array.cummax": [[41, null]], "mlx.core.array.cummin": [[42, null]], "mlx.core.array.cumprod": [[43, null]], "mlx.core.array.cumsum": [[44, null]], "mlx.core.array.diag": [[45, null]], "mlx.core.array.diagonal": [[46, null]], "mlx.core.array.dtype": [[47, null]], "mlx.core.array.exp": [[48, null]], "mlx.core.array.flatten": [[49, null]], "mlx.core.array.item": [[50, null]], "mlx.core.array.itemsize": [[51, null]], "mlx.core.array.log": [[52, null]], "mlx.core.array.log10": [[53, null]], "mlx.core.array.log1p": [[54, null]], "mlx.core.array.log2": [[55, null]], "mlx.core.array.logsumexp": [[56, null]], "mlx.core.array.max": [[57, null]], "mlx.core.array.mean": [[58, null]], "mlx.core.array.min": [[59, null]], "mlx.core.array.moveaxis": [[60, null]], "mlx.core.array.nbytes": [[61, null]], "mlx.core.array.ndim": [[62, null]], "mlx.core.array.prod": [[63, null]], "mlx.core.array.reciprocal": [[64, null]], "mlx.core.array.reshape": [[65, null]], "mlx.core.array.round": [[66, null]], "mlx.core.array.rsqrt": [[67, null]], "mlx.core.array.shape": [[68, null]], "mlx.core.array.sin": [[69, null]], "mlx.core.array.size": [[70, null]], "mlx.core.array.split": [[71, null]], "mlx.core.array.sqrt": [[72, null]], "mlx.core.array.square": [[73, null]], "mlx.core.array.squeeze": [[74, null]], "mlx.core.array.std": [[75, null]], "mlx.core.array.sum": [[76, null]], "mlx.core.array.swapaxes": [[77, null]], "mlx.core.array.tolist": [[78, null]], "mlx.core.array.transpose": [[79, null]], "mlx.core.array.var": [[80, null]], "mlx.core.array.view": [[81, null]], "mlx.core.array_equal": [[82, null]], "mlx.core.as_strided": 
[[83, null]], "mlx.core.atleast_1d": [[84, null]], "mlx.core.atleast_2d": [[85, null]], "mlx.core.atleast_3d": [[86, null]], "mlx.core.bitwise_and": [[87, null]], "mlx.core.bitwise_or": [[88, null]], "mlx.core.bitwise_xor": [[89, null]], "mlx.core.block_masked_mm": [[90, null]], "mlx.core.broadcast_to": [[91, null]], "mlx.core.ceil": [[92, null]], "mlx.core.clip": [[93, null]], "mlx.core.compile": [[94, null]], "mlx.core.concatenate": [[95, null]], "mlx.core.conj": [[96, null]], "mlx.core.conjugate": [[97, null]], "mlx.core.conv1d": [[98, null]], "mlx.core.conv2d": [[99, null]], "mlx.core.conv3d": [[100, null]], "mlx.core.conv_general": [[101, null]], "mlx.core.conv_transpose1d": [[102, null]], "mlx.core.conv_transpose2d": [[103, null]], "mlx.core.conv_transpose3d": [[104, null]], "mlx.core.convolve": [[105, null]], "mlx.core.cos": [[106, null]], "mlx.core.cosh": [[107, null]], "mlx.core.cummax": [[108, null]], "mlx.core.cummin": [[109, null]], "mlx.core.cumprod": [[110, null]], "mlx.core.cumsum": [[111, null]], "mlx.core.custom_function": [[112, null]], "mlx.core.default_device": [[113, null]], "mlx.core.default_stream": [[114, null]], "mlx.core.degrees": [[115, null]], "mlx.core.dequantize": [[116, null]], "mlx.core.diag": [[117, null]], "mlx.core.diagonal": [[118, null]], "mlx.core.disable_compile": [[119, null]], "mlx.core.distributed.Group": [[120, null]], "mlx.core.distributed.all_gather": [[121, null]], "mlx.core.distributed.all_sum": [[122, null]], "mlx.core.distributed.init": [[123, null]], "mlx.core.distributed.is_available": [[124, null]], "mlx.core.distributed.recv": [[125, null]], "mlx.core.distributed.recv_like": [[126, null]], "mlx.core.distributed.send": [[127, null]], "mlx.core.divide": [[128, null]], "mlx.core.divmod": [[129, null]], "mlx.core.einsum": [[130, null]], "mlx.core.einsum_path": [[131, null]], "mlx.core.enable_compile": [[132, null]], "mlx.core.equal": [[133, null]], "mlx.core.erf": [[134, null]], "mlx.core.erfinv": [[135, null]], 
"mlx.core.eval": [[136, null]], "mlx.core.exp": [[137, null]], "mlx.core.expand_dims": [[138, null]], "mlx.core.expm1": [[139, null]], "mlx.core.eye": [[140, null]], "mlx.core.fast.affine_quantize": [[141, null]], "mlx.core.fast.layer_norm": [[142, null]], "mlx.core.fast.metal_kernel": [[143, null]], "mlx.core.fast.rms_norm": [[144, null]], "mlx.core.fast.rope": [[145, null]], "mlx.core.fast.scaled_dot_product_attention": [[146, null]], "mlx.core.fft.fft": [[147, null]], "mlx.core.fft.fft2": [[148, null]], "mlx.core.fft.fftn": [[149, null]], "mlx.core.fft.ifft": [[150, null]], "mlx.core.fft.ifft2": [[151, null]], "mlx.core.fft.ifftn": [[152, null]], "mlx.core.fft.irfft": [[153, null]], "mlx.core.fft.irfft2": [[154, null]], "mlx.core.fft.irfftn": [[155, null]], "mlx.core.fft.rfft": [[156, null]], "mlx.core.fft.rfft2": [[157, null]], "mlx.core.fft.rfftn": [[158, null]], "mlx.core.flatten": [[159, null]], "mlx.core.floor": [[160, null]], "mlx.core.floor_divide": [[161, null]], "mlx.core.full": [[162, null]], "mlx.core.gather_mm": [[163, null]], "mlx.core.gather_qmm": [[164, null]], "mlx.core.grad": [[165, null]], "mlx.core.greater": [[166, null]], "mlx.core.greater_equal": [[167, null]], "mlx.core.hadamard_transform": [[168, null]], "mlx.core.identity": [[169, null]], "mlx.core.imag": [[170, null]], "mlx.core.inner": [[171, null]], "mlx.core.isclose": [[172, null]], "mlx.core.isfinite": [[173, null]], "mlx.core.isinf": [[174, null]], "mlx.core.isnan": [[175, null]], "mlx.core.isneginf": [[176, null]], "mlx.core.isposinf": [[177, null]], "mlx.core.issubdtype": [[178, null]], "mlx.core.jvp": [[179, null]], "mlx.core.left_shift": [[180, null]], "mlx.core.less": [[181, null]], "mlx.core.less_equal": [[182, null]], "mlx.core.linalg.cholesky": [[183, null]], "mlx.core.linalg.cholesky_inv": [[184, null]], "mlx.core.linalg.cross": [[185, null]], "mlx.core.linalg.eigh": [[186, null]], "mlx.core.linalg.eigvalsh": [[187, null]], "mlx.core.linalg.inv": [[188, null]], 
"mlx.core.linalg.norm": [[189, null]], "mlx.core.linalg.qr": [[190, null]], "mlx.core.linalg.svd": [[191, null]], "mlx.core.linalg.tri_inv": [[192, null]], "mlx.core.linspace": [[193, null]], "mlx.core.load": [[194, null]], "mlx.core.log": [[195, null]], "mlx.core.log10": [[196, null]], "mlx.core.log1p": [[197, null]], "mlx.core.log2": [[198, null]], "mlx.core.logaddexp": [[199, null]], "mlx.core.logical_and": [[200, null]], "mlx.core.logical_not": [[201, null]], "mlx.core.logical_or": [[202, null]], "mlx.core.logsumexp": [[203, null]], "mlx.core.matmul": [[204, null]], "mlx.core.max": [[205, null]], "mlx.core.maximum": [[206, null]], "mlx.core.mean": [[207, null]], "mlx.core.meshgrid": [[208, null]], "mlx.core.metal.clear_cache": [[209, null]], "mlx.core.metal.device_info": [[210, null]], "mlx.core.metal.get_active_memory": [[211, null]], "mlx.core.metal.get_cache_memory": [[212, null]], "mlx.core.metal.get_peak_memory": [[213, null]], "mlx.core.metal.is_available": [[214, null]], "mlx.core.metal.reset_peak_memory": [[215, null]], "mlx.core.metal.set_cache_limit": [[216, null]], "mlx.core.metal.set_memory_limit": [[217, null]], "mlx.core.metal.set_wired_limit": [[218, null]], "mlx.core.metal.start_capture": [[219, null]], "mlx.core.metal.stop_capture": [[220, null]], "mlx.core.min": [[221, null]], "mlx.core.minimum": [[222, null]], "mlx.core.moveaxis": [[223, null]], "mlx.core.multiply": [[224, null]], "mlx.core.nan_to_num": [[225, null]], "mlx.core.negative": [[226, null]], "mlx.core.new_stream": [[227, null]], "mlx.core.not_equal": [[228, null]], "mlx.core.ones": [[229, null]], "mlx.core.ones_like": [[230, null]], "mlx.core.outer": [[231, null]], "mlx.core.pad": [[232, null]], "mlx.core.partition": [[233, null]], "mlx.core.power": [[234, null]], "mlx.core.prod": [[235, null]], "mlx.core.put_along_axis": [[236, null]], "mlx.core.quantize": [[237, null]], "mlx.core.quantized_matmul": [[238, null]], "mlx.core.radians": [[239, null]], "mlx.core.random.bernoulli": 
[[240, null]], "mlx.core.random.categorical": [[241, null]], "mlx.core.random.gumbel": [[242, null]], "mlx.core.random.key": [[243, null]], "mlx.core.random.laplace": [[244, null]], "mlx.core.random.multivariate_normal": [[245, null]], "mlx.core.random.normal": [[246, null]], "mlx.core.random.permutation": [[247, null]], "mlx.core.random.randint": [[248, null]], "mlx.core.random.seed": [[249, null]], "mlx.core.random.split": [[250, null]], "mlx.core.random.truncated_normal": [[251, null]], "mlx.core.random.uniform": [[252, null]], "mlx.core.real": [[253, null]], "mlx.core.reciprocal": [[254, null]], "mlx.core.remainder": [[255, null]], "mlx.core.repeat": [[256, null]], "mlx.core.reshape": [[257, null]], "mlx.core.right_shift": [[258, null]], "mlx.core.roll": [[259, null]], "mlx.core.round": [[260, null]], "mlx.core.rsqrt": [[261, null]], "mlx.core.save": [[262, null]], "mlx.core.save_gguf": [[263, null]], "mlx.core.save_safetensors": [[264, null]], "mlx.core.savez": [[265, null]], "mlx.core.savez_compressed": [[266, null]], "mlx.core.set_default_device": [[267, null]], "mlx.core.set_default_stream": [[268, null]], "mlx.core.sigmoid": [[269, null]], "mlx.core.sign": [[270, null]], "mlx.core.sin": [[271, null]], "mlx.core.sinh": [[272, null]], "mlx.core.softmax": [[273, null]], "mlx.core.sort": [[274, null]], "mlx.core.split": [[275, null]], "mlx.core.sqrt": [[276, null]], "mlx.core.square": [[277, null]], "mlx.core.squeeze": [[278, null]], "mlx.core.stack": [[279, null]], "mlx.core.std": [[280, null]], "mlx.core.stop_gradient": [[281, null]], "mlx.core.stream": [[282, null]], "mlx.core.subtract": [[283, null]], "mlx.core.sum": [[284, null]], "mlx.core.swapaxes": [[285, null]], "mlx.core.synchronize": [[286, null]], "mlx.core.take": [[287, null]], "mlx.core.take_along_axis": [[288, null]], "mlx.core.tan": [[289, null]], "mlx.core.tanh": [[290, null]], "mlx.core.tensordot": [[291, null]], "mlx.core.tile": [[292, null]], "mlx.core.topk": [[293, null]], 
"mlx.core.trace": [[294, null]], "mlx.core.transpose": [[295, null]], "mlx.core.tri": [[296, null]], "mlx.core.tril": [[297, null]], "mlx.core.triu": [[298, null]], "mlx.core.value_and_grad": [[299, null]], "mlx.core.var": [[300, null]], "mlx.core.view": [[301, null]], "mlx.core.vjp": [[302, null]], "mlx.core.vmap": [[303, null]], "mlx.core.where": [[304, null]], "mlx.core.zeros": [[305, null]], "mlx.core.zeros_like": [[306, null]], "mlx.nn.ALiBi": [[325, null]], "mlx.nn.AvgPool1d": [[326, null]], "mlx.nn.AvgPool2d": [[327, null]], "mlx.nn.BatchNorm": [[328, null]], "mlx.nn.CELU": [[329, null]], "mlx.nn.Conv1d": [[330, null]], "mlx.nn.Conv2d": [[331, null]], "mlx.nn.Conv3d": [[332, null]], "mlx.nn.ConvTranspose1d": [[333, null]], "mlx.nn.ConvTranspose2d": [[334, null]], "mlx.nn.ConvTranspose3d": [[335, null]], "mlx.nn.Dropout": [[336, null]], "mlx.nn.Dropout2d": [[337, null]], "mlx.nn.Dropout3d": [[338, null]], "mlx.nn.ELU": [[339, null]], "mlx.nn.Embedding": [[340, null]], "mlx.nn.GELU": [[341, null]], "mlx.nn.GLU": [[342, null]], "mlx.nn.GRU": [[343, null]], "mlx.nn.GroupNorm": [[344, null]], "mlx.nn.HardShrink": [[345, null]], "mlx.nn.HardTanh": [[346, null]], "mlx.nn.Hardswish": [[347, null]], "mlx.nn.InstanceNorm": [[348, null]], "mlx.nn.LSTM": [[349, null]], "mlx.nn.LayerNorm": [[350, null]], "mlx.nn.LeakyReLU": [[351, null]], "mlx.nn.Linear": [[352, null]], "mlx.nn.LogSigmoid": [[353, null]], "mlx.nn.LogSoftmax": [[354, null]], "mlx.nn.MaxPool1d": [[355, null]], "mlx.nn.MaxPool2d": [[356, null]], "mlx.nn.Mish": [[357, null]], "mlx.nn.Module.apply": [[358, null]], "mlx.nn.Module.apply_to_modules": [[359, null]], "mlx.nn.Module.children": [[360, null]], "mlx.nn.Module.eval": [[361, null]], "mlx.nn.Module.filter_and_map": [[362, null]], "mlx.nn.Module.freeze": [[363, null]], "mlx.nn.Module.leaf_modules": [[364, null]], "mlx.nn.Module.load_weights": [[365, null]], "mlx.nn.Module.modules": [[366, null]], "mlx.nn.Module.named_modules": [[367, null]], 
"mlx.nn.Module.parameters": [[368, null]], "mlx.nn.Module.save_weights": [[369, null]], "mlx.nn.Module.set_dtype": [[370, null]], "mlx.nn.Module.state": [[371, null]], "mlx.nn.Module.train": [[372, null]], "mlx.nn.Module.trainable_parameters": [[373, null]], "mlx.nn.Module.training": [[374, null]], "mlx.nn.Module.unfreeze": [[375, null]], "mlx.nn.Module.update": [[376, null]], "mlx.nn.Module.update_modules": [[377, null]], "mlx.nn.MultiHeadAttention": [[378, null]], "mlx.nn.PReLU": [[379, null]], "mlx.nn.QuantizedEmbedding": [[380, null]], "mlx.nn.QuantizedLinear": [[381, null]], "mlx.nn.RMSNorm": [[382, null]], "mlx.nn.RNN": [[383, null]], "mlx.nn.ReLU": [[384, null]], "mlx.nn.ReLU6": [[385, null]], "mlx.nn.RoPE": [[386, null]], "mlx.nn.SELU": [[387, null]], "mlx.nn.Sequential": [[388, null]], "mlx.nn.SiLU": [[389, null]], "mlx.nn.Sigmoid": [[390, null]], "mlx.nn.SinusoidalPositionalEncoding": [[391, null]], "mlx.nn.Softmax": [[392, null]], "mlx.nn.Softmin": [[393, null]], "mlx.nn.Softplus": [[394, null]], "mlx.nn.Softshrink": [[395, null]], "mlx.nn.Softsign": [[396, null]], "mlx.nn.Step": [[397, null]], "mlx.nn.Tanh": [[398, null]], "mlx.nn.Transformer": [[399, null]], "mlx.nn.Upsample": [[400, null]], "mlx.nn.celu": [[409, null]], "mlx.nn.elu": [[410, null]], "mlx.nn.gelu": [[411, null]], "mlx.nn.gelu_approx": [[412, null]], "mlx.nn.gelu_fast_approx": [[413, null]], "mlx.nn.glu": [[414, null]], "mlx.nn.hard_shrink": [[415, null]], "mlx.nn.hard_tanh": [[416, null]], "mlx.nn.hardswish": [[417, null]], "mlx.nn.init.constant": [[401, null]], "mlx.nn.init.glorot_normal": [[402, null]], "mlx.nn.init.glorot_uniform": [[403, null]], "mlx.nn.init.he_normal": [[404, null]], "mlx.nn.init.he_uniform": [[405, null]], "mlx.nn.init.identity": [[406, null]], "mlx.nn.init.normal": [[407, null]], "mlx.nn.init.uniform": [[408, null]], "mlx.nn.leaky_relu": [[418, null]], "mlx.nn.log_sigmoid": [[419, null]], "mlx.nn.log_softmax": [[420, null]], "mlx.nn.losses.binary_cross_entropy": 
[[421, null]], "mlx.nn.losses.cosine_similarity_loss": [[422, null]], "mlx.nn.losses.cross_entropy": [[423, null]], "mlx.nn.losses.gaussian_nll_loss": [[424, null]], "mlx.nn.losses.hinge_loss": [[425, null]], "mlx.nn.losses.huber_loss": [[426, null]], "mlx.nn.losses.kl_div_loss": [[427, null]], "mlx.nn.losses.l1_loss": [[428, null]], "mlx.nn.losses.log_cosh_loss": [[429, null]], "mlx.nn.losses.margin_ranking_loss": [[430, null]], "mlx.nn.losses.mse_loss": [[431, null]], "mlx.nn.losses.nll_loss": [[432, null]], "mlx.nn.losses.smooth_l1_loss": [[433, null]], "mlx.nn.losses.triplet_loss": [[434, null]], "mlx.nn.mish": [[435, null]], "mlx.nn.prelu": [[436, null]], "mlx.nn.quantize": [[307, null]], "mlx.nn.relu": [[437, null]], "mlx.nn.relu6": [[438, null]], "mlx.nn.selu": [[439, null]], "mlx.nn.sigmoid": [[440, null]], "mlx.nn.silu": [[441, null]], "mlx.nn.softmax": [[442, null]], "mlx.nn.softmin": [[443, null]], "mlx.nn.softplus": [[444, null]], "mlx.nn.softshrink": [[445, null]], "mlx.nn.step": [[446, null]], "mlx.nn.tanh": [[447, null]], "mlx.nn.value_and_grad": [[308, null]], "mlx.optimizers.AdaDelta": [[455, null]], "mlx.optimizers.Adafactor": [[456, null]], "mlx.optimizers.Adagrad": [[457, null]], "mlx.optimizers.Adam": [[458, null]], "mlx.optimizers.AdamW": [[459, null]], "mlx.optimizers.Adamax": [[460, null]], "mlx.optimizers.Lion": [[461, null]], "mlx.optimizers.Optimizer.apply_gradients": [[462, null]], "mlx.optimizers.Optimizer.init": [[463, null]], "mlx.optimizers.Optimizer.state": [[464, null]], "mlx.optimizers.Optimizer.update": [[465, null]], "mlx.optimizers.RMSprop": [[466, null]], "mlx.optimizers.SGD": [[467, null]], "mlx.optimizers.clip_grad_norm": [[309, null]], "mlx.optimizers.cosine_decay": [[468, null]], "mlx.optimizers.exponential_decay": [[469, null]], "mlx.optimizers.join_schedules": [[470, null]], "mlx.optimizers.linear_schedule": [[471, null]], "mlx.optimizers.step_decay": [[472, null]], "mlx.utils.tree_flatten": [[310, null]], 
"mlx.utils.tree_map": [[311, null]], "mlx.utils.tree_map_with_path": [[312, null]], "mlx.utils.tree_reduce": [[313, null]], "mlx.utils.tree_unflatten": [[314, null]], "x86 Shell": [[8, "x86-shell"]]}, "docnames": ["cpp/ops", "dev/custom_metal_kernels", "dev/extensions", "dev/metal_debugger", "examples/linear_regression", "examples/llama-inference", "examples/mlp", "index", "install", "python/_autosummary/mlx.core.Device", "python/_autosummary/mlx.core.Dtype", "python/_autosummary/mlx.core.DtypeCategory", "python/_autosummary/mlx.core.abs", "python/_autosummary/mlx.core.add", "python/_autosummary/mlx.core.addmm", "python/_autosummary/mlx.core.all", "python/_autosummary/mlx.core.allclose", "python/_autosummary/mlx.core.any", "python/_autosummary/mlx.core.arange", "python/_autosummary/mlx.core.arccos", "python/_autosummary/mlx.core.arccosh", "python/_autosummary/mlx.core.arcsin", "python/_autosummary/mlx.core.arcsinh", "python/_autosummary/mlx.core.arctan", "python/_autosummary/mlx.core.arctan2", "python/_autosummary/mlx.core.arctanh", "python/_autosummary/mlx.core.argmax", "python/_autosummary/mlx.core.argmin", "python/_autosummary/mlx.core.argpartition", "python/_autosummary/mlx.core.argsort", "python/_autosummary/mlx.core.array", "python/_autosummary/mlx.core.array.T", "python/_autosummary/mlx.core.array.abs", "python/_autosummary/mlx.core.array.all", "python/_autosummary/mlx.core.array.any", "python/_autosummary/mlx.core.array.argmax", "python/_autosummary/mlx.core.array.argmin", "python/_autosummary/mlx.core.array.astype", "python/_autosummary/mlx.core.array.at", "python/_autosummary/mlx.core.array.conj", "python/_autosummary/mlx.core.array.cos", "python/_autosummary/mlx.core.array.cummax", "python/_autosummary/mlx.core.array.cummin", "python/_autosummary/mlx.core.array.cumprod", "python/_autosummary/mlx.core.array.cumsum", "python/_autosummary/mlx.core.array.diag", "python/_autosummary/mlx.core.array.diagonal", "python/_autosummary/mlx.core.array.dtype", 
"python/_autosummary/mlx.core.array.exp", "python/_autosummary/mlx.core.array.flatten", "python/_autosummary/mlx.core.array.item", "python/_autosummary/mlx.core.array.itemsize", "python/_autosummary/mlx.core.array.log", "python/_autosummary/mlx.core.array.log10", "python/_autosummary/mlx.core.array.log1p", "python/_autosummary/mlx.core.array.log2", "python/_autosummary/mlx.core.array.logsumexp", "python/_autosummary/mlx.core.array.max", "python/_autosummary/mlx.core.array.mean", "python/_autosummary/mlx.core.array.min", "python/_autosummary/mlx.core.array.moveaxis", "python/_autosummary/mlx.core.array.nbytes", "python/_autosummary/mlx.core.array.ndim", "python/_autosummary/mlx.core.array.prod", "python/_autosummary/mlx.core.array.reciprocal", "python/_autosummary/mlx.core.array.reshape", "python/_autosummary/mlx.core.array.round", "python/_autosummary/mlx.core.array.rsqrt", "python/_autosummary/mlx.core.array.shape", "python/_autosummary/mlx.core.array.sin", "python/_autosummary/mlx.core.array.size", "python/_autosummary/mlx.core.array.split", "python/_autosummary/mlx.core.array.sqrt", "python/_autosummary/mlx.core.array.square", "python/_autosummary/mlx.core.array.squeeze", "python/_autosummary/mlx.core.array.std", "python/_autosummary/mlx.core.array.sum", "python/_autosummary/mlx.core.array.swapaxes", "python/_autosummary/mlx.core.array.tolist", "python/_autosummary/mlx.core.array.transpose", "python/_autosummary/mlx.core.array.var", "python/_autosummary/mlx.core.array.view", "python/_autosummary/mlx.core.array_equal", "python/_autosummary/mlx.core.as_strided", "python/_autosummary/mlx.core.atleast_1d", "python/_autosummary/mlx.core.atleast_2d", "python/_autosummary/mlx.core.atleast_3d", "python/_autosummary/mlx.core.bitwise_and", "python/_autosummary/mlx.core.bitwise_or", "python/_autosummary/mlx.core.bitwise_xor", "python/_autosummary/mlx.core.block_masked_mm", "python/_autosummary/mlx.core.broadcast_to", "python/_autosummary/mlx.core.ceil", 
"python/_autosummary/mlx.core.clip", "python/_autosummary/mlx.core.compile", "python/_autosummary/mlx.core.concatenate", "python/_autosummary/mlx.core.conj", "python/_autosummary/mlx.core.conjugate", "python/_autosummary/mlx.core.conv1d", "python/_autosummary/mlx.core.conv2d", "python/_autosummary/mlx.core.conv3d", "python/_autosummary/mlx.core.conv_general", "python/_autosummary/mlx.core.conv_transpose1d", "python/_autosummary/mlx.core.conv_transpose2d", "python/_autosummary/mlx.core.conv_transpose3d", "python/_autosummary/mlx.core.convolve", "python/_autosummary/mlx.core.cos", "python/_autosummary/mlx.core.cosh", "python/_autosummary/mlx.core.cummax", "python/_autosummary/mlx.core.cummin", "python/_autosummary/mlx.core.cumprod", "python/_autosummary/mlx.core.cumsum", "python/_autosummary/mlx.core.custom_function", "python/_autosummary/mlx.core.default_device", "python/_autosummary/mlx.core.default_stream", "python/_autosummary/mlx.core.degrees", "python/_autosummary/mlx.core.dequantize", "python/_autosummary/mlx.core.diag", "python/_autosummary/mlx.core.diagonal", "python/_autosummary/mlx.core.disable_compile", "python/_autosummary/mlx.core.distributed.Group", "python/_autosummary/mlx.core.distributed.all_gather", "python/_autosummary/mlx.core.distributed.all_sum", "python/_autosummary/mlx.core.distributed.init", "python/_autosummary/mlx.core.distributed.is_available", "python/_autosummary/mlx.core.distributed.recv", "python/_autosummary/mlx.core.distributed.recv_like", "python/_autosummary/mlx.core.distributed.send", "python/_autosummary/mlx.core.divide", "python/_autosummary/mlx.core.divmod", "python/_autosummary/mlx.core.einsum", "python/_autosummary/mlx.core.einsum_path", "python/_autosummary/mlx.core.enable_compile", "python/_autosummary/mlx.core.equal", "python/_autosummary/mlx.core.erf", "python/_autosummary/mlx.core.erfinv", "python/_autosummary/mlx.core.eval", "python/_autosummary/mlx.core.exp", "python/_autosummary/mlx.core.expand_dims", 
"python/_autosummary/mlx.core.expm1", "python/_autosummary/mlx.core.eye", "python/_autosummary/mlx.core.fast.affine_quantize", "python/_autosummary/mlx.core.fast.layer_norm", "python/_autosummary/mlx.core.fast.metal_kernel", "python/_autosummary/mlx.core.fast.rms_norm", "python/_autosummary/mlx.core.fast.rope", "python/_autosummary/mlx.core.fast.scaled_dot_product_attention", "python/_autosummary/mlx.core.fft.fft", "python/_autosummary/mlx.core.fft.fft2", "python/_autosummary/mlx.core.fft.fftn", "python/_autosummary/mlx.core.fft.ifft", "python/_autosummary/mlx.core.fft.ifft2", "python/_autosummary/mlx.core.fft.ifftn", "python/_autosummary/mlx.core.fft.irfft", "python/_autosummary/mlx.core.fft.irfft2", "python/_autosummary/mlx.core.fft.irfftn", "python/_autosummary/mlx.core.fft.rfft", "python/_autosummary/mlx.core.fft.rfft2", "python/_autosummary/mlx.core.fft.rfftn", "python/_autosummary/mlx.core.flatten", "python/_autosummary/mlx.core.floor", "python/_autosummary/mlx.core.floor_divide", "python/_autosummary/mlx.core.full", "python/_autosummary/mlx.core.gather_mm", "python/_autosummary/mlx.core.gather_qmm", "python/_autosummary/mlx.core.grad", "python/_autosummary/mlx.core.greater", "python/_autosummary/mlx.core.greater_equal", "python/_autosummary/mlx.core.hadamard_transform", "python/_autosummary/mlx.core.identity", "python/_autosummary/mlx.core.imag", "python/_autosummary/mlx.core.inner", "python/_autosummary/mlx.core.isclose", "python/_autosummary/mlx.core.isfinite", "python/_autosummary/mlx.core.isinf", "python/_autosummary/mlx.core.isnan", "python/_autosummary/mlx.core.isneginf", "python/_autosummary/mlx.core.isposinf", "python/_autosummary/mlx.core.issubdtype", "python/_autosummary/mlx.core.jvp", "python/_autosummary/mlx.core.left_shift", "python/_autosummary/mlx.core.less", "python/_autosummary/mlx.core.less_equal", "python/_autosummary/mlx.core.linalg.cholesky", "python/_autosummary/mlx.core.linalg.cholesky_inv", "python/_autosummary/mlx.core.linalg.cross", 
"python/_autosummary/mlx.core.linalg.eigh", "python/_autosummary/mlx.core.linalg.eigvalsh", "python/_autosummary/mlx.core.linalg.inv", "python/_autosummary/mlx.core.linalg.norm", "python/_autosummary/mlx.core.linalg.qr", "python/_autosummary/mlx.core.linalg.svd", "python/_autosummary/mlx.core.linalg.tri_inv", "python/_autosummary/mlx.core.linspace", "python/_autosummary/mlx.core.load", "python/_autosummary/mlx.core.log", "python/_autosummary/mlx.core.log10", "python/_autosummary/mlx.core.log1p", "python/_autosummary/mlx.core.log2", "python/_autosummary/mlx.core.logaddexp", "python/_autosummary/mlx.core.logical_and", "python/_autosummary/mlx.core.logical_not", "python/_autosummary/mlx.core.logical_or", "python/_autosummary/mlx.core.logsumexp", "python/_autosummary/mlx.core.matmul", "python/_autosummary/mlx.core.max", "python/_autosummary/mlx.core.maximum", "python/_autosummary/mlx.core.mean", "python/_autosummary/mlx.core.meshgrid", "python/_autosummary/mlx.core.metal.clear_cache", "python/_autosummary/mlx.core.metal.device_info", "python/_autosummary/mlx.core.metal.get_active_memory", "python/_autosummary/mlx.core.metal.get_cache_memory", "python/_autosummary/mlx.core.metal.get_peak_memory", "python/_autosummary/mlx.core.metal.is_available", "python/_autosummary/mlx.core.metal.reset_peak_memory", "python/_autosummary/mlx.core.metal.set_cache_limit", "python/_autosummary/mlx.core.metal.set_memory_limit", "python/_autosummary/mlx.core.metal.set_wired_limit", "python/_autosummary/mlx.core.metal.start_capture", "python/_autosummary/mlx.core.metal.stop_capture", "python/_autosummary/mlx.core.min", "python/_autosummary/mlx.core.minimum", "python/_autosummary/mlx.core.moveaxis", "python/_autosummary/mlx.core.multiply", "python/_autosummary/mlx.core.nan_to_num", "python/_autosummary/mlx.core.negative", "python/_autosummary/mlx.core.new_stream", "python/_autosummary/mlx.core.not_equal", "python/_autosummary/mlx.core.ones", "python/_autosummary/mlx.core.ones_like", 
"python/_autosummary/mlx.core.outer", "python/_autosummary/mlx.core.pad", "python/_autosummary/mlx.core.partition", "python/_autosummary/mlx.core.power", "python/_autosummary/mlx.core.prod", "python/_autosummary/mlx.core.put_along_axis", "python/_autosummary/mlx.core.quantize", "python/_autosummary/mlx.core.quantized_matmul", "python/_autosummary/mlx.core.radians", "python/_autosummary/mlx.core.random.bernoulli", "python/_autosummary/mlx.core.random.categorical", "python/_autosummary/mlx.core.random.gumbel", "python/_autosummary/mlx.core.random.key", "python/_autosummary/mlx.core.random.laplace", "python/_autosummary/mlx.core.random.multivariate_normal", "python/_autosummary/mlx.core.random.normal", "python/_autosummary/mlx.core.random.permutation", "python/_autosummary/mlx.core.random.randint", "python/_autosummary/mlx.core.random.seed", "python/_autosummary/mlx.core.random.split", "python/_autosummary/mlx.core.random.truncated_normal", "python/_autosummary/mlx.core.random.uniform", "python/_autosummary/mlx.core.real", "python/_autosummary/mlx.core.reciprocal", "python/_autosummary/mlx.core.remainder", "python/_autosummary/mlx.core.repeat", "python/_autosummary/mlx.core.reshape", "python/_autosummary/mlx.core.right_shift", "python/_autosummary/mlx.core.roll", "python/_autosummary/mlx.core.round", "python/_autosummary/mlx.core.rsqrt", "python/_autosummary/mlx.core.save", "python/_autosummary/mlx.core.save_gguf", "python/_autosummary/mlx.core.save_safetensors", "python/_autosummary/mlx.core.savez", "python/_autosummary/mlx.core.savez_compressed", "python/_autosummary/mlx.core.set_default_device", "python/_autosummary/mlx.core.set_default_stream", "python/_autosummary/mlx.core.sigmoid", "python/_autosummary/mlx.core.sign", "python/_autosummary/mlx.core.sin", "python/_autosummary/mlx.core.sinh", "python/_autosummary/mlx.core.softmax", "python/_autosummary/mlx.core.sort", "python/_autosummary/mlx.core.split", "python/_autosummary/mlx.core.sqrt", 
"python/_autosummary/mlx.core.square", "python/_autosummary/mlx.core.squeeze", "python/_autosummary/mlx.core.stack", "python/_autosummary/mlx.core.std", "python/_autosummary/mlx.core.stop_gradient", "python/_autosummary/mlx.core.stream", "python/_autosummary/mlx.core.subtract", "python/_autosummary/mlx.core.sum", "python/_autosummary/mlx.core.swapaxes", "python/_autosummary/mlx.core.synchronize", "python/_autosummary/mlx.core.take", "python/_autosummary/mlx.core.take_along_axis", "python/_autosummary/mlx.core.tan", "python/_autosummary/mlx.core.tanh", "python/_autosummary/mlx.core.tensordot", "python/_autosummary/mlx.core.tile", "python/_autosummary/mlx.core.topk", "python/_autosummary/mlx.core.trace", "python/_autosummary/mlx.core.transpose", "python/_autosummary/mlx.core.tri", "python/_autosummary/mlx.core.tril", "python/_autosummary/mlx.core.triu", "python/_autosummary/mlx.core.value_and_grad", "python/_autosummary/mlx.core.var", "python/_autosummary/mlx.core.view", "python/_autosummary/mlx.core.vjp", "python/_autosummary/mlx.core.vmap", "python/_autosummary/mlx.core.where", "python/_autosummary/mlx.core.zeros", "python/_autosummary/mlx.core.zeros_like", "python/_autosummary/mlx.nn.quantize", "python/_autosummary/mlx.nn.value_and_grad", "python/_autosummary/mlx.optimizers.clip_grad_norm", "python/_autosummary/mlx.utils.tree_flatten", "python/_autosummary/mlx.utils.tree_map", "python/_autosummary/mlx.utils.tree_map_with_path", "python/_autosummary/mlx.utils.tree_reduce", "python/_autosummary/mlx.utils.tree_unflatten", "python/_autosummary/stream_class", "python/array", "python/data_types", "python/devices_and_streams", "python/distributed", "python/fast", "python/fft", "python/linalg", "python/metal", "python/nn", "python/nn/_autosummary/mlx.nn.ALiBi", "python/nn/_autosummary/mlx.nn.AvgPool1d", "python/nn/_autosummary/mlx.nn.AvgPool2d", "python/nn/_autosummary/mlx.nn.BatchNorm", "python/nn/_autosummary/mlx.nn.CELU", "python/nn/_autosummary/mlx.nn.Conv1d", 
"python/nn/_autosummary/mlx.nn.Conv2d", "python/nn/_autosummary/mlx.nn.Conv3d", "python/nn/_autosummary/mlx.nn.ConvTranspose1d", "python/nn/_autosummary/mlx.nn.ConvTranspose2d", "python/nn/_autosummary/mlx.nn.ConvTranspose3d", "python/nn/_autosummary/mlx.nn.Dropout", "python/nn/_autosummary/mlx.nn.Dropout2d", "python/nn/_autosummary/mlx.nn.Dropout3d", "python/nn/_autosummary/mlx.nn.ELU", "python/nn/_autosummary/mlx.nn.Embedding", "python/nn/_autosummary/mlx.nn.GELU", "python/nn/_autosummary/mlx.nn.GLU", "python/nn/_autosummary/mlx.nn.GRU", "python/nn/_autosummary/mlx.nn.GroupNorm", "python/nn/_autosummary/mlx.nn.HardShrink", "python/nn/_autosummary/mlx.nn.HardTanh", "python/nn/_autosummary/mlx.nn.Hardswish", "python/nn/_autosummary/mlx.nn.InstanceNorm", "python/nn/_autosummary/mlx.nn.LSTM", "python/nn/_autosummary/mlx.nn.LayerNorm", "python/nn/_autosummary/mlx.nn.LeakyReLU", "python/nn/_autosummary/mlx.nn.Linear", "python/nn/_autosummary/mlx.nn.LogSigmoid", "python/nn/_autosummary/mlx.nn.LogSoftmax", "python/nn/_autosummary/mlx.nn.MaxPool1d", "python/nn/_autosummary/mlx.nn.MaxPool2d", "python/nn/_autosummary/mlx.nn.Mish", "python/nn/_autosummary/mlx.nn.Module.apply", "python/nn/_autosummary/mlx.nn.Module.apply_to_modules", "python/nn/_autosummary/mlx.nn.Module.children", "python/nn/_autosummary/mlx.nn.Module.eval", "python/nn/_autosummary/mlx.nn.Module.filter_and_map", "python/nn/_autosummary/mlx.nn.Module.freeze", "python/nn/_autosummary/mlx.nn.Module.leaf_modules", "python/nn/_autosummary/mlx.nn.Module.load_weights", "python/nn/_autosummary/mlx.nn.Module.modules", "python/nn/_autosummary/mlx.nn.Module.named_modules", "python/nn/_autosummary/mlx.nn.Module.parameters", "python/nn/_autosummary/mlx.nn.Module.save_weights", "python/nn/_autosummary/mlx.nn.Module.set_dtype", "python/nn/_autosummary/mlx.nn.Module.state", "python/nn/_autosummary/mlx.nn.Module.train", "python/nn/_autosummary/mlx.nn.Module.trainable_parameters", 
"python/nn/_autosummary/mlx.nn.Module.training", "python/nn/_autosummary/mlx.nn.Module.unfreeze", "python/nn/_autosummary/mlx.nn.Module.update", "python/nn/_autosummary/mlx.nn.Module.update_modules", "python/nn/_autosummary/mlx.nn.MultiHeadAttention", "python/nn/_autosummary/mlx.nn.PReLU", "python/nn/_autosummary/mlx.nn.QuantizedEmbedding", "python/nn/_autosummary/mlx.nn.QuantizedLinear", "python/nn/_autosummary/mlx.nn.RMSNorm", "python/nn/_autosummary/mlx.nn.RNN", "python/nn/_autosummary/mlx.nn.ReLU", "python/nn/_autosummary/mlx.nn.ReLU6", "python/nn/_autosummary/mlx.nn.RoPE", "python/nn/_autosummary/mlx.nn.SELU", "python/nn/_autosummary/mlx.nn.Sequential", "python/nn/_autosummary/mlx.nn.SiLU", "python/nn/_autosummary/mlx.nn.Sigmoid", "python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding", "python/nn/_autosummary/mlx.nn.Softmax", "python/nn/_autosummary/mlx.nn.Softmin", "python/nn/_autosummary/mlx.nn.Softplus", "python/nn/_autosummary/mlx.nn.Softshrink", "python/nn/_autosummary/mlx.nn.Softsign", "python/nn/_autosummary/mlx.nn.Step", "python/nn/_autosummary/mlx.nn.Tanh", "python/nn/_autosummary/mlx.nn.Transformer", "python/nn/_autosummary/mlx.nn.Upsample", "python/nn/_autosummary/mlx.nn.init.constant", "python/nn/_autosummary/mlx.nn.init.glorot_normal", "python/nn/_autosummary/mlx.nn.init.glorot_uniform", "python/nn/_autosummary/mlx.nn.init.he_normal", "python/nn/_autosummary/mlx.nn.init.he_uniform", "python/nn/_autosummary/mlx.nn.init.identity", "python/nn/_autosummary/mlx.nn.init.normal", "python/nn/_autosummary/mlx.nn.init.uniform", "python/nn/_autosummary_functions/mlx.nn.celu", "python/nn/_autosummary_functions/mlx.nn.elu", "python/nn/_autosummary_functions/mlx.nn.gelu", "python/nn/_autosummary_functions/mlx.nn.gelu_approx", "python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx", "python/nn/_autosummary_functions/mlx.nn.glu", "python/nn/_autosummary_functions/mlx.nn.hard_shrink", "python/nn/_autosummary_functions/mlx.nn.hard_tanh", 
"python/nn/_autosummary_functions/mlx.nn.hardswish", "python/nn/_autosummary_functions/mlx.nn.leaky_relu", "python/nn/_autosummary_functions/mlx.nn.log_sigmoid", "python/nn/_autosummary_functions/mlx.nn.log_softmax", "python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy", "python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss", "python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy", "python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss", "python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss", "python/nn/_autosummary_functions/mlx.nn.losses.huber_loss", "python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss", "python/nn/_autosummary_functions/mlx.nn.losses.l1_loss", "python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss", "python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss", "python/nn/_autosummary_functions/mlx.nn.losses.mse_loss", "python/nn/_autosummary_functions/mlx.nn.losses.nll_loss", "python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss", "python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss", "python/nn/_autosummary_functions/mlx.nn.mish", "python/nn/_autosummary_functions/mlx.nn.prelu", "python/nn/_autosummary_functions/mlx.nn.relu", "python/nn/_autosummary_functions/mlx.nn.relu6", "python/nn/_autosummary_functions/mlx.nn.selu", "python/nn/_autosummary_functions/mlx.nn.sigmoid", "python/nn/_autosummary_functions/mlx.nn.silu", "python/nn/_autosummary_functions/mlx.nn.softmax", "python/nn/_autosummary_functions/mlx.nn.softmin", "python/nn/_autosummary_functions/mlx.nn.softplus", "python/nn/_autosummary_functions/mlx.nn.softshrink", "python/nn/_autosummary_functions/mlx.nn.step", "python/nn/_autosummary_functions/mlx.nn.tanh", "python/nn/functions", "python/nn/init", "python/nn/layers", "python/nn/losses", "python/nn/module", "python/ops", "python/optimizers", "python/optimizers/_autosummary/mlx.optimizers.AdaDelta", 
"python/optimizers/_autosummary/mlx.optimizers.Adafactor", "python/optimizers/_autosummary/mlx.optimizers.Adagrad", "python/optimizers/_autosummary/mlx.optimizers.Adam", "python/optimizers/_autosummary/mlx.optimizers.AdamW", "python/optimizers/_autosummary/mlx.optimizers.Adamax", "python/optimizers/_autosummary/mlx.optimizers.Lion", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.init", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.state", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.update", "python/optimizers/_autosummary/mlx.optimizers.RMSprop", "python/optimizers/_autosummary/mlx.optimizers.SGD", "python/optimizers/_autosummary/mlx.optimizers.cosine_decay", "python/optimizers/_autosummary/mlx.optimizers.exponential_decay", "python/optimizers/_autosummary/mlx.optimizers.join_schedules", "python/optimizers/_autosummary/mlx.optimizers.linear_schedule", "python/optimizers/_autosummary/mlx.optimizers.step_decay", "python/optimizers/common_optimizers", "python/optimizers/optimizer", "python/optimizers/schedulers", "python/random", "python/transforms", "python/tree_utils", "usage/compile", "usage/distributed", "usage/function_transforms", "usage/indexing", "usage/lazy_evaluation", "usage/numpy", "usage/quick_start", "usage/saving_and_loading", "usage/unified_memory", "usage/using_streams"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1}, "filenames": ["cpp/ops.rst", "dev/custom_metal_kernels.rst", "dev/extensions.rst", "dev/metal_debugger.rst", "examples/linear_regression.rst", "examples/llama-inference.rst", "examples/mlp.rst", "index.rst", "install.rst", 
"python/_autosummary/mlx.core.Device.rst", "python/_autosummary/mlx.core.Dtype.rst", "python/_autosummary/mlx.core.DtypeCategory.rst", "python/_autosummary/mlx.core.abs.rst", "python/_autosummary/mlx.core.add.rst", "python/_autosummary/mlx.core.addmm.rst", "python/_autosummary/mlx.core.all.rst", "python/_autosummary/mlx.core.allclose.rst", "python/_autosummary/mlx.core.any.rst", "python/_autosummary/mlx.core.arange.rst", "python/_autosummary/mlx.core.arccos.rst", "python/_autosummary/mlx.core.arccosh.rst", "python/_autosummary/mlx.core.arcsin.rst", "python/_autosummary/mlx.core.arcsinh.rst", "python/_autosummary/mlx.core.arctan.rst", "python/_autosummary/mlx.core.arctan2.rst", "python/_autosummary/mlx.core.arctanh.rst", "python/_autosummary/mlx.core.argmax.rst", "python/_autosummary/mlx.core.argmin.rst", "python/_autosummary/mlx.core.argpartition.rst", "python/_autosummary/mlx.core.argsort.rst", "python/_autosummary/mlx.core.array.rst", "python/_autosummary/mlx.core.array.T.rst", "python/_autosummary/mlx.core.array.abs.rst", "python/_autosummary/mlx.core.array.all.rst", "python/_autosummary/mlx.core.array.any.rst", "python/_autosummary/mlx.core.array.argmax.rst", "python/_autosummary/mlx.core.array.argmin.rst", "python/_autosummary/mlx.core.array.astype.rst", "python/_autosummary/mlx.core.array.at.rst", "python/_autosummary/mlx.core.array.conj.rst", "python/_autosummary/mlx.core.array.cos.rst", "python/_autosummary/mlx.core.array.cummax.rst", "python/_autosummary/mlx.core.array.cummin.rst", "python/_autosummary/mlx.core.array.cumprod.rst", "python/_autosummary/mlx.core.array.cumsum.rst", "python/_autosummary/mlx.core.array.diag.rst", "python/_autosummary/mlx.core.array.diagonal.rst", "python/_autosummary/mlx.core.array.dtype.rst", "python/_autosummary/mlx.core.array.exp.rst", "python/_autosummary/mlx.core.array.flatten.rst", "python/_autosummary/mlx.core.array.item.rst", "python/_autosummary/mlx.core.array.itemsize.rst", 
"python/_autosummary/mlx.core.array.log.rst", "python/_autosummary/mlx.core.array.log10.rst", "python/_autosummary/mlx.core.array.log1p.rst", "python/_autosummary/mlx.core.array.log2.rst", "python/_autosummary/mlx.core.array.logsumexp.rst", "python/_autosummary/mlx.core.array.max.rst", "python/_autosummary/mlx.core.array.mean.rst", "python/_autosummary/mlx.core.array.min.rst", "python/_autosummary/mlx.core.array.moveaxis.rst", "python/_autosummary/mlx.core.array.nbytes.rst", "python/_autosummary/mlx.core.array.ndim.rst", "python/_autosummary/mlx.core.array.prod.rst", "python/_autosummary/mlx.core.array.reciprocal.rst", "python/_autosummary/mlx.core.array.reshape.rst", "python/_autosummary/mlx.core.array.round.rst", "python/_autosummary/mlx.core.array.rsqrt.rst", "python/_autosummary/mlx.core.array.shape.rst", "python/_autosummary/mlx.core.array.sin.rst", "python/_autosummary/mlx.core.array.size.rst", "python/_autosummary/mlx.core.array.split.rst", "python/_autosummary/mlx.core.array.sqrt.rst", "python/_autosummary/mlx.core.array.square.rst", "python/_autosummary/mlx.core.array.squeeze.rst", "python/_autosummary/mlx.core.array.std.rst", "python/_autosummary/mlx.core.array.sum.rst", "python/_autosummary/mlx.core.array.swapaxes.rst", "python/_autosummary/mlx.core.array.tolist.rst", "python/_autosummary/mlx.core.array.transpose.rst", "python/_autosummary/mlx.core.array.var.rst", "python/_autosummary/mlx.core.array.view.rst", "python/_autosummary/mlx.core.array_equal.rst", "python/_autosummary/mlx.core.as_strided.rst", "python/_autosummary/mlx.core.atleast_1d.rst", "python/_autosummary/mlx.core.atleast_2d.rst", "python/_autosummary/mlx.core.atleast_3d.rst", "python/_autosummary/mlx.core.bitwise_and.rst", "python/_autosummary/mlx.core.bitwise_or.rst", "python/_autosummary/mlx.core.bitwise_xor.rst", "python/_autosummary/mlx.core.block_masked_mm.rst", "python/_autosummary/mlx.core.broadcast_to.rst", "python/_autosummary/mlx.core.ceil.rst", 
"python/_autosummary/mlx.core.clip.rst", "python/_autosummary/mlx.core.compile.rst", "python/_autosummary/mlx.core.concatenate.rst", "python/_autosummary/mlx.core.conj.rst", "python/_autosummary/mlx.core.conjugate.rst", "python/_autosummary/mlx.core.conv1d.rst", "python/_autosummary/mlx.core.conv2d.rst", "python/_autosummary/mlx.core.conv3d.rst", "python/_autosummary/mlx.core.conv_general.rst", "python/_autosummary/mlx.core.conv_transpose1d.rst", "python/_autosummary/mlx.core.conv_transpose2d.rst", "python/_autosummary/mlx.core.conv_transpose3d.rst", "python/_autosummary/mlx.core.convolve.rst", "python/_autosummary/mlx.core.cos.rst", "python/_autosummary/mlx.core.cosh.rst", "python/_autosummary/mlx.core.cummax.rst", "python/_autosummary/mlx.core.cummin.rst", "python/_autosummary/mlx.core.cumprod.rst", "python/_autosummary/mlx.core.cumsum.rst", "python/_autosummary/mlx.core.custom_function.rst", "python/_autosummary/mlx.core.default_device.rst", "python/_autosummary/mlx.core.default_stream.rst", "python/_autosummary/mlx.core.degrees.rst", "python/_autosummary/mlx.core.dequantize.rst", "python/_autosummary/mlx.core.diag.rst", "python/_autosummary/mlx.core.diagonal.rst", "python/_autosummary/mlx.core.disable_compile.rst", "python/_autosummary/mlx.core.distributed.Group.rst", "python/_autosummary/mlx.core.distributed.all_gather.rst", "python/_autosummary/mlx.core.distributed.all_sum.rst", "python/_autosummary/mlx.core.distributed.init.rst", "python/_autosummary/mlx.core.distributed.is_available.rst", "python/_autosummary/mlx.core.distributed.recv.rst", "python/_autosummary/mlx.core.distributed.recv_like.rst", "python/_autosummary/mlx.core.distributed.send.rst", "python/_autosummary/mlx.core.divide.rst", "python/_autosummary/mlx.core.divmod.rst", "python/_autosummary/mlx.core.einsum.rst", "python/_autosummary/mlx.core.einsum_path.rst", "python/_autosummary/mlx.core.enable_compile.rst", "python/_autosummary/mlx.core.equal.rst", "python/_autosummary/mlx.core.erf.rst", 
"python/_autosummary/mlx.core.erfinv.rst", "python/_autosummary/mlx.core.eval.rst", "python/_autosummary/mlx.core.exp.rst", "python/_autosummary/mlx.core.expand_dims.rst", "python/_autosummary/mlx.core.expm1.rst", "python/_autosummary/mlx.core.eye.rst", "python/_autosummary/mlx.core.fast.affine_quantize.rst", "python/_autosummary/mlx.core.fast.layer_norm.rst", "python/_autosummary/mlx.core.fast.metal_kernel.rst", "python/_autosummary/mlx.core.fast.rms_norm.rst", "python/_autosummary/mlx.core.fast.rope.rst", "python/_autosummary/mlx.core.fast.scaled_dot_product_attention.rst", "python/_autosummary/mlx.core.fft.fft.rst", "python/_autosummary/mlx.core.fft.fft2.rst", "python/_autosummary/mlx.core.fft.fftn.rst", "python/_autosummary/mlx.core.fft.ifft.rst", "python/_autosummary/mlx.core.fft.ifft2.rst", "python/_autosummary/mlx.core.fft.ifftn.rst", "python/_autosummary/mlx.core.fft.irfft.rst", "python/_autosummary/mlx.core.fft.irfft2.rst", "python/_autosummary/mlx.core.fft.irfftn.rst", "python/_autosummary/mlx.core.fft.rfft.rst", "python/_autosummary/mlx.core.fft.rfft2.rst", "python/_autosummary/mlx.core.fft.rfftn.rst", "python/_autosummary/mlx.core.flatten.rst", "python/_autosummary/mlx.core.floor.rst", "python/_autosummary/mlx.core.floor_divide.rst", "python/_autosummary/mlx.core.full.rst", "python/_autosummary/mlx.core.gather_mm.rst", "python/_autosummary/mlx.core.gather_qmm.rst", "python/_autosummary/mlx.core.grad.rst", "python/_autosummary/mlx.core.greater.rst", "python/_autosummary/mlx.core.greater_equal.rst", "python/_autosummary/mlx.core.hadamard_transform.rst", "python/_autosummary/mlx.core.identity.rst", "python/_autosummary/mlx.core.imag.rst", "python/_autosummary/mlx.core.inner.rst", "python/_autosummary/mlx.core.isclose.rst", "python/_autosummary/mlx.core.isfinite.rst", "python/_autosummary/mlx.core.isinf.rst", "python/_autosummary/mlx.core.isnan.rst", "python/_autosummary/mlx.core.isneginf.rst", "python/_autosummary/mlx.core.isposinf.rst", 
"python/_autosummary/mlx.core.issubdtype.rst", "python/_autosummary/mlx.core.jvp.rst", "python/_autosummary/mlx.core.left_shift.rst", "python/_autosummary/mlx.core.less.rst", "python/_autosummary/mlx.core.less_equal.rst", "python/_autosummary/mlx.core.linalg.cholesky.rst", "python/_autosummary/mlx.core.linalg.cholesky_inv.rst", "python/_autosummary/mlx.core.linalg.cross.rst", "python/_autosummary/mlx.core.linalg.eigh.rst", "python/_autosummary/mlx.core.linalg.eigvalsh.rst", "python/_autosummary/mlx.core.linalg.inv.rst", "python/_autosummary/mlx.core.linalg.norm.rst", "python/_autosummary/mlx.core.linalg.qr.rst", "python/_autosummary/mlx.core.linalg.svd.rst", "python/_autosummary/mlx.core.linalg.tri_inv.rst", "python/_autosummary/mlx.core.linspace.rst", "python/_autosummary/mlx.core.load.rst", "python/_autosummary/mlx.core.log.rst", "python/_autosummary/mlx.core.log10.rst", "python/_autosummary/mlx.core.log1p.rst", "python/_autosummary/mlx.core.log2.rst", "python/_autosummary/mlx.core.logaddexp.rst", "python/_autosummary/mlx.core.logical_and.rst", "python/_autosummary/mlx.core.logical_not.rst", "python/_autosummary/mlx.core.logical_or.rst", "python/_autosummary/mlx.core.logsumexp.rst", "python/_autosummary/mlx.core.matmul.rst", "python/_autosummary/mlx.core.max.rst", "python/_autosummary/mlx.core.maximum.rst", "python/_autosummary/mlx.core.mean.rst", "python/_autosummary/mlx.core.meshgrid.rst", "python/_autosummary/mlx.core.metal.clear_cache.rst", "python/_autosummary/mlx.core.metal.device_info.rst", "python/_autosummary/mlx.core.metal.get_active_memory.rst", "python/_autosummary/mlx.core.metal.get_cache_memory.rst", "python/_autosummary/mlx.core.metal.get_peak_memory.rst", "python/_autosummary/mlx.core.metal.is_available.rst", "python/_autosummary/mlx.core.metal.reset_peak_memory.rst", "python/_autosummary/mlx.core.metal.set_cache_limit.rst", "python/_autosummary/mlx.core.metal.set_memory_limit.rst", "python/_autosummary/mlx.core.metal.set_wired_limit.rst", 
"python/_autosummary/mlx.core.metal.start_capture.rst", "python/_autosummary/mlx.core.metal.stop_capture.rst", "python/_autosummary/mlx.core.min.rst", "python/_autosummary/mlx.core.minimum.rst", "python/_autosummary/mlx.core.moveaxis.rst", "python/_autosummary/mlx.core.multiply.rst", "python/_autosummary/mlx.core.nan_to_num.rst", "python/_autosummary/mlx.core.negative.rst", "python/_autosummary/mlx.core.new_stream.rst", "python/_autosummary/mlx.core.not_equal.rst", "python/_autosummary/mlx.core.ones.rst", "python/_autosummary/mlx.core.ones_like.rst", "python/_autosummary/mlx.core.outer.rst", "python/_autosummary/mlx.core.pad.rst", "python/_autosummary/mlx.core.partition.rst", "python/_autosummary/mlx.core.power.rst", "python/_autosummary/mlx.core.prod.rst", "python/_autosummary/mlx.core.put_along_axis.rst", "python/_autosummary/mlx.core.quantize.rst", "python/_autosummary/mlx.core.quantized_matmul.rst", "python/_autosummary/mlx.core.radians.rst", "python/_autosummary/mlx.core.random.bernoulli.rst", "python/_autosummary/mlx.core.random.categorical.rst", "python/_autosummary/mlx.core.random.gumbel.rst", "python/_autosummary/mlx.core.random.key.rst", "python/_autosummary/mlx.core.random.laplace.rst", "python/_autosummary/mlx.core.random.multivariate_normal.rst", "python/_autosummary/mlx.core.random.normal.rst", "python/_autosummary/mlx.core.random.permutation.rst", "python/_autosummary/mlx.core.random.randint.rst", "python/_autosummary/mlx.core.random.seed.rst", "python/_autosummary/mlx.core.random.split.rst", "python/_autosummary/mlx.core.random.truncated_normal.rst", "python/_autosummary/mlx.core.random.uniform.rst", "python/_autosummary/mlx.core.real.rst", "python/_autosummary/mlx.core.reciprocal.rst", "python/_autosummary/mlx.core.remainder.rst", "python/_autosummary/mlx.core.repeat.rst", "python/_autosummary/mlx.core.reshape.rst", "python/_autosummary/mlx.core.right_shift.rst", "python/_autosummary/mlx.core.roll.rst", "python/_autosummary/mlx.core.round.rst", 
"python/_autosummary/mlx.core.rsqrt.rst", "python/_autosummary/mlx.core.save.rst", "python/_autosummary/mlx.core.save_gguf.rst", "python/_autosummary/mlx.core.save_safetensors.rst", "python/_autosummary/mlx.core.savez.rst", "python/_autosummary/mlx.core.savez_compressed.rst", "python/_autosummary/mlx.core.set_default_device.rst", "python/_autosummary/mlx.core.set_default_stream.rst", "python/_autosummary/mlx.core.sigmoid.rst", "python/_autosummary/mlx.core.sign.rst", "python/_autosummary/mlx.core.sin.rst", "python/_autosummary/mlx.core.sinh.rst", "python/_autosummary/mlx.core.softmax.rst", "python/_autosummary/mlx.core.sort.rst", "python/_autosummary/mlx.core.split.rst", "python/_autosummary/mlx.core.sqrt.rst", "python/_autosummary/mlx.core.square.rst", "python/_autosummary/mlx.core.squeeze.rst", "python/_autosummary/mlx.core.stack.rst", "python/_autosummary/mlx.core.std.rst", "python/_autosummary/mlx.core.stop_gradient.rst", "python/_autosummary/mlx.core.stream.rst", "python/_autosummary/mlx.core.subtract.rst", "python/_autosummary/mlx.core.sum.rst", "python/_autosummary/mlx.core.swapaxes.rst", "python/_autosummary/mlx.core.synchronize.rst", "python/_autosummary/mlx.core.take.rst", "python/_autosummary/mlx.core.take_along_axis.rst", "python/_autosummary/mlx.core.tan.rst", "python/_autosummary/mlx.core.tanh.rst", "python/_autosummary/mlx.core.tensordot.rst", "python/_autosummary/mlx.core.tile.rst", "python/_autosummary/mlx.core.topk.rst", "python/_autosummary/mlx.core.trace.rst", "python/_autosummary/mlx.core.transpose.rst", "python/_autosummary/mlx.core.tri.rst", "python/_autosummary/mlx.core.tril.rst", "python/_autosummary/mlx.core.triu.rst", "python/_autosummary/mlx.core.value_and_grad.rst", "python/_autosummary/mlx.core.var.rst", "python/_autosummary/mlx.core.view.rst", "python/_autosummary/mlx.core.vjp.rst", "python/_autosummary/mlx.core.vmap.rst", "python/_autosummary/mlx.core.where.rst", "python/_autosummary/mlx.core.zeros.rst", 
"python/_autosummary/mlx.core.zeros_like.rst", "python/_autosummary/mlx.nn.quantize.rst", "python/_autosummary/mlx.nn.value_and_grad.rst", "python/_autosummary/mlx.optimizers.clip_grad_norm.rst", "python/_autosummary/mlx.utils.tree_flatten.rst", "python/_autosummary/mlx.utils.tree_map.rst", "python/_autosummary/mlx.utils.tree_map_with_path.rst", "python/_autosummary/mlx.utils.tree_reduce.rst", "python/_autosummary/mlx.utils.tree_unflatten.rst", "python/_autosummary/stream_class.rst", "python/array.rst", "python/data_types.rst", "python/devices_and_streams.rst", "python/distributed.rst", "python/fast.rst", "python/fft.rst", "python/linalg.rst", "python/metal.rst", "python/nn.rst", "python/nn/_autosummary/mlx.nn.ALiBi.rst", "python/nn/_autosummary/mlx.nn.AvgPool1d.rst", "python/nn/_autosummary/mlx.nn.AvgPool2d.rst", "python/nn/_autosummary/mlx.nn.BatchNorm.rst", "python/nn/_autosummary/mlx.nn.CELU.rst", "python/nn/_autosummary/mlx.nn.Conv1d.rst", "python/nn/_autosummary/mlx.nn.Conv2d.rst", "python/nn/_autosummary/mlx.nn.Conv3d.rst", "python/nn/_autosummary/mlx.nn.ConvTranspose1d.rst", "python/nn/_autosummary/mlx.nn.ConvTranspose2d.rst", "python/nn/_autosummary/mlx.nn.ConvTranspose3d.rst", "python/nn/_autosummary/mlx.nn.Dropout.rst", "python/nn/_autosummary/mlx.nn.Dropout2d.rst", "python/nn/_autosummary/mlx.nn.Dropout3d.rst", "python/nn/_autosummary/mlx.nn.ELU.rst", "python/nn/_autosummary/mlx.nn.Embedding.rst", "python/nn/_autosummary/mlx.nn.GELU.rst", "python/nn/_autosummary/mlx.nn.GLU.rst", "python/nn/_autosummary/mlx.nn.GRU.rst", "python/nn/_autosummary/mlx.nn.GroupNorm.rst", "python/nn/_autosummary/mlx.nn.HardShrink.rst", "python/nn/_autosummary/mlx.nn.HardTanh.rst", "python/nn/_autosummary/mlx.nn.Hardswish.rst", "python/nn/_autosummary/mlx.nn.InstanceNorm.rst", "python/nn/_autosummary/mlx.nn.LSTM.rst", "python/nn/_autosummary/mlx.nn.LayerNorm.rst", "python/nn/_autosummary/mlx.nn.LeakyReLU.rst", "python/nn/_autosummary/mlx.nn.Linear.rst", 
"python/nn/_autosummary/mlx.nn.LogSigmoid.rst", "python/nn/_autosummary/mlx.nn.LogSoftmax.rst", "python/nn/_autosummary/mlx.nn.MaxPool1d.rst", "python/nn/_autosummary/mlx.nn.MaxPool2d.rst", "python/nn/_autosummary/mlx.nn.Mish.rst", "python/nn/_autosummary/mlx.nn.Module.apply.rst", "python/nn/_autosummary/mlx.nn.Module.apply_to_modules.rst", "python/nn/_autosummary/mlx.nn.Module.children.rst", "python/nn/_autosummary/mlx.nn.Module.eval.rst", "python/nn/_autosummary/mlx.nn.Module.filter_and_map.rst", "python/nn/_autosummary/mlx.nn.Module.freeze.rst", "python/nn/_autosummary/mlx.nn.Module.leaf_modules.rst", "python/nn/_autosummary/mlx.nn.Module.load_weights.rst", "python/nn/_autosummary/mlx.nn.Module.modules.rst", "python/nn/_autosummary/mlx.nn.Module.named_modules.rst", "python/nn/_autosummary/mlx.nn.Module.parameters.rst", "python/nn/_autosummary/mlx.nn.Module.save_weights.rst", "python/nn/_autosummary/mlx.nn.Module.set_dtype.rst", "python/nn/_autosummary/mlx.nn.Module.state.rst", "python/nn/_autosummary/mlx.nn.Module.train.rst", "python/nn/_autosummary/mlx.nn.Module.trainable_parameters.rst", "python/nn/_autosummary/mlx.nn.Module.training.rst", "python/nn/_autosummary/mlx.nn.Module.unfreeze.rst", "python/nn/_autosummary/mlx.nn.Module.update.rst", "python/nn/_autosummary/mlx.nn.Module.update_modules.rst", "python/nn/_autosummary/mlx.nn.MultiHeadAttention.rst", "python/nn/_autosummary/mlx.nn.PReLU.rst", "python/nn/_autosummary/mlx.nn.QuantizedEmbedding.rst", "python/nn/_autosummary/mlx.nn.QuantizedLinear.rst", "python/nn/_autosummary/mlx.nn.RMSNorm.rst", "python/nn/_autosummary/mlx.nn.RNN.rst", "python/nn/_autosummary/mlx.nn.ReLU.rst", "python/nn/_autosummary/mlx.nn.ReLU6.rst", "python/nn/_autosummary/mlx.nn.RoPE.rst", "python/nn/_autosummary/mlx.nn.SELU.rst", "python/nn/_autosummary/mlx.nn.Sequential.rst", "python/nn/_autosummary/mlx.nn.SiLU.rst", "python/nn/_autosummary/mlx.nn.Sigmoid.rst", "python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding.rst", 
"python/nn/_autosummary/mlx.nn.Softmax.rst", "python/nn/_autosummary/mlx.nn.Softmin.rst", "python/nn/_autosummary/mlx.nn.Softplus.rst", "python/nn/_autosummary/mlx.nn.Softshrink.rst", "python/nn/_autosummary/mlx.nn.Softsign.rst", "python/nn/_autosummary/mlx.nn.Step.rst", "python/nn/_autosummary/mlx.nn.Tanh.rst", "python/nn/_autosummary/mlx.nn.Transformer.rst", "python/nn/_autosummary/mlx.nn.Upsample.rst", "python/nn/_autosummary/mlx.nn.init.constant.rst", "python/nn/_autosummary/mlx.nn.init.glorot_normal.rst", "python/nn/_autosummary/mlx.nn.init.glorot_uniform.rst", "python/nn/_autosummary/mlx.nn.init.he_normal.rst", "python/nn/_autosummary/mlx.nn.init.he_uniform.rst", "python/nn/_autosummary/mlx.nn.init.identity.rst", "python/nn/_autosummary/mlx.nn.init.normal.rst", "python/nn/_autosummary/mlx.nn.init.uniform.rst", "python/nn/_autosummary_functions/mlx.nn.celu.rst", "python/nn/_autosummary_functions/mlx.nn.elu.rst", "python/nn/_autosummary_functions/mlx.nn.gelu.rst", "python/nn/_autosummary_functions/mlx.nn.gelu_approx.rst", "python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx.rst", "python/nn/_autosummary_functions/mlx.nn.glu.rst", "python/nn/_autosummary_functions/mlx.nn.hard_shrink.rst", "python/nn/_autosummary_functions/mlx.nn.hard_tanh.rst", "python/nn/_autosummary_functions/mlx.nn.hardswish.rst", "python/nn/_autosummary_functions/mlx.nn.leaky_relu.rst", "python/nn/_autosummary_functions/mlx.nn.log_sigmoid.rst", "python/nn/_autosummary_functions/mlx.nn.log_softmax.rst", "python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy.rst", "python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy.rst", "python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.huber_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss.rst", 
"python/nn/_autosummary_functions/mlx.nn.losses.l1_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.mse_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.nll_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss.rst", "python/nn/_autosummary_functions/mlx.nn.mish.rst", "python/nn/_autosummary_functions/mlx.nn.prelu.rst", "python/nn/_autosummary_functions/mlx.nn.relu.rst", "python/nn/_autosummary_functions/mlx.nn.relu6.rst", "python/nn/_autosummary_functions/mlx.nn.selu.rst", "python/nn/_autosummary_functions/mlx.nn.sigmoid.rst", "python/nn/_autosummary_functions/mlx.nn.silu.rst", "python/nn/_autosummary_functions/mlx.nn.softmax.rst", "python/nn/_autosummary_functions/mlx.nn.softmin.rst", "python/nn/_autosummary_functions/mlx.nn.softplus.rst", "python/nn/_autosummary_functions/mlx.nn.softshrink.rst", "python/nn/_autosummary_functions/mlx.nn.step.rst", "python/nn/_autosummary_functions/mlx.nn.tanh.rst", "python/nn/functions.rst", "python/nn/init.rst", "python/nn/layers.rst", "python/nn/losses.rst", "python/nn/module.rst", "python/ops.rst", "python/optimizers.rst", "python/optimizers/_autosummary/mlx.optimizers.AdaDelta.rst", "python/optimizers/_autosummary/mlx.optimizers.Adafactor.rst", "python/optimizers/_autosummary/mlx.optimizers.Adagrad.rst", "python/optimizers/_autosummary/mlx.optimizers.Adam.rst", "python/optimizers/_autosummary/mlx.optimizers.AdamW.rst", "python/optimizers/_autosummary/mlx.optimizers.Adamax.rst", "python/optimizers/_autosummary/mlx.optimizers.Lion.rst", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.rst", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.init.rst", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.state.rst", 
"python/optimizers/_autosummary/mlx.optimizers.Optimizer.update.rst", "python/optimizers/_autosummary/mlx.optimizers.RMSprop.rst", "python/optimizers/_autosummary/mlx.optimizers.SGD.rst", "python/optimizers/_autosummary/mlx.optimizers.cosine_decay.rst", "python/optimizers/_autosummary/mlx.optimizers.exponential_decay.rst", "python/optimizers/_autosummary/mlx.optimizers.join_schedules.rst", "python/optimizers/_autosummary/mlx.optimizers.linear_schedule.rst", "python/optimizers/_autosummary/mlx.optimizers.step_decay.rst", "python/optimizers/common_optimizers.rst", "python/optimizers/optimizer.rst", "python/optimizers/schedulers.rst", "python/random.rst", "python/transforms.rst", "python/tree_utils.rst", "usage/compile.rst", "usage/distributed.rst", "usage/function_transforms.rst", "usage/indexing.rst", "usage/lazy_evaluation.rst", "usage/numpy.rst", "usage/quick_start.rst", "usage/saving_and_loading.rst", "usage/unified_memory.rst", "usage/using_streams.rst"], "indexentries": {"__init__() (array method)": [[30, "mlx.core.array.__init__", false]], "__init__() (custom_function method)": [[112, "mlx.core.custom_function.__init__", false]], "__init__() (device method)": [[9, "mlx.core.Device.__init__", false]], "__init__() (dtype method)": [[10, "mlx.core.Dtype.__init__", false]], "__init__() (dtypecategory method)": [[11, "mlx.core.DtypeCategory.__init__", false]], "__init__() (group method)": [[120, "mlx.core.distributed.Group.__init__", false]], "__init__() (stream method)": [[315, "mlx.core.Stream.__init__", false]], "abs (c++ function)": [[0, "_CPPv43absRK5array14StreamOrDevice", false]], "abs() (array method)": [[32, "mlx.core.array.abs", false]], "abs() (in module mlx.core)": [[12, "mlx.core.abs", false]], "adadelta (class in mlx.optimizers)": [[455, "mlx.optimizers.AdaDelta", false]], "adafactor (class in mlx.optimizers)": [[456, "mlx.optimizers.Adafactor", false]], "adagrad (class in mlx.optimizers)": [[457, "mlx.optimizers.Adagrad", false]], "adam (class in 
mlx.optimizers)": [[458, "mlx.optimizers.Adam", false]], "adamax (class in mlx.optimizers)": [[460, "mlx.optimizers.Adamax", false]], "adamw (class in mlx.optimizers)": [[459, "mlx.optimizers.AdamW", false]], "add (c++ function)": [[0, "_CPPv43addRK5arrayRK5array14StreamOrDevice", false]], "add() (in module mlx.core)": [[13, "mlx.core.add", false]], "addmm (c++ function)": [[0, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", false]], "addmm() (in module mlx.core)": [[14, "mlx.core.addmm", false]], "affine_quantize() (in module mlx.core.fast)": [[141, "mlx.core.fast.affine_quantize", false]], "alibi (class in mlx.nn)": [[325, "mlx.nn.ALiBi", false]], "all (c++ function)": [[0, "_CPPv43allRK5array14StreamOrDevice", false], [0, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43allRK5arrayb14StreamOrDevice", false], [0, "_CPPv43allRK5arrayib14StreamOrDevice", false]], "all() (array method)": [[33, "mlx.core.array.all", false]], "all() (in module mlx.core)": [[15, "mlx.core.all", false]], "all_gather() (in module mlx.core.distributed)": [[121, "mlx.core.distributed.all_gather", false]], "all_sum() (in module mlx.core.distributed)": [[122, "mlx.core.distributed.all_sum", false]], "allclose (c++ function)": [[0, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", false]], "allclose() (in module mlx.core)": [[16, "mlx.core.allclose", false]], "any (c++ function)": [[0, "_CPPv43anyRK5array14StreamOrDevice", false], [0, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43anyRK5arrayb14StreamOrDevice", false], [0, "_CPPv43anyRK5arrayib14StreamOrDevice", false]], "any() (array method)": [[34, "mlx.core.array.any", false]], "any() (in module mlx.core)": [[17, "mlx.core.any", false]], "apply() (module method)": [[358, "mlx.nn.Module.apply", false]], "apply_gradients() (optimizer method)": [[462, "mlx.optimizers.Optimizer.apply_gradients", false]], "apply_to_modules() (module method)": [[359, 
"mlx.nn.Module.apply_to_modules", false]], "arange (c++ function)": [[0, "_CPPv46aranged14StreamOrDevice", false], [0, "_CPPv46aranged5Dtype14StreamOrDevice", false], [0, "_CPPv46arangedd14StreamOrDevice", false], [0, "_CPPv46arangedd5Dtype14StreamOrDevice", false], [0, "_CPPv46arangeddd14StreamOrDevice", false], [0, "_CPPv46arangeddd5Dtype14StreamOrDevice", false], [0, "_CPPv46arangei14StreamOrDevice", false], [0, "_CPPv46arangeii14StreamOrDevice", false], [0, "_CPPv46arangeiii14StreamOrDevice", false]], "arange() (in module mlx.core)": [[18, "mlx.core.arange", false]], "arccos (c++ function)": [[0, "_CPPv46arccosRK5array14StreamOrDevice", false]], "arccos() (in module mlx.core)": [[19, "mlx.core.arccos", false]], "arccosh (c++ function)": [[0, "_CPPv47arccoshRK5array14StreamOrDevice", false]], "arccosh() (in module mlx.core)": [[20, "mlx.core.arccosh", false]], "arcsin (c++ function)": [[0, "_CPPv46arcsinRK5array14StreamOrDevice", false]], "arcsin() (in module mlx.core)": [[21, "mlx.core.arcsin", false]], "arcsinh (c++ function)": [[0, "_CPPv47arcsinhRK5array14StreamOrDevice", false]], "arcsinh() (in module mlx.core)": [[22, "mlx.core.arcsinh", false]], "arctan (c++ function)": [[0, "_CPPv46arctanRK5array14StreamOrDevice", false]], "arctan() (in module mlx.core)": [[23, "mlx.core.arctan", false]], "arctan2 (c++ function)": [[0, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", false]], "arctan2() (in module mlx.core)": [[24, "mlx.core.arctan2", false]], "arctanh (c++ function)": [[0, "_CPPv47arctanhRK5array14StreamOrDevice", false]], "arctanh() (in module mlx.core)": [[25, "mlx.core.arctanh", false]], "argmax (c++ function)": [[0, "_CPPv46argmaxRK5array14StreamOrDevice", false], [0, "_CPPv46argmaxRK5arrayb14StreamOrDevice", false], [0, "_CPPv46argmaxRK5arrayib14StreamOrDevice", false]], "argmax() (array method)": [[35, "mlx.core.array.argmax", false]], "argmax() (in module mlx.core)": [[26, "mlx.core.argmax", false]], "argmin (c++ function)": [[0, 
"_CPPv46argminRK5array14StreamOrDevice", false], [0, "_CPPv46argminRK5arrayb14StreamOrDevice", false], [0, "_CPPv46argminRK5arrayib14StreamOrDevice", false]], "argmin() (array method)": [[36, "mlx.core.array.argmin", false]], "argmin() (in module mlx.core)": [[27, "mlx.core.argmin", false]], "argpartition (c++ function)": [[0, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", false], [0, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", false]], "argpartition() (in module mlx.core)": [[28, "mlx.core.argpartition", false]], "argsort (c++ function)": [[0, "_CPPv47argsortRK5array14StreamOrDevice", false], [0, "_CPPv47argsortRK5arrayi14StreamOrDevice", false]], "argsort() (in module mlx.core)": [[29, "mlx.core.argsort", false]], "array (class in mlx.core)": [[30, "mlx.core.array", false]], "array_equal (c++ function)": [[0, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", false], [0, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", false]], "array_equal() (in module mlx.core)": [[82, "mlx.core.array_equal", false]], "as_strided (c++ function)": [[0, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", false]], "as_strided() (in module mlx.core)": [[83, "mlx.core.as_strided", false]], "astype (c++ function)": [[0, "_CPPv46astype5array5Dtype14StreamOrDevice", false]], "astype() (array method)": [[37, "mlx.core.array.astype", false]], "at (array property)": [[38, "mlx.core.array.at", false]], "atleast_1d (c++ function)": [[0, "_CPPv410atleast_1dRK5array14StreamOrDevice", false], [0, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "atleast_1d() (in module mlx.core)": [[84, "mlx.core.atleast_1d", false]], "atleast_2d (c++ function)": [[0, "_CPPv410atleast_2dRK5array14StreamOrDevice", false], [0, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "atleast_2d() (in module mlx.core)": [[85, "mlx.core.atleast_2d", false]], "atleast_3d (c++ function)": [[0, 
"_CPPv410atleast_3dRK5array14StreamOrDevice", false], [0, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "atleast_3d() (in module mlx.core)": [[86, "mlx.core.atleast_3d", false]], "avgpool1d (class in mlx.nn)": [[326, "mlx.nn.AvgPool1d", false]], "avgpool2d (class in mlx.nn)": [[327, "mlx.nn.AvgPool2d", false]], "batchnorm (class in mlx.nn)": [[328, "mlx.nn.BatchNorm", false]], "bernoulli() (in module mlx.core.random)": [[240, "mlx.core.random.bernoulli", false]], "binary_cross_entropy (class in mlx.nn.losses)": [[421, "mlx.nn.losses.binary_cross_entropy", false]], "bitwise_and (c++ function)": [[0, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", false]], "bitwise_and() (in module mlx.core)": [[87, "mlx.core.bitwise_and", false]], "bitwise_or (c++ function)": [[0, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", false]], "bitwise_or() (in module mlx.core)": [[88, "mlx.core.bitwise_or", false]], "bitwise_xor (c++ function)": [[0, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", false]], "bitwise_xor() (in module mlx.core)": [[89, "mlx.core.bitwise_xor", false]], "block_masked_mm (c++ function)": [[0, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", false]], "block_masked_mm() (in module mlx.core)": [[90, "mlx.core.block_masked_mm", false]], "broadcast_arrays (c++ function)": [[0, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "broadcast_to (c++ function)": [[0, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "broadcast_to() (in module mlx.core)": [[91, "mlx.core.broadcast_to", false]], "categorical() (in module mlx.core.random)": [[241, "mlx.core.random.categorical", false]], "ceil (c++ function)": [[0, "_CPPv44ceilRK5array14StreamOrDevice", false]], "ceil() (in module mlx.core)": [[92, "mlx.core.ceil", false]], "celu (class in mlx.nn)": [[329, "mlx.nn.CELU", false], [409, "mlx.nn.celu", false]], "children() 
(module method)": [[360, "mlx.nn.Module.children", false]], "cholesky() (in module mlx.core.linalg)": [[183, "mlx.core.linalg.cholesky", false]], "cholesky_inv() (in module mlx.core.linalg)": [[184, "mlx.core.linalg.cholesky_inv", false]], "clear_cache() (in module mlx.core.metal)": [[209, "mlx.core.metal.clear_cache", false]], "clip (c++ function)": [[0, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", false]], "clip() (in module mlx.core)": [[93, "mlx.core.clip", false]], "clip_grad_norm() (in module mlx.optimizers)": [[309, "mlx.optimizers.clip_grad_norm", false]], "compile() (in module mlx.core)": [[94, "mlx.core.compile", false]], "concatenate (c++ function)": [[0, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", false], [0, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", false]], "concatenate() (in module mlx.core)": [[95, "mlx.core.concatenate", false]], "conj() (array method)": [[39, "mlx.core.array.conj", false]], "conj() (in module mlx.core)": [[96, "mlx.core.conj", false]], "conjugate (c++ function)": [[0, "_CPPv49conjugateRK5array14StreamOrDevice", false]], "conjugate() (in module mlx.core)": [[97, "mlx.core.conjugate", false]], "constant() (in module mlx.nn.init)": [[401, "mlx.nn.init.constant", false]], "conv1d (c++ function)": [[0, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", false]], "conv1d (class in mlx.nn)": [[330, "mlx.nn.Conv1d", false]], "conv1d() (in module mlx.core)": [[98, "mlx.core.conv1d", false]], "conv2d (c++ function)": [[0, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", false]], "conv2d (class in mlx.nn)": [[331, "mlx.nn.Conv2d", false]], "conv2d() (in module mlx.core)": [[99, "mlx.core.conv2d", false]], "conv3d (c++ function)": [[0, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", false]], "conv3d (class in mlx.nn)": [[332, "mlx.nn.Conv3d", false]], "conv3d() (in 
module mlx.core)": [[100, "mlx.core.conv3d", false]], "conv_general (c++ function)": [[0, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", false], [0, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", false]], "conv_general() (in module mlx.core)": [[101, "mlx.core.conv_general", false]], "conv_transpose1d (c++ function)": [[0, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", false]], "conv_transpose1d() (in module mlx.core)": [[102, "mlx.core.conv_transpose1d", false]], "conv_transpose2d (c++ function)": [[0, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", false]], "conv_transpose2d() (in module mlx.core)": [[103, "mlx.core.conv_transpose2d", false]], "conv_transpose3d (c++ function)": [[0, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", false]], "conv_transpose3d() (in module mlx.core)": [[104, "mlx.core.conv_transpose3d", false]], "convolve() (in module mlx.core)": [[105, "mlx.core.convolve", false]], "convtranspose1d (class in mlx.nn)": [[333, "mlx.nn.ConvTranspose1d", false]], "convtranspose2d (class in mlx.nn)": [[334, "mlx.nn.ConvTranspose2d", false]], "convtranspose3d (class in mlx.nn)": [[335, "mlx.nn.ConvTranspose3d", false]], "copy (c++ function)": [[0, "_CPPv44copy5array14StreamOrDevice", false]], "cos (c++ function)": [[0, "_CPPv43cosRK5array14StreamOrDevice", false]], "cos() (array method)": [[40, "mlx.core.array.cos", false]], "cos() (in module mlx.core)": [[106, "mlx.core.cos", false]], "cosh (c++ function)": [[0, "_CPPv44coshRK5array14StreamOrDevice", false]], "cosh() (in module mlx.core)": [[107, "mlx.core.cosh", false]], "cosine_decay() (in module mlx.optimizers)": [[468, "mlx.optimizers.cosine_decay", false]], "cosine_similarity_loss (class in mlx.nn.losses)": [[422, 
"mlx.nn.losses.cosine_similarity_loss", false]], "cross() (in module mlx.core.linalg)": [[185, "mlx.core.linalg.cross", false]], "cross_entropy (class in mlx.nn.losses)": [[423, "mlx.nn.losses.cross_entropy", false]], "cummax (c++ function)": [[0, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", false]], "cummax() (array method)": [[41, "mlx.core.array.cummax", false]], "cummax() (in module mlx.core)": [[108, "mlx.core.cummax", false]], "cummin (c++ function)": [[0, "_CPPv46cumminRK5arrayibb14StreamOrDevice", false]], "cummin() (array method)": [[42, "mlx.core.array.cummin", false]], "cummin() (in module mlx.core)": [[109, "mlx.core.cummin", false]], "cumprod (c++ function)": [[0, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", false]], "cumprod() (array method)": [[43, "mlx.core.array.cumprod", false]], "cumprod() (in module mlx.core)": [[110, "mlx.core.cumprod", false]], "cumsum (c++ function)": [[0, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", false]], "cumsum() (array method)": [[44, "mlx.core.array.cumsum", false]], "cumsum() (in module mlx.core)": [[111, "mlx.core.cumsum", false]], "custom_function (class in mlx.core)": [[112, "mlx.core.custom_function", false]], "default_device() (in module mlx.core)": [[113, "mlx.core.default_device", false]], "default_stream() (in module mlx.core)": [[114, "mlx.core.default_stream", false]], "degrees (c++ function)": [[0, "_CPPv47degreesRK5array14StreamOrDevice", false]], "degrees() (in module mlx.core)": [[115, "mlx.core.degrees", false]], "depends (c++ function)": [[0, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", false]], "dequantize (c++ function)": [[0, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", false]], "dequantize() (in module mlx.core)": [[116, "mlx.core.dequantize", false]], "device (class in mlx.core)": [[9, "mlx.core.Device", false]], "device_info() (in module mlx.core.metal)": [[210, "mlx.core.metal.device_info", false]], "diag (c++ function)": [[0, 
"_CPPv44diagRK5arrayi14StreamOrDevice", false]], "diag() (array method)": [[45, "mlx.core.array.diag", false]], "diag() (in module mlx.core)": [[117, "mlx.core.diag", false]], "diagonal (c++ function)": [[0, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", false]], "diagonal() (array method)": [[46, "mlx.core.array.diagonal", false]], "diagonal() (in module mlx.core)": [[118, "mlx.core.diagonal", false]], "disable_compile() (in module mlx.core)": [[119, "mlx.core.disable_compile", false]], "divide (c++ function)": [[0, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", false]], "divide() (in module mlx.core)": [[128, "mlx.core.divide", false]], "divmod (c++ function)": [[0, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", false]], "divmod() (in module mlx.core)": [[129, "mlx.core.divmod", false]], "dropout (class in mlx.nn)": [[336, "mlx.nn.Dropout", false]], "dropout2d (class in mlx.nn)": [[337, "mlx.nn.Dropout2d", false]], "dropout3d (class in mlx.nn)": [[338, "mlx.nn.Dropout3d", false]], "dtype (array property)": [[47, "mlx.core.array.dtype", false]], "dtype (class in mlx.core)": [[10, "mlx.core.Dtype", false]], "dtypecategory (class in mlx.core)": [[11, "mlx.core.DtypeCategory", false]], "eigh() (in module mlx.core.linalg)": [[186, "mlx.core.linalg.eigh", false]], "eigvalsh() (in module mlx.core.linalg)": [[187, "mlx.core.linalg.eigvalsh", false]], "einsum() (in module mlx.core)": [[130, "mlx.core.einsum", false]], "einsum_path() (in module mlx.core)": [[131, "mlx.core.einsum_path", false]], "elu (class in mlx.nn)": [[339, "mlx.nn.ELU", false], [410, "mlx.nn.elu", false]], "embedding (class in mlx.nn)": [[340, "mlx.nn.Embedding", false]], "enable_compile() (in module mlx.core)": [[132, "mlx.core.enable_compile", false]], "equal (c++ function)": [[0, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", false]], "equal() (in module mlx.core)": [[133, "mlx.core.equal", false]], "erf (c++ function)": [[0, "_CPPv43erfRK5array14StreamOrDevice", false]], "erf() (in module 
mlx.core)": [[134, "mlx.core.erf", false]], "erfinv (c++ function)": [[0, "_CPPv46erfinvRK5array14StreamOrDevice", false]], "erfinv() (in module mlx.core)": [[135, "mlx.core.erfinv", false]], "eval() (in module mlx.core)": [[136, "mlx.core.eval", false]], "eval() (module method)": [[361, "mlx.nn.Module.eval", false]], "exp (c++ function)": [[0, "_CPPv43expRK5array14StreamOrDevice", false]], "exp() (array method)": [[48, "mlx.core.array.exp", false]], "exp() (in module mlx.core)": [[137, "mlx.core.exp", false]], "expand_dims (c++ function)": [[0, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", false]], "expand_dims() (in module mlx.core)": [[138, "mlx.core.expand_dims", false]], "expm1 (c++ function)": [[0, "_CPPv45expm1RK5array14StreamOrDevice", false]], "expm1() (in module mlx.core)": [[139, "mlx.core.expm1", false]], "exponential_decay() (in module mlx.optimizers)": [[469, "mlx.optimizers.exponential_decay", false]], "eye (c++ function)": [[0, "_CPPv43eyei14StreamOrDevice", false], [0, "_CPPv43eyei5Dtype14StreamOrDevice", false], [0, "_CPPv43eyeii14StreamOrDevice", false], [0, "_CPPv43eyeiii14StreamOrDevice", false], [0, "_CPPv43eyeiii5Dtype14StreamOrDevice", false]], "eye() (in module mlx.core)": [[140, "mlx.core.eye", false]], "fft() (in module mlx.core.fft)": [[147, "mlx.core.fft.fft", false]], "fft2() (in module mlx.core.fft)": [[148, "mlx.core.fft.fft2", false]], "fftn() (in module mlx.core.fft)": [[149, "mlx.core.fft.fftn", false]], "filter_and_map() (module method)": [[362, "mlx.nn.Module.filter_and_map", false]], "flatten (c++ function)": [[0, "_CPPv47flattenRK5array14StreamOrDevice", false], [0, "_CPPv47flattenRK5arrayii14StreamOrDevice", false]], "flatten() (array method)": [[49, "mlx.core.array.flatten", false]], "flatten() (in module mlx.core)": [[159, "mlx.core.flatten", false]], "floor (c++ function)": [[0, "_CPPv45floorRK5array14StreamOrDevice", false]], "floor() (in module 
mlx.core)": [[160, "mlx.core.floor", false]], "floor_divide (c++ function)": [[0, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", false]], "floor_divide() (in module mlx.core)": [[161, "mlx.core.floor_divide", false]], "freeze() (module method)": [[363, "mlx.nn.Module.freeze", false]], "full (c++ function)": [[0, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", false], [0, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", false], [0, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", false], [0, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", false]], "full() (in module mlx.core)": [[162, "mlx.core.full", false]], "gather (c++ function)": [[0, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", false]], "gather_mm (c++ function)": [[0, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", false]], "gather_mm() (in module mlx.core)": [[163, "mlx.core.gather_mm", false]], "gather_qmm (c++ function)": [[0, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", false]], "gather_qmm() (in module mlx.core)": [[164, "mlx.core.gather_qmm", false]], "gaussian_nll_loss (class in mlx.nn.losses)": [[424, "mlx.nn.losses.gaussian_nll_loss", false]], "gelu (class in mlx.nn)": [[341, "mlx.nn.GELU", false], [411, "mlx.nn.gelu", false]], "gelu_approx (class in mlx.nn)": [[412, "mlx.nn.gelu_approx", false]], "gelu_fast_approx (class in mlx.nn)": [[413, "mlx.nn.gelu_fast_approx", false]], "get_active_memory() (in module mlx.core.metal)": [[211, "mlx.core.metal.get_active_memory", false]], "get_cache_memory() (in module mlx.core.metal)": [[212, "mlx.core.metal.get_cache_memory", false]], "get_peak_memory() (in module mlx.core.metal)": [[213, "mlx.core.metal.get_peak_memory", false]], "glorot_normal() (in module mlx.nn.init)": [[402, 
"mlx.nn.init.glorot_normal", false]], "glorot_uniform() (in module mlx.nn.init)": [[403, "mlx.nn.init.glorot_uniform", false]], "glu (class in mlx.nn)": [[342, "mlx.nn.GLU", false], [414, "mlx.nn.glu", false]], "grad() (in module mlx.core)": [[165, "mlx.core.grad", false]], "greater (c++ function)": [[0, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", false]], "greater() (in module mlx.core)": [[166, "mlx.core.greater", false]], "greater_equal (c++ function)": [[0, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", false]], "greater_equal() (in module mlx.core)": [[167, "mlx.core.greater_equal", false]], "group (class in mlx.core.distributed)": [[120, "mlx.core.distributed.Group", false]], "groupnorm (class in mlx.nn)": [[344, "mlx.nn.GroupNorm", false]], "gru (class in mlx.nn)": [[343, "mlx.nn.GRU", false]], "gumbel() (in module mlx.core.random)": [[242, "mlx.core.random.gumbel", false]], "hadamard_transform (c++ function)": [[0, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", false]], "hadamard_transform() (in module mlx.core)": [[168, "mlx.core.hadamard_transform", false]], "hard_shrink (class in mlx.nn)": [[415, "mlx.nn.hard_shrink", false]], "hard_tanh (class in mlx.nn)": [[416, "mlx.nn.hard_tanh", false]], "hardshrink (class in mlx.nn)": [[345, "mlx.nn.HardShrink", false]], "hardswish (class in mlx.nn)": [[347, "mlx.nn.Hardswish", false], [417, "mlx.nn.hardswish", false]], "hardtanh (class in mlx.nn)": [[346, "mlx.nn.HardTanh", false]], "he_normal() (in module mlx.nn.init)": [[404, "mlx.nn.init.he_normal", false]], "he_uniform() (in module mlx.nn.init)": [[405, "mlx.nn.init.he_uniform", false]], "hinge_loss (class in mlx.nn.losses)": [[425, "mlx.nn.losses.hinge_loss", false]], "huber_loss (class in mlx.nn.losses)": [[426, "mlx.nn.losses.huber_loss", false]], "identity (c++ function)": [[0, "_CPPv48identityi14StreamOrDevice", false], [0, "_CPPv48identityi5Dtype14StreamOrDevice", false]], "identity() (in module mlx.core)": [[169, 
"mlx.core.identity", false]], "identity() (in module mlx.nn.init)": [[406, "mlx.nn.init.identity", false]], "ifft() (in module mlx.core.fft)": [[150, "mlx.core.fft.ifft", false]], "ifft2() (in module mlx.core.fft)": [[151, "mlx.core.fft.ifft2", false]], "ifftn() (in module mlx.core.fft)": [[152, "mlx.core.fft.ifftn", false]], "imag (c++ function)": [[0, "_CPPv44imagRK5array14StreamOrDevice", false]], "imag() (in module mlx.core)": [[170, "mlx.core.imag", false]], "init() (in module mlx.core.distributed)": [[123, "mlx.core.distributed.init", false]], "init() (optimizer method)": [[463, "mlx.optimizers.Optimizer.init", false]], "inner (c++ function)": [[0, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", false]], "inner() (in module mlx.core)": [[171, "mlx.core.inner", false]], "instancenorm (class in mlx.nn)": [[348, "mlx.nn.InstanceNorm", false]], "inv() (in module mlx.core.linalg)": [[188, "mlx.core.linalg.inv", false]], "irfft() (in module mlx.core.fft)": [[153, "mlx.core.fft.irfft", false]], "irfft2() (in module mlx.core.fft)": [[154, "mlx.core.fft.irfft2", false]], "irfftn() (in module mlx.core.fft)": [[155, "mlx.core.fft.irfftn", false]], "is_available() (in module mlx.core.distributed)": [[124, "mlx.core.distributed.is_available", false]], "is_available() (in module mlx.core.metal)": [[214, "mlx.core.metal.is_available", false]], "isclose (c++ function)": [[0, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", false]], "isclose() (in module mlx.core)": [[172, "mlx.core.isclose", false]], "isfinite (c++ function)": [[0, "_CPPv48isfiniteRK5array14StreamOrDevice", false]], "isfinite() (in module mlx.core)": [[173, "mlx.core.isfinite", false]], "isinf (c++ function)": [[0, "_CPPv45isinfRK5array14StreamOrDevice", false]], "isinf() (in module mlx.core)": [[174, "mlx.core.isinf", false]], "isnan (c++ function)": [[0, "_CPPv45isnanRK5array14StreamOrDevice", false]], "isnan() (in module mlx.core)": [[175, "mlx.core.isnan", false]], "isneginf (c++ function)": [[0, 
"_CPPv48isneginfRK5array14StreamOrDevice", false]], "isneginf() (in module mlx.core)": [[176, "mlx.core.isneginf", false]], "isposinf (c++ function)": [[0, "_CPPv48isposinfRK5array14StreamOrDevice", false]], "isposinf() (in module mlx.core)": [[177, "mlx.core.isposinf", false]], "issubdtype() (in module mlx.core)": [[178, "mlx.core.issubdtype", false]], "item() (array method)": [[50, "mlx.core.array.item", false]], "itemsize (array property)": [[51, "mlx.core.array.itemsize", false]], "join_schedules() (in module mlx.optimizers)": [[470, "mlx.optimizers.join_schedules", false]], "jvp() (in module mlx.core)": [[179, "mlx.core.jvp", false]], "key() (in module mlx.core.random)": [[243, "mlx.core.random.key", false]], "kl_div_loss (class in mlx.nn.losses)": [[427, "mlx.nn.losses.kl_div_loss", false]], "l1_loss (class in mlx.nn.losses)": [[428, "mlx.nn.losses.l1_loss", false]], "laplace() (in module mlx.core.random)": [[244, "mlx.core.random.laplace", false]], "layer_norm() (in module mlx.core.fast)": [[142, "mlx.core.fast.layer_norm", false]], "layernorm (class in mlx.nn)": [[350, "mlx.nn.LayerNorm", false]], "leaf_modules() (module method)": [[364, "mlx.nn.Module.leaf_modules", false]], "leaky_relu (class in mlx.nn)": [[418, "mlx.nn.leaky_relu", false]], "leakyrelu (class in mlx.nn)": [[351, "mlx.nn.LeakyReLU", false]], "left_shift (c++ function)": [[0, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", false]], "left_shift() (in module mlx.core)": [[180, "mlx.core.left_shift", false]], "less (c++ function)": [[0, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", false]], "less() (in module mlx.core)": [[181, "mlx.core.less", false]], "less_equal (c++ function)": [[0, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", false]], "less_equal() (in module mlx.core)": [[182, "mlx.core.less_equal", false]], "linear (class in mlx.nn)": [[352, "mlx.nn.Linear", false]], "linear_schedule() (in module mlx.optimizers)": [[471, "mlx.optimizers.linear_schedule", false]], 
"linspace (c++ function)": [[0, "_CPPv48linspaceddi5Dtype14StreamOrDevice", false]], "linspace() (in module mlx.core)": [[193, "mlx.core.linspace", false]], "lion (class in mlx.optimizers)": [[461, "mlx.optimizers.Lion", false]], "load() (in module mlx.core)": [[194, "mlx.core.load", false]], "load_weights() (module method)": [[365, "mlx.nn.Module.load_weights", false]], "log (c++ function)": [[0, "_CPPv43logRK5array14StreamOrDevice", false]], "log() (array method)": [[52, "mlx.core.array.log", false]], "log() (in module mlx.core)": [[195, "mlx.core.log", false]], "log10 (c++ function)": [[0, "_CPPv45log10RK5array14StreamOrDevice", false]], "log10() (array method)": [[53, "mlx.core.array.log10", false]], "log10() (in module mlx.core)": [[196, "mlx.core.log10", false]], "log1p (c++ function)": [[0, "_CPPv45log1pRK5array14StreamOrDevice", false]], "log1p() (array method)": [[54, "mlx.core.array.log1p", false]], "log1p() (in module mlx.core)": [[197, "mlx.core.log1p", false]], "log2 (c++ function)": [[0, "_CPPv44log2RK5array14StreamOrDevice", false]], "log2() (array method)": [[55, "mlx.core.array.log2", false]], "log2() (in module mlx.core)": [[198, "mlx.core.log2", false]], "log_cosh_loss (class in mlx.nn.losses)": [[429, "mlx.nn.losses.log_cosh_loss", false]], "log_sigmoid (class in mlx.nn)": [[419, "mlx.nn.log_sigmoid", false]], "log_softmax (class in mlx.nn)": [[420, "mlx.nn.log_softmax", false]], "logaddexp (c++ function)": [[0, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", false]], "logaddexp() (in module mlx.core)": [[199, "mlx.core.logaddexp", false]], "logical_and (c++ function)": [[0, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", false]], "logical_and() (in module mlx.core)": [[200, "mlx.core.logical_and", false]], "logical_not (c++ function)": [[0, "_CPPv411logical_notRK5array14StreamOrDevice", false]], "logical_not() (in module mlx.core)": [[201, "mlx.core.logical_not", false]], "logical_or (c++ function)": [[0, 
"_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", false]], "logical_or() (in module mlx.core)": [[202, "mlx.core.logical_or", false]], "logsigmoid (class in mlx.nn)": [[353, "mlx.nn.LogSigmoid", false]], "logsoftmax (class in mlx.nn)": [[354, "mlx.nn.LogSoftmax", false]], "logsumexp (c++ function)": [[0, "_CPPv49logsumexpRK5array14StreamOrDevice", false], [0, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", false], [0, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", false]], "logsumexp() (array method)": [[56, "mlx.core.array.logsumexp", false]], "logsumexp() (in module mlx.core)": [[203, "mlx.core.logsumexp", false]], "lstm (class in mlx.nn)": [[349, "mlx.nn.LSTM", false]], "margin_ranking_loss (class in mlx.nn.losses)": [[430, "mlx.nn.losses.margin_ranking_loss", false]], "matmul (c++ function)": [[0, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", false]], "matmul() (in module mlx.core)": [[204, "mlx.core.matmul", false]], "max (c++ function)": [[0, "_CPPv43maxRK5array14StreamOrDevice", false], [0, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43maxRK5arrayb14StreamOrDevice", false], [0, "_CPPv43maxRK5arrayib14StreamOrDevice", false]], "max() (array method)": [[57, "mlx.core.array.max", false]], "max() (in module mlx.core)": [[205, "mlx.core.max", false]], "maximum (c++ function)": [[0, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", false]], "maximum() (in module mlx.core)": [[206, "mlx.core.maximum", false]], "maxpool1d (class in mlx.nn)": [[355, "mlx.nn.MaxPool1d", false]], "maxpool2d (class in mlx.nn)": [[356, "mlx.nn.MaxPool2d", false]], "mean (c++ function)": [[0, "_CPPv44meanRK5array14StreamOrDevice", false], [0, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv44meanRK5arrayb14StreamOrDevice", false], [0, "_CPPv44meanRK5arrayib14StreamOrDevice", false]], "mean() (array method)": [[58, "mlx.core.array.mean", false]], "mean() (in 
module mlx.core)": [[207, "mlx.core.mean", false]], "meshgrid (c++ function)": [[0, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", false]], "meshgrid() (in module mlx.core)": [[208, "mlx.core.meshgrid", false]], "metal_kernel() (in module mlx.core.fast)": [[143, "mlx.core.fast.metal_kernel", false]], "min (c++ function)": [[0, "_CPPv43minRK5array14StreamOrDevice", false], [0, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43minRK5arrayb14StreamOrDevice", false], [0, "_CPPv43minRK5arrayib14StreamOrDevice", false]], "min() (array method)": [[59, "mlx.core.array.min", false]], "min() (in module mlx.core)": [[221, "mlx.core.min", false]], "minimum (c++ function)": [[0, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", false]], "minimum() (in module mlx.core)": [[222, "mlx.core.minimum", false]], "mish (class in mlx.nn)": [[357, "mlx.nn.Mish", false], [435, "mlx.nn.mish", false]], "module (class in mlx.nn)": [[452, "mlx.nn.Module", false]], "modules() (module method)": [[366, "mlx.nn.Module.modules", false]], "moveaxis (c++ function)": [[0, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", false]], "moveaxis() (array method)": [[60, "mlx.core.array.moveaxis", false]], "moveaxis() (in module mlx.core)": [[223, "mlx.core.moveaxis", false]], "mse_loss (class in mlx.nn.losses)": [[431, "mlx.nn.losses.mse_loss", false]], "multiheadattention (class in mlx.nn)": [[378, "mlx.nn.MultiHeadAttention", false]], "multiply (c++ function)": [[0, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", false]], "multiply() (in module mlx.core)": [[224, "mlx.core.multiply", false]], "multivariate_normal() (in module mlx.core.random)": [[245, "mlx.core.random.multivariate_normal", false]], "named_modules() (module method)": [[367, "mlx.nn.Module.named_modules", false]], "nan_to_num (c++ function)": [[0, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", false]], "nan_to_num() (in module mlx.core)": [[225, 
"mlx.core.nan_to_num", false]], "nbytes (array property)": [[61, "mlx.core.array.nbytes", false]], "ndim (array property)": [[62, "mlx.core.array.ndim", false]], "negative (c++ function)": [[0, "_CPPv48negativeRK5array14StreamOrDevice", false]], "negative() (in module mlx.core)": [[226, "mlx.core.negative", false]], "new_stream() (in module mlx.core)": [[227, "mlx.core.new_stream", false]], "nll_loss (class in mlx.nn.losses)": [[432, "mlx.nn.losses.nll_loss", false]], "norm() (in module mlx.core.linalg)": [[189, "mlx.core.linalg.norm", false]], "normal() (in module mlx.core.random)": [[246, "mlx.core.random.normal", false]], "normal() (in module mlx.nn.init)": [[407, "mlx.nn.init.normal", false]], "not_equal (c++ function)": [[0, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", false]], "not_equal() (in module mlx.core)": [[228, "mlx.core.not_equal", false]], "number_of_elements (c++ function)": [[0, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", false]], "ones (c++ function)": [[0, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", false]], "ones() (in module mlx.core)": [[229, "mlx.core.ones", false]], "ones_like (c++ function)": [[0, "_CPPv49ones_likeRK5array14StreamOrDevice", false]], "ones_like() (in module mlx.core)": [[230, "mlx.core.ones_like", false]], "operator!= (c++ function)": [[0, "_CPPv4I0Ene5array1TRK5array", false], [0, "_CPPv4I0Ene5arrayRK5array1T", false], [0, "_CPPv4neRK5arrayRK5array", false]], "operator% (c++ function)": [[0, "_CPPv4I0Erm5array1TRK5array", false], [0, "_CPPv4I0Erm5arrayRK5array1T", false], [0, "_CPPv4rmRK5arrayRK5array", false]], "operator& (c++ function)": [[0, "_CPPv4anRK5arrayRK5array", false]], "operator&& (c++ function)": [[0, "_CPPv4aaRK5arrayRK5array", false]], "operator* (c++ function)": [[0, "_CPPv4I0Eml5array1TRK5array", false], [0, "_CPPv4I0Eml5arrayRK5array1T", false], [0, "_CPPv4mlRK5arrayRK5array", false]], "operator+ (c++ 
function)": [[0, "_CPPv4I0Epl5array1TRK5array", false], [0, "_CPPv4I0Epl5arrayRK5array1T", false], [0, "_CPPv4plRK5arrayRK5array", false]], "operator- (c++ function)": [[0, "_CPPv4I0Emi5array1TRK5array", false], [0, "_CPPv4I0Emi5arrayRK5array1T", false], [0, "_CPPv4miRK5array", false], [0, "_CPPv4miRK5arrayRK5array", false]], "operator/ (c++ function)": [[0, "_CPPv4dvRK5arrayRK5array", false], [0, "_CPPv4dvRK5arrayd", false], [0, "_CPPv4dvdRK5array", false]], "operator< (c++ function)": [[0, "_CPPv4I0Elt5array1TRK5array", false], [0, "_CPPv4I0Elt5arrayRK5array1T", false], [0, "_CPPv4ltRK5arrayRK5array", false]], "operator<< (c++ function)": [[0, "_CPPv4lsRK5arrayRK5array", false]], "operator<= (c++ function)": [[0, "_CPPv4I0Ele5array1TRK5array", false], [0, "_CPPv4I0Ele5arrayRK5array1T", false], [0, "_CPPv4leRK5arrayRK5array", false]], "operator== (c++ function)": [[0, "_CPPv4I0Eeq5array1TRK5array", false], [0, "_CPPv4I0Eeq5arrayRK5array1T", false], [0, "_CPPv4eqRK5arrayRK5array", false]], "operator> (c++ function)": [[0, "_CPPv4I0Egt5array1TRK5array", false], [0, "_CPPv4I0Egt5arrayRK5array1T", false], [0, "_CPPv4gtRK5arrayRK5array", false]], "operator>= (c++ function)": [[0, "_CPPv4I0Ege5array1TRK5array", false], [0, "_CPPv4I0Ege5arrayRK5array1T", false], [0, "_CPPv4geRK5arrayRK5array", false]], "operator>> (c++ function)": [[0, "_CPPv4rsRK5arrayRK5array", false]], "operator^ (c++ function)": [[0, "_CPPv4eoRK5arrayRK5array", false]], "operator| (c++ function)": [[0, "_CPPv4orRK5arrayRK5array", false]], "operator|| (c++ function)": [[0, "_CPPv4ooRK5arrayRK5array", false]], "optimizer (class in mlx.optimizers)": [[474, "mlx.optimizers.Optimizer", false]], "outer (c++ function)": [[0, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", false]], "outer() (in module mlx.core)": [[231, "mlx.core.outer", false]], "pad (c++ function)": [[0, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", false], [0, 
"_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", false], [0, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", false], [0, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", false]], "pad() (in module mlx.core)": [[232, "mlx.core.pad", false]], "parameters() (module method)": [[368, "mlx.nn.Module.parameters", false]], "partition (c++ function)": [[0, "_CPPv49partitionRK5arrayi14StreamOrDevice", false], [0, "_CPPv49partitionRK5arrayii14StreamOrDevice", false]], "partition() (in module mlx.core)": [[233, "mlx.core.partition", false]], "permutation() (in module mlx.core.random)": [[247, "mlx.core.random.permutation", false]], "power (c++ function)": [[0, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", false]], "power() (in module mlx.core)": [[234, "mlx.core.power", false]], "prelu (class in mlx.nn)": [[379, "mlx.nn.PReLU", false], [436, "mlx.nn.prelu", false]], "prod (c++ function)": [[0, "_CPPv44prodRK5array14StreamOrDevice", false], [0, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv44prodRK5arrayb14StreamOrDevice", false], [0, "_CPPv44prodRK5arrayib14StreamOrDevice", false]], "prod() (array method)": [[63, "mlx.core.array.prod", false]], "prod() (in module mlx.core)": [[235, "mlx.core.prod", false]], "put_along_axis (c++ function)": [[0, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false]], "put_along_axis() (in module mlx.core)": [[236, "mlx.core.put_along_axis", false]], "qr() (in module mlx.core.linalg)": [[190, "mlx.core.linalg.qr", false]], "quantize (c++ function)": [[0, "_CPPv48quantizeRK5arrayii14StreamOrDevice", false]], "quantize() (in module mlx.core)": [[237, "mlx.core.quantize", false]], "quantize() (in module mlx.nn)": [[307, "mlx.nn.quantize", false]], "quantized_matmul (c++ function)": [[0, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", false]], "quantized_matmul() (in module 
mlx.core)": [[238, "mlx.core.quantized_matmul", false]], "quantizedembedding (class in mlx.nn)": [[380, "mlx.nn.QuantizedEmbedding", false]], "quantizedlinear (class in mlx.nn)": [[381, "mlx.nn.QuantizedLinear", false]], "radians (c++ function)": [[0, "_CPPv47radiansRK5array14StreamOrDevice", false]], "radians() (in module mlx.core)": [[239, "mlx.core.radians", false]], "randint() (in module mlx.core.random)": [[248, "mlx.core.random.randint", false]], "real (c++ function)": [[0, "_CPPv44realRK5array14StreamOrDevice", false]], "real() (in module mlx.core)": [[253, "mlx.core.real", false]], "reciprocal (c++ function)": [[0, "_CPPv410reciprocalRK5array14StreamOrDevice", false]], "reciprocal() (array method)": [[64, "mlx.core.array.reciprocal", false]], "reciprocal() (in module mlx.core)": [[254, "mlx.core.reciprocal", false]], "recv() (in module mlx.core.distributed)": [[125, "mlx.core.distributed.recv", false]], "recv_like() (in module mlx.core.distributed)": [[126, "mlx.core.distributed.recv_like", false]], "relu (class in mlx.nn)": [[384, "mlx.nn.ReLU", false], [437, "mlx.nn.relu", false]], "relu6 (class in mlx.nn)": [[385, "mlx.nn.ReLU6", false], [438, "mlx.nn.relu6", false]], "remainder (c++ function)": [[0, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", false]], "remainder() (in module mlx.core)": [[255, "mlx.core.remainder", false]], "repeat (c++ function)": [[0, "_CPPv46repeatRK5arrayi14StreamOrDevice", false], [0, "_CPPv46repeatRK5arrayii14StreamOrDevice", false]], "repeat() (in module mlx.core)": [[256, "mlx.core.repeat", false]], "reset_peak_memory() (in module mlx.core.metal)": [[215, "mlx.core.metal.reset_peak_memory", false]], "reshape (c++ function)": [[0, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", false]], "reshape() (array method)": [[65, "mlx.core.array.reshape", false]], "reshape() (in module mlx.core)": [[257, "mlx.core.reshape", false]], "rfft() (in module mlx.core.fft)": [[156, "mlx.core.fft.rfft", false]], "rfft2() (in module 
mlx.core.fft)": [[157, "mlx.core.fft.rfft2", false]], "rfftn() (in module mlx.core.fft)": [[158, "mlx.core.fft.rfftn", false]], "right_shift (c++ function)": [[0, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", false]], "right_shift() (in module mlx.core)": [[258, "mlx.core.right_shift", false]], "rms_norm() (in module mlx.core.fast)": [[144, "mlx.core.fast.rms_norm", false]], "rmsnorm (class in mlx.nn)": [[382, "mlx.nn.RMSNorm", false]], "rmsprop (class in mlx.optimizers)": [[466, "mlx.optimizers.RMSprop", false]], "rnn (class in mlx.nn)": [[383, "mlx.nn.RNN", false]], "roll (c++ function)": [[0, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayi14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayii14StreamOrDevice", false]], "roll() (in module mlx.core)": [[259, "mlx.core.roll", false]], "rope (class in mlx.nn)": [[386, "mlx.nn.RoPE", false]], "rope() (in module mlx.core.fast)": [[145, "mlx.core.fast.rope", false]], "round (c++ function)": [[0, "_CPPv45roundRK5array14StreamOrDevice", false], [0, "_CPPv45roundRK5arrayi14StreamOrDevice", false]], "round() (array method)": [[66, "mlx.core.array.round", false]], "round() (in module mlx.core)": [[260, "mlx.core.round", false]], "rsqrt (c++ function)": [[0, "_CPPv45rsqrtRK5array14StreamOrDevice", false]], "rsqrt() (array method)": [[67, "mlx.core.array.rsqrt", false]], "rsqrt() (in module mlx.core)": [[261, "mlx.core.rsqrt", false]], "save() (in module mlx.core)": [[262, "mlx.core.save", false]], "save_gguf() (in module mlx.core)": [[263, "mlx.core.save_gguf", false]], "save_safetensors() (in module mlx.core)": [[264, "mlx.core.save_safetensors", false]], "save_weights() (module method)": [[369, "mlx.nn.Module.save_weights", false]], "savez() (in module 
mlx.core)": [[265, "mlx.core.savez", false]], "savez_compressed() (in module mlx.core)": [[266, "mlx.core.savez_compressed", false]], "scaled_dot_product_attention() (in module mlx.core.fast)": [[146, "mlx.core.fast.scaled_dot_product_attention", false]], "scatter (c++ function)": [[0, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_add (c++ function)": [[0, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_max (c++ function)": [[0, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_min (c++ function)": [[0, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_prod (c++ function)": [[0, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "seed() (in module mlx.core.random)": [[249, "mlx.core.random.seed", false]], "selu (class in mlx.nn)": [[387, "mlx.nn.SELU", false], [439, "mlx.nn.selu", false]], "send() (in module mlx.core.distributed)": [[127, "mlx.core.distributed.send", false]], "sequential (class in mlx.nn)": [[388, "mlx.nn.Sequential", false]], "set_cache_limit() (in module mlx.core.metal)": [[216, "mlx.core.metal.set_cache_limit", false]], "set_default_device() (in module mlx.core)": [[267, "mlx.core.set_default_device", false]], "set_default_stream() (in module mlx.core)": [[268, "mlx.core.set_default_stream", false]], "set_dtype() (module method)": [[370, "mlx.nn.Module.set_dtype", false]], 
"set_memory_limit() (in module mlx.core.metal)": [[217, "mlx.core.metal.set_memory_limit", false]], "set_wired_limit() (in module mlx.core.metal)": [[218, "mlx.core.metal.set_wired_limit", false]], "sgd (class in mlx.optimizers)": [[467, "mlx.optimizers.SGD", false]], "shape (array property)": [[68, "mlx.core.array.shape", false]], "sigmoid (c++ function)": [[0, "_CPPv47sigmoidRK5array14StreamOrDevice", false]], "sigmoid (class in mlx.nn)": [[390, "mlx.nn.Sigmoid", false], [440, "mlx.nn.sigmoid", false]], "sigmoid() (in module mlx.core)": [[269, "mlx.core.sigmoid", false]], "sign (c++ function)": [[0, "_CPPv44signRK5array14StreamOrDevice", false]], "sign() (in module mlx.core)": [[270, "mlx.core.sign", false]], "silu (class in mlx.nn)": [[389, "mlx.nn.SiLU", false], [441, "mlx.nn.silu", false]], "sin (c++ function)": [[0, "_CPPv43sinRK5array14StreamOrDevice", false]], "sin() (array method)": [[69, "mlx.core.array.sin", false]], "sin() (in module mlx.core)": [[271, "mlx.core.sin", false]], "sinh (c++ function)": [[0, "_CPPv44sinhRK5array14StreamOrDevice", false]], "sinh() (in module mlx.core)": [[272, "mlx.core.sinh", false]], "sinusoidalpositionalencoding (class in mlx.nn)": [[391, "mlx.nn.SinusoidalPositionalEncoding", false]], "size (array property)": [[70, "mlx.core.array.size", false]], "slice (c++ function)": [[0, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false]], "slice_update (c++ function)": [[0, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false]], "smooth_l1_loss (class in mlx.nn.losses)": [[433, "mlx.nn.losses.smooth_l1_loss", false]], "softmax (c++ function)": [[0, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv47softmaxRK5arrayb14StreamOrDevice", false], [0, 
"_CPPv47softmaxRK5arrayib14StreamOrDevice", false]], "softmax (class in mlx.nn)": [[392, "mlx.nn.Softmax", false], [442, "mlx.nn.softmax", false]], "softmax() (in module mlx.core)": [[273, "mlx.core.softmax", false]], "softmin (class in mlx.nn)": [[393, "mlx.nn.Softmin", false], [443, "mlx.nn.softmin", false]], "softplus (class in mlx.nn)": [[394, "mlx.nn.Softplus", false], [444, "mlx.nn.softplus", false]], "softshrink (class in mlx.nn)": [[395, "mlx.nn.Softshrink", false], [445, "mlx.nn.softshrink", false]], "softsign (class in mlx.nn)": [[396, "mlx.nn.Softsign", false]], "sort (c++ function)": [[0, "_CPPv44sortRK5array14StreamOrDevice", false], [0, "_CPPv44sortRK5arrayi14StreamOrDevice", false]], "sort() (in module mlx.core)": [[274, "mlx.core.sort", false]], "split (c++ function)": [[0, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", false], [0, "_CPPv45splitRK5arrayi14StreamOrDevice", false], [0, "_CPPv45splitRK5arrayii14StreamOrDevice", false]], "split() (array method)": [[71, "mlx.core.array.split", false]], "split() (in module mlx.core)": [[275, "mlx.core.split", false]], "split() (in module mlx.core.random)": [[250, "mlx.core.random.split", false]], "sqrt (c++ function)": [[0, "_CPPv44sqrtRK5array14StreamOrDevice", false]], "sqrt() (array method)": [[72, "mlx.core.array.sqrt", false]], "sqrt() (in module mlx.core)": [[276, "mlx.core.sqrt", false]], "square (c++ function)": [[0, "_CPPv46squareRK5array14StreamOrDevice", false]], "square() (array method)": [[73, "mlx.core.array.square", false]], "square() (in module mlx.core)": [[277, "mlx.core.square", false]], "squeeze (c++ function)": [[0, "_CPPv47squeezeRK5array14StreamOrDevice", false], [0, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv47squeezeRK5arrayi14StreamOrDevice", false]], "squeeze() (array method)": [[74, "mlx.core.array.squeeze", false]], "squeeze() (in module mlx.core)": [[278, 
"mlx.core.squeeze", false]], "stack (c++ function)": [[0, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", false], [0, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", false]], "stack() (in module mlx.core)": [[279, "mlx.core.stack", false]], "start_capture() (in module mlx.core.metal)": [[219, "mlx.core.metal.start_capture", false]], "state (module property)": [[371, "mlx.nn.Module.state", false]], "state (optimizer property)": [[464, "mlx.optimizers.Optimizer.state", false]], "std (c++ function)": [[0, "_CPPv4StRK5array14StreamOrDevice", false], [0, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", false], [0, "_CPPv4StRK5arraybi14StreamOrDevice", false], [0, "_CPPv4StRK5arrayibi14StreamOrDevice", false]], "std() (array method)": [[75, "mlx.core.array.std", false]], "std() (in module mlx.core)": [[280, "mlx.core.std", false]], "step (class in mlx.nn)": [[397, "mlx.nn.Step", false], [446, "mlx.nn.step", false]], "step_decay() (in module mlx.optimizers)": [[472, "mlx.optimizers.step_decay", false]], "stop_capture() (in module mlx.core.metal)": [[220, "mlx.core.metal.stop_capture", false]], "stop_gradient (c++ function)": [[0, "_CPPv413stop_gradientRK5array14StreamOrDevice", false]], "stop_gradient() (in module mlx.core)": [[281, "mlx.core.stop_gradient", false]], "stream (class in mlx.core)": [[315, "mlx.core.Stream", false]], "stream() (in module mlx.core)": [[282, "mlx.core.stream", false]], "subtract (c++ function)": [[0, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", false]], "subtract() (in module mlx.core)": [[283, "mlx.core.subtract", false]], "sum (c++ function)": [[0, "_CPPv43sumRK5array14StreamOrDevice", false], [0, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43sumRK5arrayb14StreamOrDevice", false], [0, "_CPPv43sumRK5arrayib14StreamOrDevice", false]], "sum() (array method)": [[76, "mlx.core.array.sum", false]], "sum() (in module mlx.core)": [[284, "mlx.core.sum", false]], "svd() (in module mlx.core.linalg)": 
[[191, "mlx.core.linalg.svd", false]], "swapaxes (c++ function)": [[0, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", false]], "swapaxes() (array method)": [[77, "mlx.core.array.swapaxes", false]], "swapaxes() (in module mlx.core)": [[285, "mlx.core.swapaxes", false]], "synchronize() (in module mlx.core)": [[286, "mlx.core.synchronize", false]], "t (array property)": [[31, "mlx.core.array.T", false]], "take (c++ function)": [[0, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", false], [0, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv44takeRK5arrayi14StreamOrDevice", false], [0, "_CPPv44takeRK5arrayii14StreamOrDevice", false]], "take() (in module mlx.core)": [[287, "mlx.core.take", false]], "take_along_axis (c++ function)": [[0, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", false]], "take_along_axis() (in module mlx.core)": [[288, "mlx.core.take_along_axis", false]], "tan (c++ function)": [[0, "_CPPv43tanRK5array14StreamOrDevice", false]], "tan() (in module mlx.core)": [[289, "mlx.core.tan", false]], "tanh (c++ function)": [[0, "_CPPv44tanhRK5array14StreamOrDevice", false]], "tanh (class in mlx.nn)": [[398, "mlx.nn.Tanh", false], [447, "mlx.nn.tanh", false]], "tanh() (in module mlx.core)": [[290, "mlx.core.tanh", false]], "tensordot (c++ function)": [[0, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", false], [0, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", false]], "tensordot() (in module mlx.core)": [[291, "mlx.core.tensordot", false]], "tile (c++ function)": [[0, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", false]], "tile() (in module mlx.core)": [[292, "mlx.core.tile", false]], "tolist() (array method)": [[78, "mlx.core.array.tolist", false]], "topk (c++ function)": [[0, "_CPPv44topkRK5arrayi14StreamOrDevice", false], [0, "_CPPv44topkRK5arrayii14StreamOrDevice", false]], "topk() (in module mlx.core)": [[293, "mlx.core.topk", false]], "trace (c++ function)": [[0, 
"_CPPv45traceRK5array14StreamOrDevice", false], [0, "_CPPv45traceRK5arrayiii14StreamOrDevice", false], [0, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", false]], "trace() (in module mlx.core)": [[294, "mlx.core.trace", false]], "train() (module method)": [[372, "mlx.nn.Module.train", false]], "trainable_parameters() (module method)": [[373, "mlx.nn.Module.trainable_parameters", false]], "training (module property)": [[374, "mlx.nn.Module.training", false]], "transformer (class in mlx.nn)": [[399, "mlx.nn.Transformer", false]], "transpose (c++ function)": [[0, "_CPPv49transposeRK5array14StreamOrDevice", false], [0, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", false], [0, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", false]], "transpose() (array method)": [[79, "mlx.core.array.transpose", false]], "transpose() (in module mlx.core)": [[295, "mlx.core.transpose", false]], "tree_flatten() (in module mlx.utils)": [[310, "mlx.utils.tree_flatten", false]], "tree_map() (in module mlx.utils)": [[311, "mlx.utils.tree_map", false]], "tree_map_with_path() (in module mlx.utils)": [[312, "mlx.utils.tree_map_with_path", false]], "tree_reduce() (in module mlx.utils)": [[313, "mlx.utils.tree_reduce", false]], "tree_unflatten() (in module mlx.utils)": [[314, "mlx.utils.tree_unflatten", false]], "tri (c++ function)": [[0, "_CPPv43trii5Dtype14StreamOrDevice", false], [0, "_CPPv43triiii5Dtype14StreamOrDevice", false]], "tri() (in module mlx.core)": [[296, "mlx.core.tri", false]], "tri_inv() (in module mlx.core.linalg)": [[192, "mlx.core.linalg.tri_inv", false]], "tril (c++ function)": [[0, "_CPPv44tril5arrayi14StreamOrDevice", false]], "tril() (in module mlx.core)": [[297, "mlx.core.tril", false]], "triplet_loss (class in mlx.nn.losses)": [[434, "mlx.nn.losses.triplet_loss", false]], "triu (c++ function)": [[0, "_CPPv44triu5arrayi14StreamOrDevice", false]], "triu() (in module mlx.core)": [[298, "mlx.core.triu", false]], "truncated_normal() (in 
module mlx.core.random)": [[251, "mlx.core.random.truncated_normal", false]], "unfreeze() (module method)": [[375, "mlx.nn.Module.unfreeze", false]], "uniform() (in module mlx.core.random)": [[252, "mlx.core.random.uniform", false]], "uniform() (in module mlx.nn.init)": [[408, "mlx.nn.init.uniform", false]], "update() (module method)": [[376, "mlx.nn.Module.update", false]], "update() (optimizer method)": [[465, "mlx.optimizers.Optimizer.update", false]], "update_modules() (module method)": [[377, "mlx.nn.Module.update_modules", false]], "upsample (class in mlx.nn)": [[400, "mlx.nn.Upsample", false]], "value_and_grad() (in module mlx.core)": [[299, "mlx.core.value_and_grad", false]], "value_and_grad() (in module mlx.nn)": [[308, "mlx.nn.value_and_grad", false]], "var (c++ function)": [[0, "_CPPv43varRK5array14StreamOrDevice", false], [0, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", false], [0, "_CPPv43varRK5arraybi14StreamOrDevice", false], [0, "_CPPv43varRK5arrayibi14StreamOrDevice", false]], "var() (array method)": [[80, "mlx.core.array.var", false]], "var() (in module mlx.core)": [[300, "mlx.core.var", false]], "view (c++ function)": [[0, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", false]], "view() (array method)": [[81, "mlx.core.array.view", false]], "view() (in module mlx.core)": [[301, "mlx.core.view", false]], "vjp() (in module mlx.core)": [[302, "mlx.core.vjp", false]], "vmap() (in module mlx.core)": [[303, "mlx.core.vmap", false]], "where (c++ function)": [[0, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", false]], "where() (in module mlx.core)": [[304, "mlx.core.where", false]], "zeros (c++ function)": [[0, "_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", false]], "zeros() (in module mlx.core)": [[305, "mlx.core.zeros", false]], "zeros_like (c++ function)": [[0, "_CPPv410zeros_likeRK5array14StreamOrDevice", false]], "zeros_like() (in module mlx.core)": [[306, 
"mlx.core.zeros_like", false]]}, "objects": {"": [[0, 0, 1, "_CPPv43absRK5array14StreamOrDevice", "abs"], [0, 1, 1, "_CPPv43absRK5array14StreamOrDevice", "abs::a"], [0, 1, 1, "_CPPv43absRK5array14StreamOrDevice", "abs::s"], [0, 0, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add"], [0, 1, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add::a"], [0, 1, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add::b"], [0, 1, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add::s"], [0, 0, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::a"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::alpha"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::b"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::beta"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::c"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::s"], [0, 0, 1, "_CPPv43allRK5array14StreamOrDevice", "all"], [0, 0, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all"], [0, 0, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all"], [0, 0, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all"], [0, 1, 1, "_CPPv43allRK5array14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::axes"], [0, 1, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all::axis"], [0, 1, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::keepdims"], [0, 1, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all::keepdims"], [0, 1, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all::keepdims"], [0, 1, 1, "_CPPv43allRK5array14StreamOrDevice", "all::s"], [0, 1, 1, 
"_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::s"], [0, 1, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all::s"], [0, 1, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all::s"], [0, 0, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::a"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::atol"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::b"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::equal_nan"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::rtol"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::s"], [0, 0, 1, "_CPPv43anyRK5array14StreamOrDevice", "any"], [0, 0, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any"], [0, 0, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any"], [0, 0, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any"], [0, 1, 1, "_CPPv43anyRK5array14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::axes"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any::axis"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::keepdims"], [0, 1, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any::keepdims"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any::keepdims"], [0, 1, 1, "_CPPv43anyRK5array14StreamOrDevice", "any::s"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::s"], [0, 1, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any::s"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any::s"], [0, 0, 1, "_CPPv46aranged14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46aranged5Dtype14StreamOrDevice", "arange"], [0, 0, 
1, "_CPPv46arangedd14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeddd14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangei14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeii14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeiii14StreamOrDevice", "arange"], [0, 1, 1, "_CPPv46aranged5Dtype14StreamOrDevice", "arange::dtype"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange::dtype"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::dtype"], [0, 1, 1, "_CPPv46aranged14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46aranged5Dtype14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangedd14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangei14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeii14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangedd14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeii14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::step"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::step"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::step"], [0, 1, 1, "_CPPv46aranged14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46aranged5Dtype14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangedd14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", 
"arange::stop"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangei14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangeii14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::stop"], [0, 0, 1, "_CPPv46arccosRK5array14StreamOrDevice", "arccos"], [0, 1, 1, "_CPPv46arccosRK5array14StreamOrDevice", "arccos::a"], [0, 1, 1, "_CPPv46arccosRK5array14StreamOrDevice", "arccos::s"], [0, 0, 1, "_CPPv47arccoshRK5array14StreamOrDevice", "arccosh"], [0, 1, 1, "_CPPv47arccoshRK5array14StreamOrDevice", "arccosh::a"], [0, 1, 1, "_CPPv47arccoshRK5array14StreamOrDevice", "arccosh::s"], [0, 0, 1, "_CPPv46arcsinRK5array14StreamOrDevice", "arcsin"], [0, 1, 1, "_CPPv46arcsinRK5array14StreamOrDevice", "arcsin::a"], [0, 1, 1, "_CPPv46arcsinRK5array14StreamOrDevice", "arcsin::s"], [0, 0, 1, "_CPPv47arcsinhRK5array14StreamOrDevice", "arcsinh"], [0, 1, 1, "_CPPv47arcsinhRK5array14StreamOrDevice", "arcsinh::a"], [0, 1, 1, "_CPPv47arcsinhRK5array14StreamOrDevice", "arcsinh::s"], [0, 0, 1, "_CPPv46arctanRK5array14StreamOrDevice", "arctan"], [0, 0, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2"], [0, 1, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2::a"], [0, 1, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2::b"], [0, 1, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2::s"], [0, 1, 1, "_CPPv46arctanRK5array14StreamOrDevice", "arctan::a"], [0, 1, 1, "_CPPv46arctanRK5array14StreamOrDevice", "arctan::s"], [0, 0, 1, "_CPPv47arctanhRK5array14StreamOrDevice", "arctanh"], [0, 1, 1, "_CPPv47arctanhRK5array14StreamOrDevice", "arctanh::a"], [0, 1, 1, "_CPPv47arctanhRK5array14StreamOrDevice", "arctanh::s"], [0, 0, 1, "_CPPv46argmaxRK5array14StreamOrDevice", "argmax"], [0, 0, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax"], [0, 0, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax"], [0, 1, 
1, "_CPPv46argmaxRK5array14StreamOrDevice", "argmax::a"], [0, 1, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax::a"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::a"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::axis"], [0, 1, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax::keepdims"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::keepdims"], [0, 1, 1, "_CPPv46argmaxRK5array14StreamOrDevice", "argmax::s"], [0, 1, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax::s"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::s"], [0, 0, 1, "_CPPv46argminRK5array14StreamOrDevice", "argmin"], [0, 0, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin"], [0, 0, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin"], [0, 1, 1, "_CPPv46argminRK5array14StreamOrDevice", "argmin::a"], [0, 1, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin::a"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::a"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::axis"], [0, 1, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin::keepdims"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::keepdims"], [0, 1, 1, "_CPPv46argminRK5array14StreamOrDevice", "argmin::s"], [0, 1, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin::s"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::s"], [0, 0, 1, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", "argpartition"], [0, 0, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition"], [0, 1, 1, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", "argpartition::a"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::a"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::axis"], [0, 1, 1, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", "argpartition::kth"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::kth"], [0, 1, 1, 
"_CPPv412argpartitionRK5arrayi14StreamOrDevice", "argpartition::s"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::s"], [0, 0, 1, "_CPPv47argsortRK5array14StreamOrDevice", "argsort"], [0, 0, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort"], [0, 1, 1, "_CPPv47argsortRK5array14StreamOrDevice", "argsort::a"], [0, 1, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort::a"], [0, 1, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort::axis"], [0, 1, 1, "_CPPv47argsortRK5array14StreamOrDevice", "argsort::s"], [0, 1, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort::s"], [0, 0, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal"], [0, 0, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal::a"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::a"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal::b"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::b"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::equal_nan"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal::s"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::s"], [0, 0, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::a"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::offset"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::s"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::shape"], [0, 1, 1, 
"_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::strides"], [0, 0, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype"], [0, 1, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype::a"], [0, 1, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype::dtype"], [0, 1, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype::s"], [0, 0, 1, "_CPPv410atleast_1dRK5array14StreamOrDevice", "atleast_1d"], [0, 0, 1, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_1d"], [0, 1, 1, "_CPPv410atleast_1dRK5array14StreamOrDevice", "atleast_1d::a"], [0, 1, 1, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_1d::a"], [0, 1, 1, "_CPPv410atleast_1dRK5array14StreamOrDevice", "atleast_1d::s"], [0, 1, 1, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_1d::s"], [0, 0, 1, "_CPPv410atleast_2dRK5array14StreamOrDevice", "atleast_2d"], [0, 0, 1, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_2d"], [0, 1, 1, "_CPPv410atleast_2dRK5array14StreamOrDevice", "atleast_2d::a"], [0, 1, 1, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_2d::a"], [0, 1, 1, "_CPPv410atleast_2dRK5array14StreamOrDevice", "atleast_2d::s"], [0, 1, 1, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_2d::s"], [0, 0, 1, "_CPPv410atleast_3dRK5array14StreamOrDevice", "atleast_3d"], [0, 0, 1, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_3d"], [0, 1, 1, "_CPPv410atleast_3dRK5array14StreamOrDevice", "atleast_3d::a"], [0, 1, 1, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_3d::a"], [0, 1, 1, "_CPPv410atleast_3dRK5array14StreamOrDevice", "atleast_3d::s"], [0, 1, 1, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_3d::s"], [0, 0, 1, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", "bitwise_and"], [0, 1, 1, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", "bitwise_and::a"], [0, 1, 1, 
"_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", "bitwise_and::b"], [0, 1, 1, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", "bitwise_and::s"], [0, 0, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or"], [0, 1, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or::a"], [0, 1, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or::b"], [0, 1, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or::s"], [0, 0, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor"], [0, 1, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor::a"], [0, 1, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor::b"], [0, 1, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor::s"], [0, 0, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::a"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::b"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::block_size"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::mask_lhs"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::mask_out"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::mask_rhs"], [0, 1, 1, 
"_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::s"], [0, 0, 1, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", "broadcast_arrays"], [0, 1, 1, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", "broadcast_arrays::inputs"], [0, 1, 1, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", "broadcast_arrays::s"], [0, 0, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to"], [0, 1, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to::a"], [0, 1, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to::s"], [0, 1, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to::shape"], [0, 0, 1, "_CPPv44ceilRK5array14StreamOrDevice", "ceil"], [0, 1, 1, "_CPPv44ceilRK5array14StreamOrDevice", "ceil::a"], [0, 1, 1, "_CPPv44ceilRK5array14StreamOrDevice", "ceil::s"], [0, 0, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::a"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::a_max"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::a_min"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::s"], [0, 0, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", "concatenate"], [0, 0, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", "concatenate::arrays"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate::arrays"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate::axis"], [0, 1, 
1, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", "concatenate::s"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate::s"], [0, 0, 1, "_CPPv49conjugateRK5array14StreamOrDevice", "conjugate"], [0, 1, 1, "_CPPv49conjugateRK5array14StreamOrDevice", "conjugate::a"], [0, 1, 1, "_CPPv49conjugateRK5array14StreamOrDevice", "conjugate::s"], [0, 0, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::dilation"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::groups"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::input"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::padding"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::s"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::stride"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::weight"], [0, 0, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::dilation"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::groups"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::input"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::padding"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::s"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::stride"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", 
"conv2d::weight"], [0, 0, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::dilation"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::groups"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::input"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::padding"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::s"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::stride"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::weight"], [0, 0, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general"], [0, 0, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::flip"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::flip"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::groups"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::groups"], [0, 1, 1, 
"_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input_dilation"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input_dilation"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::kernel_dilation"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::kernel_dilation"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::padding"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::padding_hi"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::padding_lo"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::s"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::s"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::stride"], [0, 1, 1, 
"_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::stride"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::weight"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::weight"], [0, 0, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::dilation"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::groups"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::input"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::padding"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::s"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::stride"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::weight"], [0, 0, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::dilation"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::groups"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::input"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::padding"], [0, 1, 1, 
"_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::s"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::stride"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::weight"], [0, 0, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::dilation"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::groups"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::input"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::padding"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::s"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::stride"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::weight"], [0, 0, 1, "_CPPv44copy5array14StreamOrDevice", "copy"], [0, 1, 1, "_CPPv44copy5array14StreamOrDevice", "copy::a"], [0, 1, 1, "_CPPv44copy5array14StreamOrDevice", "copy::s"], [0, 0, 1, "_CPPv43cosRK5array14StreamOrDevice", "cos"], [0, 1, 1, "_CPPv43cosRK5array14StreamOrDevice", "cos::a"], [0, 1, 1, "_CPPv43cosRK5array14StreamOrDevice", "cos::s"], [0, 0, 1, 
"_CPPv44coshRK5array14StreamOrDevice", "cosh"], [0, 1, 1, "_CPPv44coshRK5array14StreamOrDevice", "cosh::a"], [0, 1, 1, "_CPPv44coshRK5array14StreamOrDevice", "cosh::s"], [0, 0, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::a"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::axis"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::inclusive"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::reverse"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::s"], [0, 0, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::a"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::axis"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::inclusive"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::reverse"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::s"], [0, 0, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::a"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::axis"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::inclusive"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::reverse"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::s"], [0, 0, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::a"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::axis"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::inclusive"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::reverse"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::s"], [0, 0, 1, "_CPPv47degreesRK5array14StreamOrDevice", "degrees"], [0, 1, 1, "_CPPv47degreesRK5array14StreamOrDevice", 
"degrees::a"], [0, 1, 1, "_CPPv47degreesRK5array14StreamOrDevice", "degrees::s"], [0, 0, 1, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", "depends"], [0, 1, 1, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", "depends::dependencies"], [0, 1, 1, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", "depends::inputs"], [0, 0, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::biases"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::bits"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::group_size"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::s"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::scales"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::w"], [0, 0, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag"], [0, 1, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag::a"], [0, 1, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag::k"], [0, 1, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag::s"], [0, 0, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::a"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::axis1"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::axis2"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::offset"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::s"], [0, 0, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide"], [0, 1, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide::a"], [0, 1, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide::b"], [0, 1, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide::s"], [0, 0, 1, 
"_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod"], [0, 1, 1, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod::a"], [0, 1, 1, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod::b"], [0, 1, 1, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod::s"], [0, 0, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal"], [0, 1, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal::a"], [0, 1, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal::b"], [0, 1, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal::s"], [0, 0, 1, "_CPPv43erfRK5array14StreamOrDevice", "erf"], [0, 1, 1, "_CPPv43erfRK5array14StreamOrDevice", "erf::a"], [0, 1, 1, "_CPPv43erfRK5array14StreamOrDevice", "erf::s"], [0, 0, 1, "_CPPv46erfinvRK5array14StreamOrDevice", "erfinv"], [0, 1, 1, "_CPPv46erfinvRK5array14StreamOrDevice", "erfinv::a"], [0, 1, 1, "_CPPv46erfinvRK5array14StreamOrDevice", "erfinv::s"], [0, 0, 1, "_CPPv43expRK5array14StreamOrDevice", "exp"], [0, 1, 1, "_CPPv43expRK5array14StreamOrDevice", "exp::a"], [0, 1, 1, "_CPPv43expRK5array14StreamOrDevice", "exp::s"], [0, 0, 1, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims"], [0, 0, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims::a"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims::a"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims::axes"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims::axis"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims::s"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims::s"], [0, 0, 1, "_CPPv45expm1RK5array14StreamOrDevice", "expm1"], [0, 1, 1, "_CPPv45expm1RK5array14StreamOrDevice", "expm1::a"], [0, 1, 1, "_CPPv45expm1RK5array14StreamOrDevice", "expm1::s"], [0, 0, 1, 
"_CPPv43eyei14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyeii14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyeiii14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye"], [0, 1, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye::dtype"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::dtype"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::k"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::k"], [0, 1, 1, "_CPPv43eyeii14StreamOrDevice", "eye::m"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::m"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::m"], [0, 1, 1, "_CPPv43eyei14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyeii14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyei14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyeii14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::s"], [0, 0, 1, "_CPPv47flattenRK5array14StreamOrDevice", "flatten"], [0, 0, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten"], [0, 1, 1, "_CPPv47flattenRK5array14StreamOrDevice", "flatten::a"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten::a"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten::end_axis"], [0, 1, 1, "_CPPv47flattenRK5array14StreamOrDevice", "flatten::s"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten::s"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten::start_axis"], [0, 0, 1, "_CPPv45floorRK5array14StreamOrDevice", "floor"], [0, 1, 1, "_CPPv45floorRK5array14StreamOrDevice", "floor::a"], [0, 1, 1, "_CPPv45floorRK5array14StreamOrDevice", "floor::s"], [0, 0, 1, 
"_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide"], [0, 1, 1, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide::a"], [0, 1, 1, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide::b"], [0, 1, 1, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide::s"], [0, 0, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full"], [0, 0, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full"], [0, 0, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full"], [0, 0, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full"], [0, 2, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::T"], [0, 2, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::T"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::dtype"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::dtype"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full::shape"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::shape"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::shape"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::shape"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::val"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::val"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full::vals"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::vals"], [0, 0, 1, 
"_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather"], [0, 0, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::a"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::a"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::axes"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::axis"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::indices"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::indices"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::s"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::s"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::slice_sizes"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::slice_sizes"], [0, 0, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::a"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::b"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::lhs_indices"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::rhs_indices"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::s"], [0, 0, 1, 
"_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::biases"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::bits"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::group_size"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::lhs_indices"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::rhs_indices"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::s"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::scales"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::transpose"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::w"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::x"], [0, 0, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater"], [0, 1, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater::a"], [0, 1, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater::b"], [0, 1, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater::s"], [0, 0, 1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal"], [0, 1, 
1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal::a"], [0, 1, 1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal::b"], [0, 1, 1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal::s"], [0, 0, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform"], [0, 1, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform::a"], [0, 1, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform::s"], [0, 1, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform::scale"], [0, 0, 1, "_CPPv48identityi14StreamOrDevice", "identity"], [0, 0, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity"], [0, 1, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity::dtype"], [0, 1, 1, "_CPPv48identityi14StreamOrDevice", "identity::n"], [0, 1, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity::n"], [0, 1, 1, "_CPPv48identityi14StreamOrDevice", "identity::s"], [0, 1, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity::s"], [0, 0, 1, "_CPPv44imagRK5array14StreamOrDevice", "imag"], [0, 1, 1, "_CPPv44imagRK5array14StreamOrDevice", "imag::a"], [0, 1, 1, "_CPPv44imagRK5array14StreamOrDevice", "imag::s"], [0, 0, 1, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner"], [0, 1, 1, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner::a"], [0, 1, 1, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner::b"], [0, 1, 1, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner::s"], [0, 0, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::a"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::atol"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::b"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::equal_nan"], 
[0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::rtol"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::s"], [0, 0, 1, "_CPPv48isfiniteRK5array14StreamOrDevice", "isfinite"], [0, 1, 1, "_CPPv48isfiniteRK5array14StreamOrDevice", "isfinite::a"], [0, 1, 1, "_CPPv48isfiniteRK5array14StreamOrDevice", "isfinite::s"], [0, 0, 1, "_CPPv45isinfRK5array14StreamOrDevice", "isinf"], [0, 1, 1, "_CPPv45isinfRK5array14StreamOrDevice", "isinf::a"], [0, 1, 1, "_CPPv45isinfRK5array14StreamOrDevice", "isinf::s"], [0, 0, 1, "_CPPv45isnanRK5array14StreamOrDevice", "isnan"], [0, 1, 1, "_CPPv45isnanRK5array14StreamOrDevice", "isnan::a"], [0, 1, 1, "_CPPv45isnanRK5array14StreamOrDevice", "isnan::s"], [0, 0, 1, "_CPPv48isneginfRK5array14StreamOrDevice", "isneginf"], [0, 1, 1, "_CPPv48isneginfRK5array14StreamOrDevice", "isneginf::a"], [0, 1, 1, "_CPPv48isneginfRK5array14StreamOrDevice", "isneginf::s"], [0, 0, 1, "_CPPv48isposinfRK5array14StreamOrDevice", "isposinf"], [0, 1, 1, "_CPPv48isposinfRK5array14StreamOrDevice", "isposinf::a"], [0, 1, 1, "_CPPv48isposinfRK5array14StreamOrDevice", "isposinf::s"], [0, 0, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift"], [0, 1, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift::a"], [0, 1, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift::b"], [0, 1, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift::s"], [0, 0, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less"], [0, 1, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less::a"], [0, 1, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less::b"], [0, 1, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less::s"], [0, 0, 1, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal"], [0, 1, 1, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal::a"], [0, 1, 1, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal::b"], [0, 1, 1, 
"_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal::s"], [0, 0, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::dtype"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::num"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::s"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::start"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::stop"], [0, 0, 1, "_CPPv43logRK5array14StreamOrDevice", "log"], [0, 0, 1, "_CPPv45log10RK5array14StreamOrDevice", "log10"], [0, 1, 1, "_CPPv45log10RK5array14StreamOrDevice", "log10::a"], [0, 1, 1, "_CPPv45log10RK5array14StreamOrDevice", "log10::s"], [0, 0, 1, "_CPPv45log1pRK5array14StreamOrDevice", "log1p"], [0, 1, 1, "_CPPv45log1pRK5array14StreamOrDevice", "log1p::a"], [0, 1, 1, "_CPPv45log1pRK5array14StreamOrDevice", "log1p::s"], [0, 0, 1, "_CPPv44log2RK5array14StreamOrDevice", "log2"], [0, 1, 1, "_CPPv44log2RK5array14StreamOrDevice", "log2::a"], [0, 1, 1, "_CPPv44log2RK5array14StreamOrDevice", "log2::s"], [0, 1, 1, "_CPPv43logRK5array14StreamOrDevice", "log::a"], [0, 1, 1, "_CPPv43logRK5array14StreamOrDevice", "log::s"], [0, 0, 1, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp"], [0, 1, 1, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp::a"], [0, 1, 1, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp::b"], [0, 1, 1, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp::s"], [0, 0, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and"], [0, 1, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and::a"], [0, 1, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and::b"], [0, 1, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and::s"], [0, 0, 1, "_CPPv411logical_notRK5array14StreamOrDevice", "logical_not"], [0, 1, 1, 
"_CPPv411logical_notRK5array14StreamOrDevice", "logical_not::a"], [0, 1, 1, "_CPPv411logical_notRK5array14StreamOrDevice", "logical_not::s"], [0, 0, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or"], [0, 1, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or::a"], [0, 1, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or::b"], [0, 1, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or::s"], [0, 0, 1, "_CPPv49logsumexpRK5array14StreamOrDevice", "logsumexp"], [0, 0, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp"], [0, 0, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp"], [0, 0, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp"], [0, 1, 1, "_CPPv49logsumexpRK5array14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::axes"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::axis"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::keepdims"], [0, 1, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp::keepdims"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::keepdims"], [0, 1, 1, "_CPPv49logsumexpRK5array14StreamOrDevice", "logsumexp::s"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::s"], [0, 1, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp::s"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::s"], [0, 0, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", "matmul"], [0, 1, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", "matmul::a"], [0, 1, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", 
"matmul::b"], [0, 1, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", "matmul::s"], [0, 0, 1, "_CPPv43maxRK5array14StreamOrDevice", "max"], [0, 0, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max"], [0, 0, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max"], [0, 0, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max"], [0, 1, 1, "_CPPv43maxRK5array14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::axes"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::axis"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::keepdims"], [0, 1, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max::keepdims"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::keepdims"], [0, 1, 1, "_CPPv43maxRK5array14StreamOrDevice", "max::s"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::s"], [0, 1, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max::s"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::s"], [0, 0, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum"], [0, 1, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum::a"], [0, 1, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum::b"], [0, 1, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum::s"], [0, 0, 1, "_CPPv44meanRK5array14StreamOrDevice", "mean"], [0, 0, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean"], [0, 0, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean"], [0, 0, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean"], [0, 1, 1, "_CPPv44meanRK5array14StreamOrDevice", "mean::a"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::a"], [0, 1, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean::a"], [0, 1, 1, 
"_CPPv44meanRK5arrayib14StreamOrDevice", "mean::a"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::axes"], [0, 1, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean::axis"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::keepdims"], [0, 1, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean::keepdims"], [0, 1, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean::keepdims"], [0, 1, 1, "_CPPv44meanRK5array14StreamOrDevice", "mean::s"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::s"], [0, 1, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean::s"], [0, 1, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean::s"], [0, 0, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::arrays"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::indexing"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::s"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::sparse"], [0, 0, 1, "_CPPv43minRK5array14StreamOrDevice", "min"], [0, 0, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min"], [0, 0, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min"], [0, 0, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min"], [0, 1, 1, "_CPPv43minRK5array14StreamOrDevice", "min::a"], [0, 1, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::a"], [0, 1, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min::a"], [0, 1, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min::a"], [0, 1, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::axes"], [0, 1, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min::axis"], [0, 1, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::keepdims"], [0, 1, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min::keepdims"], [0, 1, 1, 
"_CPPv43minRK5arrayib14StreamOrDevice", "min::keepdims"], [0, 1, 1, "_CPPv43minRK5array14StreamOrDevice", "min::s"], [0, 1, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::s"], [0, 1, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min::s"], [0, 1, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min::s"], [0, 0, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum"], [0, 1, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum::a"], [0, 1, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum::b"], [0, 1, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum::s"], [0, 0, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::a"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::destination"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::s"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::source"], [0, 0, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply"], [0, 1, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply::a"], [0, 1, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply::b"], [0, 1, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply::s"], [0, 0, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::a"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::nan"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::neginf"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::posinf"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::s"], [0, 0, 1, "_CPPv48negativeRK5array14StreamOrDevice", 
"negative"], [0, 1, 1, "_CPPv48negativeRK5array14StreamOrDevice", "negative::a"], [0, 1, 1, "_CPPv48negativeRK5array14StreamOrDevice", "negative::s"], [0, 0, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal"], [0, 1, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal::a"], [0, 1, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal::b"], [0, 1, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal::s"], [0, 0, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::a"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::axes"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::dtype"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::inverted"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::s"], [0, 0, 1, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", "ones"], [0, 0, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones::dtype"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", "ones::s"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones::s"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", "ones::shape"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones::shape"], [0, 0, 1, "_CPPv49ones_likeRK5array14StreamOrDevice", "ones_like"], [0, 1, 1, "_CPPv49ones_likeRK5array14StreamOrDevice", "ones_like::a"], [0, 1, 1, "_CPPv49ones_likeRK5array14StreamOrDevice", "ones_like::s"], [0, 0, 1, "_CPPv4I0Ene5array1TRK5array", "operator!="], [0, 0, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!="], [0, 0, 1, 
"_CPPv4neRK5arrayRK5array", "operator!="], [0, 2, 1, "_CPPv4I0Ene5array1TRK5array", "operator!=::T"], [0, 2, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!=::T"], [0, 1, 1, "_CPPv4I0Ene5array1TRK5array", "operator!=::a"], [0, 1, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!=::a"], [0, 1, 1, "_CPPv4neRK5arrayRK5array", "operator!=::a"], [0, 1, 1, "_CPPv4I0Ene5array1TRK5array", "operator!=::b"], [0, 1, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!=::b"], [0, 1, 1, "_CPPv4neRK5arrayRK5array", "operator!=::b"], [0, 0, 1, "_CPPv4I0Erm5array1TRK5array", "operator%"], [0, 0, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%"], [0, 0, 1, "_CPPv4rmRK5arrayRK5array", "operator%"], [0, 2, 1, "_CPPv4I0Erm5array1TRK5array", "operator%::T"], [0, 2, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%::T"], [0, 1, 1, "_CPPv4I0Erm5array1TRK5array", "operator%::a"], [0, 1, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%::a"], [0, 1, 1, "_CPPv4rmRK5arrayRK5array", "operator%::a"], [0, 1, 1, "_CPPv4I0Erm5array1TRK5array", "operator%::b"], [0, 1, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%::b"], [0, 1, 1, "_CPPv4rmRK5arrayRK5array", "operator%::b"], [0, 0, 1, "_CPPv4anRK5arrayRK5array", "operator&amp;"], [0, 0, 1, "_CPPv4aaRK5arrayRK5array", "operator&amp;&amp;"], [0, 1, 1, "_CPPv4aaRK5arrayRK5array", "operator&amp;&amp;::a"], [0, 1, 1, "_CPPv4aaRK5arrayRK5array", "operator&amp;&amp;::b"], [0, 1, 1, "_CPPv4anRK5arrayRK5array", "operator&amp;::a"], [0, 1, 1, "_CPPv4anRK5arrayRK5array", "operator&amp;::b"], [0, 0, 1, "_CPPv4I0Eml5array1TRK5array", "operator*"], [0, 0, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*"], [0, 0, 1, "_CPPv4mlRK5arrayRK5array", "operator*"], [0, 2, 1, "_CPPv4I0Eml5array1TRK5array", "operator*::T"], [0, 2, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*::T"], [0, 1, 1, "_CPPv4I0Eml5array1TRK5array", "operator*::a"], [0, 1, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*::a"], [0, 1, 1, "_CPPv4mlRK5arrayRK5array", "operator*::a"], [0, 1, 1, "_CPPv4I0Eml5array1TRK5array", "operator*::b"], 
[0, 1, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*::b"], [0, 1, 1, "_CPPv4mlRK5arrayRK5array", "operator*::b"], [0, 0, 1, "_CPPv4I0Epl5array1TRK5array", "operator+"], [0, 0, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+"], [0, 0, 1, "_CPPv4plRK5arrayRK5array", "operator+"], [0, 2, 1, "_CPPv4I0Epl5array1TRK5array", "operator+::T"], [0, 2, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+::T"], [0, 1, 1, "_CPPv4I0Epl5array1TRK5array", "operator+::a"], [0, 1, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+::a"], [0, 1, 1, "_CPPv4plRK5arrayRK5array", "operator+::a"], [0, 1, 1, "_CPPv4I0Epl5array1TRK5array", "operator+::b"], [0, 1, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+::b"], [0, 1, 1, "_CPPv4plRK5arrayRK5array", "operator+::b"], [0, 0, 1, "_CPPv4I0Emi5array1TRK5array", "operator-"], [0, 0, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-"], [0, 0, 1, "_CPPv4miRK5array", "operator-"], [0, 0, 1, "_CPPv4miRK5arrayRK5array", "operator-"], [0, 2, 1, "_CPPv4I0Emi5array1TRK5array", "operator-::T"], [0, 2, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-::T"], [0, 1, 1, "_CPPv4I0Emi5array1TRK5array", "operator-::a"], [0, 1, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-::a"], [0, 1, 1, "_CPPv4miRK5array", "operator-::a"], [0, 1, 1, "_CPPv4miRK5arrayRK5array", "operator-::a"], [0, 1, 1, "_CPPv4I0Emi5array1TRK5array", "operator-::b"], [0, 1, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-::b"], [0, 1, 1, "_CPPv4miRK5arrayRK5array", "operator-::b"], [0, 0, 1, "_CPPv4dvRK5arrayRK5array", "operator/"], [0, 0, 1, "_CPPv4dvRK5arrayd", "operator/"], [0, 0, 1, "_CPPv4dvdRK5array", "operator/"], [0, 1, 1, "_CPPv4dvRK5arrayRK5array", "operator/::a"], [0, 1, 1, "_CPPv4dvRK5arrayd", "operator/::a"], [0, 1, 1, "_CPPv4dvdRK5array", "operator/::a"], [0, 1, 1, "_CPPv4dvRK5arrayRK5array", "operator/::b"], [0, 1, 1, "_CPPv4dvRK5arrayd", "operator/::b"], [0, 1, 1, "_CPPv4dvdRK5array", "operator/::b"], [0, 0, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;"], [0, 0, 1, "_CPPv4I0Elt5arrayRK5array1T", 
"operator&lt;"], [0, 0, 1, "_CPPv4ltRK5arrayRK5array", "operator&lt;"], [0, 2, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;::T"], [0, 2, 1, "_CPPv4I0Elt5arrayRK5array1T", "operator&lt;::T"], [0, 1, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;::a"], [0, 1, 1, "_CPPv4I0Elt5arrayRK5array1T", "operator&lt;::a"], [0, 1, 1, "_CPPv4ltRK5arrayRK5array", "operator&lt;::a"], [0, 1, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;::b"], [0, 1, 1, "_CPPv4I0Elt5arrayRK5array1T", "operator&lt;::b"], [0, 1, 1, "_CPPv4ltRK5arrayRK5array", "operator&lt;::b"], [0, 0, 1, "_CPPv4lsRK5arrayRK5array", "operator&lt;&lt;"], [0, 1, 1, "_CPPv4lsRK5arrayRK5array", "operator&lt;&lt;::a"], [0, 1, 1, "_CPPv4lsRK5arrayRK5array", "operator&lt;&lt;::b"], [0, 0, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;="], [0, 0, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;="], [0, 0, 1, "_CPPv4leRK5arrayRK5array", "operator&lt;="], [0, 2, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;=::T"], [0, 2, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;=::T"], [0, 1, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;=::a"], [0, 1, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;=::a"], [0, 1, 1, "_CPPv4leRK5arrayRK5array", "operator&lt;=::a"], [0, 1, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;=::b"], [0, 1, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;=::b"], [0, 1, 1, "_CPPv4leRK5arrayRK5array", "operator&lt;=::b"], [0, 0, 1, "_CPPv4I0Eeq5array1TRK5array", "operator=="], [0, 0, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator=="], [0, 0, 1, "_CPPv4eqRK5arrayRK5array", "operator=="], [0, 2, 1, "_CPPv4I0Eeq5array1TRK5array", "operator==::T"], [0, 2, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator==::T"], [0, 1, 1, "_CPPv4I0Eeq5array1TRK5array", "operator==::a"], [0, 1, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator==::a"], [0, 1, 1, "_CPPv4eqRK5arrayRK5array", "operator==::a"], [0, 1, 1, "_CPPv4I0Eeq5array1TRK5array", "operator==::b"], [0, 1, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator==::b"], [0, 1, 1, 
"_CPPv4eqRK5arrayRK5array", "operator==::b"], [0, 0, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;"], [0, 0, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;"], [0, 0, 1, "_CPPv4gtRK5arrayRK5array", "operator&gt;"], [0, 2, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;::T"], [0, 2, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;::T"], [0, 1, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;::a"], [0, 1, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;::a"], [0, 1, 1, "_CPPv4gtRK5arrayRK5array", "operator&gt;::a"], [0, 1, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;::b"], [0, 1, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;::b"], [0, 1, 1, "_CPPv4gtRK5arrayRK5array", "operator&gt;::b"], [0, 0, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;="], [0, 0, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;="], [0, 0, 1, "_CPPv4geRK5arrayRK5array", "operator&gt;="], [0, 2, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;=::T"], [0, 2, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;=::T"], [0, 1, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;=::a"], [0, 1, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;=::a"], [0, 1, 1, "_CPPv4geRK5arrayRK5array", "operator&gt;=::a"], [0, 1, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;=::b"], [0, 1, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;=::b"], [0, 1, 1, "_CPPv4geRK5arrayRK5array", "operator&gt;=::b"], [0, 0, 1, "_CPPv4rsRK5arrayRK5array", "operator&gt;&gt;"], [0, 1, 1, "_CPPv4rsRK5arrayRK5array", "operator&gt;&gt;::a"], [0, 1, 1, "_CPPv4rsRK5arrayRK5array", "operator&gt;&gt;::b"], [0, 0, 1, "_CPPv4eoRK5arrayRK5array", "operator^"], [0, 1, 1, "_CPPv4eoRK5arrayRK5array", "operator^::a"], [0, 1, 1, "_CPPv4eoRK5arrayRK5array", "operator^::b"], [0, 0, 1, "_CPPv4orRK5arrayRK5array", "operator|"], [0, 1, 1, "_CPPv4orRK5arrayRK5array", "operator|::a"], [0, 1, 1, "_CPPv4orRK5arrayRK5array", "operator|::b"], [0, 0, 1, "_CPPv4ooRK5arrayRK5array", "operator||"], [0, 1, 1, "_CPPv4ooRK5arrayRK5array", "operator||::a"], [0, 1, 1, 
"_CPPv4ooRK5arrayRK5array", "operator||::b"], [0, 0, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer"], [0, 1, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer::a"], [0, 1, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer::b"], [0, 1, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer::s"], [0, 0, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 0, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 0, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 0, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::axes"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::high_pad_size"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::low_pad_size"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, 
"_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_width"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_width"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_width"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 0, 1, "_CPPv49partitionRK5arrayi14StreamOrDevice", "partition"], [0, 0, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition"], [0, 1, 1, "_CPPv49partitionRK5arrayi14StreamOrDevice", "partition::a"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::a"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::axis"], [0, 1, 1, "_CPPv49partitionRK5arrayi14StreamOrDevice", "partition::kth"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::kth"], [0, 1, 1, "_CPPv49partitionRK5arrayi14StreamOrDevice", "partition::s"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::s"], [0, 0, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", "power"], [0, 1, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", "power::a"], [0, 1, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", 
"power::b"], [0, 1, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", "power::s"], [0, 0, 1, "_CPPv44prodRK5array14StreamOrDevice", "prod"], [0, 0, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod"], [0, 0, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod"], [0, 0, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod"], [0, 1, 1, "_CPPv44prodRK5array14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::axes"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::axis"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::keepdims"], [0, 1, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod::keepdims"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::keepdims"], [0, 1, 1, "_CPPv44prodRK5array14StreamOrDevice", "prod::s"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::s"], [0, 1, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod::s"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::s"], [0, 0, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::a"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::axis"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::indices"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::s"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::values"], [0, 0, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize"], [0, 1, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::bits"], [0, 1, 1, 
"_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::group_size"], [0, 1, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::s"], [0, 1, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::w"], [0, 0, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::biases"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::bits"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::group_size"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::s"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::scales"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::transpose"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::w"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::x"], [0, 0, 1, "_CPPv47radiansRK5array14StreamOrDevice", "radians"], [0, 1, 1, "_CPPv47radiansRK5array14StreamOrDevice", "radians::a"], [0, 1, 1, "_CPPv47radiansRK5array14StreamOrDevice", "radians::s"], [0, 0, 1, "_CPPv44realRK5array14StreamOrDevice", "real"], [0, 1, 1, "_CPPv44realRK5array14StreamOrDevice", "real::a"], [0, 1, 1, "_CPPv44realRK5array14StreamOrDevice", "real::s"], [0, 0, 1, "_CPPv410reciprocalRK5array14StreamOrDevice", "reciprocal"], [0, 1, 1, "_CPPv410reciprocalRK5array14StreamOrDevice", "reciprocal::a"], [0, 1, 1, "_CPPv410reciprocalRK5array14StreamOrDevice", "reciprocal::s"], [0, 0, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", "remainder"], [0, 1, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", "remainder::a"], [0, 1, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", 
"remainder::b"], [0, 1, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", "remainder::s"], [0, 0, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat"], [0, 0, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat"], [0, 1, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat::arr"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::arr"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::axis"], [0, 1, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat::repeats"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::repeats"], [0, 1, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat::s"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::s"], [0, 0, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape"], [0, 1, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape::a"], [0, 1, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape::s"], [0, 1, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape::shape"], [0, 0, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift"], [0, 1, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift::a"], [0, 1, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift::b"], [0, 1, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift::s"], [0, 0, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::a"], [0, 1, 1, 
"_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::axes"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::axes"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::axis"], [0, 1, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll::axis"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll::shift"], [0, 0, 1, "_CPPv45roundRK5array14StreamOrDevice", "round"], [0, 0, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round"], [0, 1, 1, "_CPPv45roundRK5array14StreamOrDevice", "round::a"], [0, 1, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round::a"], [0, 1, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round::decimals"], [0, 1, 1, "_CPPv45roundRK5array14StreamOrDevice", "round::s"], [0, 1, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round::s"], [0, 0, 1, 
"_CPPv45rsqrtRK5array14StreamOrDevice", "rsqrt"], [0, 1, 1, "_CPPv45rsqrtRK5array14StreamOrDevice", "rsqrt::a"], [0, 1, 1, "_CPPv45rsqrtRK5array14StreamOrDevice", "rsqrt::s"], [0, 0, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter"], [0, 0, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::a"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::a"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::axes"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::axis"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::indices"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::indices"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::s"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::s"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::updates"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::updates"], [0, 0, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add"], [0, 0, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::a"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::a"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::axes"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", 
"scatter_add::axis"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::indices"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::indices"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::s"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::s"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::updates"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::updates"], [0, 0, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max"], [0, 0, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::a"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::a"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::axes"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::axis"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::indices"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::indices"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::s"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::s"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::updates"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::updates"], [0, 
0, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min"], [0, 0, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::a"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::a"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::axes"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::axis"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::indices"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::indices"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::s"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::s"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::updates"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::updates"], [0, 0, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod"], [0, 0, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::a"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::a"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::axes"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::axis"], [0, 1, 1, 
"_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::indices"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::indices"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::s"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::s"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::updates"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::updates"], [0, 0, 1, "_CPPv47sigmoidRK5array14StreamOrDevice", "sigmoid"], [0, 1, 1, "_CPPv47sigmoidRK5array14StreamOrDevice", "sigmoid::a"], [0, 1, 1, "_CPPv47sigmoidRK5array14StreamOrDevice", "sigmoid::s"], [0, 0, 1, "_CPPv44signRK5array14StreamOrDevice", "sign"], [0, 1, 1, "_CPPv44signRK5array14StreamOrDevice", "sign::a"], [0, 1, 1, "_CPPv44signRK5array14StreamOrDevice", "sign::s"], [0, 0, 1, "_CPPv43sinRK5array14StreamOrDevice", "sin"], [0, 1, 1, "_CPPv43sinRK5array14StreamOrDevice", "sin::a"], [0, 1, 1, "_CPPv43sinRK5array14StreamOrDevice", "sin::s"], [0, 0, 1, "_CPPv44sinhRK5array14StreamOrDevice", "sinh"], [0, 1, 1, "_CPPv44sinhRK5array14StreamOrDevice", "sinh::a"], [0, 1, 1, "_CPPv44sinhRK5array14StreamOrDevice", "sinh::s"], [0, 0, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice"], [0, 0, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::a"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::a"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::s"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::s"], [0, 1, 1, 
"_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::start"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::start"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::stop"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::stop"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::strides"], [0, 0, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update"], [0, 0, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::s"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::s"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::src"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::src"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::start"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::start"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::stop"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::stop"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::strides"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::update"], [0, 1, 1, 
"_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::update"], [0, 0, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax"], [0, 0, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax"], [0, 0, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::a"], [0, 1, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax::a"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::a"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::axes"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::axis"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::precise"], [0, 1, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax::precise"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::precise"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::s"], [0, 1, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax::s"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::s"], [0, 0, 1, "_CPPv44sortRK5array14StreamOrDevice", "sort"], [0, 0, 1, "_CPPv44sortRK5arrayi14StreamOrDevice", "sort"], [0, 1, 1, "_CPPv44sortRK5array14StreamOrDevice", "sort::a"], [0, 1, 1, "_CPPv44sortRK5arrayi14StreamOrDevice", "sort::a"], [0, 1, 1, "_CPPv44sortRK5arrayi14StreamOrDevice", "sort::axis"], [0, 1, 1, "_CPPv44sortRK5array14StreamOrDevice", "sort::s"], [0, 1, 1, "_CPPv44sortRK5arrayi14StreamOrDevice", "sort::s"], [0, 0, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split"], [0, 0, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split"], [0, 0, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split"], [0, 0, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split::a"], [0, 1, 1, 
"_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::a"], [0, 1, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split::a"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::a"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::axis"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::axis"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split::indices"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::indices"], [0, 1, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split::num_splits"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::num_splits"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split::s"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::s"], [0, 1, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split::s"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::s"], [0, 0, 1, "_CPPv44sqrtRK5array14StreamOrDevice", "sqrt"], [0, 1, 1, "_CPPv44sqrtRK5array14StreamOrDevice", "sqrt::a"], [0, 1, 1, "_CPPv44sqrtRK5array14StreamOrDevice", "sqrt::s"], [0, 0, 1, "_CPPv46squareRK5array14StreamOrDevice", "square"], [0, 1, 1, "_CPPv46squareRK5array14StreamOrDevice", "square::a"], [0, 1, 1, "_CPPv46squareRK5array14StreamOrDevice", "square::s"], [0, 0, 1, "_CPPv47squeezeRK5array14StreamOrDevice", "squeeze"], [0, 0, 1, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze"], [0, 0, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze"], [0, 1, 1, "_CPPv47squeezeRK5array14StreamOrDevice", "squeeze::a"], [0, 1, 1, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze::a"], [0, 1, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze::a"], [0, 1, 1, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze::axes"], [0, 1, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze::axis"], [0, 1, 1, "_CPPv47squeezeRK5array14StreamOrDevice", "squeeze::s"], [0, 1, 1, 
"_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze::s"], [0, 1, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze::s"], [0, 0, 1, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", "stack"], [0, 0, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", "stack::arrays"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack::arrays"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack::axis"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", "stack::s"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack::s"], [0, 0, 1, "_CPPv4StRK5array14StreamOrDevice", "std"], [0, 0, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std"], [0, 0, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std"], [0, 0, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std"], [0, 1, 1, "_CPPv4StRK5array14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::axes"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::axis"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::ddof"], [0, 1, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std::ddof"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::ddof"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::keepdims"], [0, 1, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std::keepdims"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::keepdims"], [0, 1, 1, "_CPPv4StRK5array14StreamOrDevice", "std::s"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::s"], [0, 1, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std::s"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::s"], [0, 0, 1, 
"_CPPv413stop_gradientRK5array14StreamOrDevice", "stop_gradient"], [0, 1, 1, "_CPPv413stop_gradientRK5array14StreamOrDevice", "stop_gradient::a"], [0, 1, 1, "_CPPv413stop_gradientRK5array14StreamOrDevice", "stop_gradient::s"], [0, 0, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract"], [0, 1, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract::a"], [0, 1, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract::b"], [0, 1, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract::s"], [0, 0, 1, "_CPPv43sumRK5array14StreamOrDevice", "sum"], [0, 0, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum"], [0, 0, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum"], [0, 0, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum"], [0, 1, 1, "_CPPv43sumRK5array14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::axes"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::axis"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::keepdims"], [0, 1, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum::keepdims"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::keepdims"], [0, 1, 1, "_CPPv43sumRK5array14StreamOrDevice", "sum::s"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::s"], [0, 1, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum::s"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::s"], [0, 0, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes::a"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes::axis1"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes::axis2"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", 
"swapaxes::s"], [0, 0, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take"], [0, 0, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take"], [0, 0, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take"], [0, 0, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take"], [0, 1, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::axis"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::axis"], [0, 1, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take::index"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::index"], [0, 1, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take::indices"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::indices"], [0, 1, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take::s"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::s"], [0, 1, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take::s"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::s"], [0, 0, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis"], [0, 1, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::a"], [0, 1, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::axis"], [0, 1, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::indices"], [0, 1, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::s"], [0, 0, 1, "_CPPv43tanRK5array14StreamOrDevice", "tan"], [0, 1, 1, "_CPPv43tanRK5array14StreamOrDevice", "tan::a"], [0, 1, 1, "_CPPv43tanRK5array14StreamOrDevice", "tan::s"], [0, 0, 1, "_CPPv44tanhRK5array14StreamOrDevice", "tanh"], [0, 1, 1, "_CPPv44tanhRK5array14StreamOrDevice", "tanh::a"], [0, 1, 1, 
"_CPPv44tanhRK5array14StreamOrDevice", "tanh::s"], [0, 0, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot"], [0, 0, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::a"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::a"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::axes_a"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::axes_b"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::axis"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::b"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::b"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::s"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::s"], [0, 0, 1, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile"], [0, 1, 1, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile::arr"], [0, 1, 1, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile::reps"], [0, 1, 1, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile::s"], [0, 0, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk"], [0, 0, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk"], [0, 1, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk::a"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::a"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::axis"], [0, 1, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk::k"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::k"], [0, 1, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk::s"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::s"], [0, 0, 
1, "_CPPv45traceRK5array14StreamOrDevice", "trace"], [0, 0, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace"], [0, 0, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace"], [0, 1, 1, "_CPPv45traceRK5array14StreamOrDevice", "trace::a"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::a"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::a"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::axis1"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::axis1"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::axis2"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::axis2"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::dtype"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::offset"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::offset"], [0, 1, 1, "_CPPv45traceRK5array14StreamOrDevice", "trace::s"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::s"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::s"], [0, 0, 1, "_CPPv49transposeRK5array14StreamOrDevice", "transpose"], [0, 0, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose"], [0, 0, 1, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose"], [0, 1, 1, "_CPPv49transposeRK5array14StreamOrDevice", "transpose::a"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose::a"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose::a"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose::axes"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose::axes"], [0, 1, 1, "_CPPv49transposeRK5array14StreamOrDevice", "transpose::s"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose::s"], [0, 1, 1, 
"_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose::s"], [0, 0, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri"], [0, 0, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::k"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::m"], [0, 1, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri::n"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::n"], [0, 1, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri::s"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::s"], [0, 1, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri::type"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::type"], [0, 0, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril"], [0, 1, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril::k"], [0, 1, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril::s"], [0, 1, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril::x"], [0, 0, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu"], [0, 1, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu::k"], [0, 1, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu::s"], [0, 1, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu::x"], [0, 0, 1, "_CPPv43varRK5array14StreamOrDevice", "var"], [0, 0, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var"], [0, 0, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var"], [0, 0, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var"], [0, 1, 1, "_CPPv43varRK5array14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::axes"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::axis"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::ddof"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::ddof"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", 
"var::ddof"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::keepdims"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::keepdims"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::keepdims"], [0, 1, 1, "_CPPv43varRK5array14StreamOrDevice", "var::s"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::s"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::s"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::s"], [0, 0, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view"], [0, 1, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view::a"], [0, 1, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view::dtype"], [0, 1, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view::s"], [0, 0, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::condition"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::s"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::x"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::y"], [0, 0, 1, "_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", "zeros"], [0, 0, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros::dtype"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", "zeros::s"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros::s"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", "zeros::shape"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros::shape"], [0, 0, 1, "_CPPv410zeros_likeRK5array14StreamOrDevice", "zeros_like"], [0, 1, 1, "_CPPv410zeros_likeRK5array14StreamOrDevice", "zeros_like::a"], [0, 1, 1, "_CPPv410zeros_likeRK5array14StreamOrDevice", "zeros_like::s"]], "mlx.core": [[9, 3, 1, "", "Device"], [10, 3, 1, "", "Dtype"], [11, 3, 1, "", 
"DtypeCategory"], [315, 3, 1, "", "Stream"], [12, 5, 1, "", "abs"], [13, 5, 1, "", "add"], [14, 5, 1, "", "addmm"], [15, 5, 1, "", "all"], [16, 5, 1, "", "allclose"], [17, 5, 1, "", "any"], [18, 5, 1, "", "arange"], [19, 5, 1, "", "arccos"], [20, 5, 1, "", "arccosh"], [21, 5, 1, "", "arcsin"], [22, 5, 1, "", "arcsinh"], [23, 5, 1, "", "arctan"], [24, 5, 1, "", "arctan2"], [25, 5, 1, "", "arctanh"], [26, 5, 1, "", "argmax"], [27, 5, 1, "", "argmin"], [28, 5, 1, "", "argpartition"], [29, 5, 1, "", "argsort"], [30, 3, 1, "", "array"], [82, 5, 1, "", "array_equal"], [83, 5, 1, "", "as_strided"], [84, 5, 1, "", "atleast_1d"], [85, 5, 1, "", "atleast_2d"], [86, 5, 1, "", "atleast_3d"], [87, 5, 1, "", "bitwise_and"], [88, 5, 1, "", "bitwise_or"], [89, 5, 1, "", "bitwise_xor"], [90, 5, 1, "", "block_masked_mm"], [91, 5, 1, "", "broadcast_to"], [92, 5, 1, "", "ceil"], [93, 5, 1, "", "clip"], [94, 5, 1, "", "compile"], [95, 5, 1, "", "concatenate"], [96, 5, 1, "", "conj"], [97, 5, 1, "", "conjugate"], [98, 5, 1, "", "conv1d"], [99, 5, 1, "", "conv2d"], [100, 5, 1, "", "conv3d"], [101, 5, 1, "", "conv_general"], [102, 5, 1, "", "conv_transpose1d"], [103, 5, 1, "", "conv_transpose2d"], [104, 5, 1, "", "conv_transpose3d"], [105, 5, 1, "", "convolve"], [106, 5, 1, "", "cos"], [107, 5, 1, "", "cosh"], [108, 5, 1, "", "cummax"], [109, 5, 1, "", "cummin"], [110, 5, 1, "", "cumprod"], [111, 5, 1, "", "cumsum"], [112, 3, 1, "", "custom_function"], [113, 5, 1, "", "default_device"], [114, 5, 1, "", "default_stream"], [115, 5, 1, "", "degrees"], [116, 5, 1, "", "dequantize"], [117, 5, 1, "", "diag"], [118, 5, 1, "", "diagonal"], [119, 5, 1, "", "disable_compile"], [128, 5, 1, "", "divide"], [129, 5, 1, "", "divmod"], [130, 5, 1, "", "einsum"], [131, 5, 1, "", "einsum_path"], [132, 5, 1, "", "enable_compile"], [133, 5, 1, "", "equal"], [134, 5, 1, "", "erf"], [135, 5, 1, "", "erfinv"], [136, 5, 1, "", "eval"], [137, 5, 1, "", "exp"], [138, 5, 1, "", "expand_dims"], [139, 5, 1, "", 
"expm1"], [140, 5, 1, "", "eye"], [159, 5, 1, "", "flatten"], [160, 5, 1, "", "floor"], [161, 5, 1, "", "floor_divide"], [162, 5, 1, "", "full"], [163, 5, 1, "", "gather_mm"], [164, 5, 1, "", "gather_qmm"], [165, 5, 1, "", "grad"], [166, 5, 1, "", "greater"], [167, 5, 1, "", "greater_equal"], [168, 5, 1, "", "hadamard_transform"], [169, 5, 1, "", "identity"], [170, 5, 1, "", "imag"], [171, 5, 1, "", "inner"], [172, 5, 1, "", "isclose"], [173, 5, 1, "", "isfinite"], [174, 5, 1, "", "isinf"], [175, 5, 1, "", "isnan"], [176, 5, 1, "", "isneginf"], [177, 5, 1, "", "isposinf"], [178, 5, 1, "", "issubdtype"], [179, 5, 1, "", "jvp"], [180, 5, 1, "", "left_shift"], [181, 5, 1, "", "less"], [182, 5, 1, "", "less_equal"], [193, 5, 1, "", "linspace"], [194, 5, 1, "", "load"], [195, 5, 1, "", "log"], [196, 5, 1, "", "log10"], [197, 5, 1, "", "log1p"], [198, 5, 1, "", "log2"], [199, 5, 1, "", "logaddexp"], [200, 5, 1, "", "logical_and"], [201, 5, 1, "", "logical_not"], [202, 5, 1, "", "logical_or"], [203, 5, 1, "", "logsumexp"], [204, 5, 1, "", "matmul"], [205, 5, 1, "", "max"], [206, 5, 1, "", "maximum"], [207, 5, 1, "", "mean"], [208, 5, 1, "", "meshgrid"], [221, 5, 1, "", "min"], [222, 5, 1, "", "minimum"], [223, 5, 1, "", "moveaxis"], [224, 5, 1, "", "multiply"], [225, 5, 1, "", "nan_to_num"], [226, 5, 1, "", "negative"], [227, 5, 1, "", "new_stream"], [228, 5, 1, "", "not_equal"], [229, 5, 1, "", "ones"], [230, 5, 1, "", "ones_like"], [231, 5, 1, "", "outer"], [232, 5, 1, "", "pad"], [233, 5, 1, "", "partition"], [234, 5, 1, "", "power"], [235, 5, 1, "", "prod"], [236, 5, 1, "", "put_along_axis"], [237, 5, 1, "", "quantize"], [238, 5, 1, "", "quantized_matmul"], [239, 5, 1, "", "radians"], [253, 5, 1, "", "real"], [254, 5, 1, "", "reciprocal"], [255, 5, 1, "", "remainder"], [256, 5, 1, "", "repeat"], [257, 5, 1, "", "reshape"], [258, 5, 1, "", "right_shift"], [259, 5, 1, "", "roll"], [260, 5, 1, "", "round"], [261, 5, 1, "", "rsqrt"], [262, 5, 1, "", "save"], [263, 5, 1, 
"", "save_gguf"], [264, 5, 1, "", "save_safetensors"], [265, 5, 1, "", "savez"], [266, 5, 1, "", "savez_compressed"], [267, 5, 1, "", "set_default_device"], [268, 5, 1, "", "set_default_stream"], [269, 5, 1, "", "sigmoid"], [270, 5, 1, "", "sign"], [271, 5, 1, "", "sin"], [272, 5, 1, "", "sinh"], [273, 5, 1, "", "softmax"], [274, 5, 1, "", "sort"], [275, 5, 1, "", "split"], [276, 5, 1, "", "sqrt"], [277, 5, 1, "", "square"], [278, 5, 1, "", "squeeze"], [279, 5, 1, "", "stack"], [280, 5, 1, "", "std"], [281, 5, 1, "", "stop_gradient"], [282, 5, 1, "", "stream"], [283, 5, 1, "", "subtract"], [284, 5, 1, "", "sum"], [285, 5, 1, "", "swapaxes"], [286, 5, 1, "", "synchronize"], [287, 5, 1, "", "take"], [288, 5, 1, "", "take_along_axis"], [289, 5, 1, "", "tan"], [290, 5, 1, "", "tanh"], [291, 5, 1, "", "tensordot"], [292, 5, 1, "", "tile"], [293, 5, 1, "", "topk"], [294, 5, 1, "", "trace"], [295, 5, 1, "", "transpose"], [296, 5, 1, "", "tri"], [297, 5, 1, "", "tril"], [298, 5, 1, "", "triu"], [299, 5, 1, "", "value_and_grad"], [300, 5, 1, "", "var"], [301, 5, 1, "", "view"], [302, 5, 1, "", "vjp"], [303, 5, 1, "", "vmap"], [304, 5, 1, "", "where"], [305, 5, 1, "", "zeros"], [306, 5, 1, "", "zeros_like"]], "mlx.core.Device": [[9, 4, 1, "", "__init__"]], "mlx.core.Dtype": [[10, 4, 1, "", "__init__"]], "mlx.core.DtypeCategory": [[11, 4, 1, "", "__init__"]], "mlx.core.Stream": [[315, 4, 1, "", "__init__"]], "mlx.core.array": [[31, 6, 1, "", "T"], [30, 4, 1, "", "__init__"], [32, 4, 1, "", "abs"], [33, 4, 1, "", "all"], [34, 4, 1, "", "any"], [35, 4, 1, "", "argmax"], [36, 4, 1, "", "argmin"], [37, 4, 1, "", "astype"], [38, 6, 1, "", "at"], [39, 4, 1, "", "conj"], [40, 4, 1, "", "cos"], [41, 4, 1, "", "cummax"], [42, 4, 1, "", "cummin"], [43, 4, 1, "", "cumprod"], [44, 4, 1, "", "cumsum"], [45, 4, 1, "", "diag"], [46, 4, 1, "", "diagonal"], [47, 6, 1, "", "dtype"], [48, 4, 1, "", "exp"], [49, 4, 1, "", "flatten"], [50, 4, 1, "", "item"], [51, 6, 1, "", "itemsize"], [52, 4, 1, 
"", "log"], [53, 4, 1, "", "log10"], [54, 4, 1, "", "log1p"], [55, 4, 1, "", "log2"], [56, 4, 1, "", "logsumexp"], [57, 4, 1, "", "max"], [58, 4, 1, "", "mean"], [59, 4, 1, "", "min"], [60, 4, 1, "", "moveaxis"], [61, 6, 1, "", "nbytes"], [62, 6, 1, "", "ndim"], [63, 4, 1, "", "prod"], [64, 4, 1, "", "reciprocal"], [65, 4, 1, "", "reshape"], [66, 4, 1, "", "round"], [67, 4, 1, "", "rsqrt"], [68, 6, 1, "", "shape"], [69, 4, 1, "", "sin"], [70, 6, 1, "", "size"], [71, 4, 1, "", "split"], [72, 4, 1, "", "sqrt"], [73, 4, 1, "", "square"], [74, 4, 1, "", "squeeze"], [75, 4, 1, "", "std"], [76, 4, 1, "", "sum"], [77, 4, 1, "", "swapaxes"], [78, 4, 1, "", "tolist"], [79, 4, 1, "", "transpose"], [80, 4, 1, "", "var"], [81, 4, 1, "", "view"]], "mlx.core.custom_function": [[112, 4, 1, "", "__init__"]], "mlx.core.distributed": [[120, 3, 1, "", "Group"], [121, 5, 1, "", "all_gather"], [122, 5, 1, "", "all_sum"], [123, 5, 1, "", "init"], [124, 5, 1, "", "is_available"], [125, 5, 1, "", "recv"], [126, 5, 1, "", "recv_like"], [127, 5, 1, "", "send"]], "mlx.core.distributed.Group": [[120, 4, 1, "", "__init__"]], "mlx.core.fast": [[141, 5, 1, "", "affine_quantize"], [142, 5, 1, "", "layer_norm"], [143, 5, 1, "", "metal_kernel"], [144, 5, 1, "", "rms_norm"], [145, 5, 1, "", "rope"], [146, 5, 1, "", "scaled_dot_product_attention"]], "mlx.core.fft": [[147, 5, 1, "", "fft"], [148, 5, 1, "", "fft2"], [149, 5, 1, "", "fftn"], [150, 5, 1, "", "ifft"], [151, 5, 1, "", "ifft2"], [152, 5, 1, "", "ifftn"], [153, 5, 1, "", "irfft"], [154, 5, 1, "", "irfft2"], [155, 5, 1, "", "irfftn"], [156, 5, 1, "", "rfft"], [157, 5, 1, "", "rfft2"], [158, 5, 1, "", "rfftn"]], "mlx.core.linalg": [[183, 5, 1, "", "cholesky"], [184, 5, 1, "", "cholesky_inv"], [185, 5, 1, "", "cross"], [186, 5, 1, "", "eigh"], [187, 5, 1, "", "eigvalsh"], [188, 5, 1, "", "inv"], [189, 5, 1, "", "norm"], [190, 5, 1, "", "qr"], [191, 5, 1, "", "svd"], [192, 5, 1, "", "tri_inv"]], "mlx.core.metal": [[209, 5, 1, "", "clear_cache"], 
[210, 5, 1, "", "device_info"], [211, 5, 1, "", "get_active_memory"], [212, 5, 1, "", "get_cache_memory"], [213, 5, 1, "", "get_peak_memory"], [214, 5, 1, "", "is_available"], [215, 5, 1, "", "reset_peak_memory"], [216, 5, 1, "", "set_cache_limit"], [217, 5, 1, "", "set_memory_limit"], [218, 5, 1, "", "set_wired_limit"], [219, 5, 1, "", "start_capture"], [220, 5, 1, "", "stop_capture"]], "mlx.core.random": [[240, 5, 1, "", "bernoulli"], [241, 5, 1, "", "categorical"], [242, 5, 1, "", "gumbel"], [243, 5, 1, "", "key"], [244, 5, 1, "", "laplace"], [245, 5, 1, "", "multivariate_normal"], [246, 5, 1, "", "normal"], [247, 5, 1, "", "permutation"], [248, 5, 1, "", "randint"], [249, 5, 1, "", "seed"], [250, 5, 1, "", "split"], [251, 5, 1, "", "truncated_normal"], [252, 5, 1, "", "uniform"]], "mlx.nn": [[325, 3, 1, "", "ALiBi"], [326, 3, 1, "", "AvgPool1d"], [327, 3, 1, "", "AvgPool2d"], [328, 3, 1, "", "BatchNorm"], [329, 3, 1, "", "CELU"], [330, 3, 1, "", "Conv1d"], [331, 3, 1, "", "Conv2d"], [332, 3, 1, "", "Conv3d"], [333, 3, 1, "", "ConvTranspose1d"], [334, 3, 1, "", "ConvTranspose2d"], [335, 3, 1, "", "ConvTranspose3d"], [336, 3, 1, "", "Dropout"], [337, 3, 1, "", "Dropout2d"], [338, 3, 1, "", "Dropout3d"], [339, 3, 1, "", "ELU"], [340, 3, 1, "", "Embedding"], [341, 3, 1, "", "GELU"], [342, 3, 1, "", "GLU"], [343, 3, 1, "", "GRU"], [344, 3, 1, "", "GroupNorm"], [345, 3, 1, "", "HardShrink"], [346, 3, 1, "", "HardTanh"], [347, 3, 1, "", "Hardswish"], [348, 3, 1, "", "InstanceNorm"], [349, 3, 1, "", "LSTM"], [350, 3, 1, "", "LayerNorm"], [351, 3, 1, "", "LeakyReLU"], [352, 3, 1, "", "Linear"], [353, 3, 1, "", "LogSigmoid"], [354, 3, 1, "", "LogSoftmax"], [355, 3, 1, "", "MaxPool1d"], [356, 3, 1, "", "MaxPool2d"], [357, 3, 1, "", "Mish"], [452, 3, 1, "", "Module"], [378, 3, 1, "", "MultiHeadAttention"], [379, 3, 1, "", "PReLU"], [380, 3, 1, "", "QuantizedEmbedding"], [381, 3, 1, "", "QuantizedLinear"], [382, 3, 1, "", "RMSNorm"], [383, 3, 1, "", "RNN"], [384, 3, 1, "", 
"ReLU"], [385, 3, 1, "", "ReLU6"], [386, 3, 1, "", "RoPE"], [387, 3, 1, "", "SELU"], [388, 3, 1, "", "Sequential"], [389, 3, 1, "", "SiLU"], [390, 3, 1, "", "Sigmoid"], [391, 3, 1, "", "SinusoidalPositionalEncoding"], [392, 3, 1, "", "Softmax"], [393, 3, 1, "", "Softmin"], [394, 3, 1, "", "Softplus"], [395, 3, 1, "", "Softshrink"], [396, 3, 1, "", "Softsign"], [397, 3, 1, "", "Step"], [398, 3, 1, "", "Tanh"], [399, 3, 1, "", "Transformer"], [400, 3, 1, "", "Upsample"], [409, 3, 1, "", "celu"], [410, 3, 1, "", "elu"], [411, 3, 1, "", "gelu"], [412, 3, 1, "", "gelu_approx"], [413, 3, 1, "", "gelu_fast_approx"], [414, 3, 1, "", "glu"], [415, 3, 1, "", "hard_shrink"], [416, 3, 1, "", "hard_tanh"], [417, 3, 1, "", "hardswish"], [418, 3, 1, "", "leaky_relu"], [419, 3, 1, "", "log_sigmoid"], [420, 3, 1, "", "log_softmax"], [435, 3, 1, "", "mish"], [436, 3, 1, "", "prelu"], [307, 5, 1, "", "quantize"], [437, 3, 1, "", "relu"], [438, 3, 1, "", "relu6"], [439, 3, 1, "", "selu"], [440, 3, 1, "", "sigmoid"], [441, 3, 1, "", "silu"], [442, 3, 1, "", "softmax"], [443, 3, 1, "", "softmin"], [444, 3, 1, "", "softplus"], [445, 3, 1, "", "softshrink"], [446, 3, 1, "", "step"], [447, 3, 1, "", "tanh"], [308, 5, 1, "", "value_and_grad"]], "mlx.nn.Module": [[358, 4, 1, "", "apply"], [359, 4, 1, "", "apply_to_modules"], [360, 4, 1, "", "children"], [361, 4, 1, "", "eval"], [362, 4, 1, "", "filter_and_map"], [363, 4, 1, "", "freeze"], [364, 4, 1, "", "leaf_modules"], [365, 4, 1, "", "load_weights"], [366, 4, 1, "", "modules"], [367, 4, 1, "", "named_modules"], [368, 4, 1, "", "parameters"], [369, 4, 1, "", "save_weights"], [370, 4, 1, "", "set_dtype"], [371, 6, 1, "", "state"], [372, 4, 1, "", "train"], [373, 4, 1, "", "trainable_parameters"], [374, 6, 1, "", "training"], [375, 4, 1, "", "unfreeze"], [376, 4, 1, "", "update"], [377, 4, 1, "", "update_modules"]], "mlx.nn.init": [[401, 5, 1, "", "constant"], [402, 5, 1, "", "glorot_normal"], [403, 5, 1, "", "glorot_uniform"], [404, 5, 1, 
"", "he_normal"], [405, 5, 1, "", "he_uniform"], [406, 5, 1, "", "identity"], [407, 5, 1, "", "normal"], [408, 5, 1, "", "uniform"]], "mlx.nn.losses": [[421, 3, 1, "", "binary_cross_entropy"], [422, 3, 1, "", "cosine_similarity_loss"], [423, 3, 1, "", "cross_entropy"], [424, 3, 1, "", "gaussian_nll_loss"], [425, 3, 1, "", "hinge_loss"], [426, 3, 1, "", "huber_loss"], [427, 3, 1, "", "kl_div_loss"], [428, 3, 1, "", "l1_loss"], [429, 3, 1, "", "log_cosh_loss"], [430, 3, 1, "", "margin_ranking_loss"], [431, 3, 1, "", "mse_loss"], [432, 3, 1, "", "nll_loss"], [433, 3, 1, "", "smooth_l1_loss"], [434, 3, 1, "", "triplet_loss"]], "mlx.optimizers": [[455, 3, 1, "", "AdaDelta"], [456, 3, 1, "", "Adafactor"], [457, 3, 1, "", "Adagrad"], [458, 3, 1, "", "Adam"], [459, 3, 1, "", "AdamW"], [460, 3, 1, "", "Adamax"], [461, 3, 1, "", "Lion"], [474, 3, 1, "", "Optimizer"], [466, 3, 1, "", "RMSprop"], [467, 3, 1, "", "SGD"], [309, 5, 1, "", "clip_grad_norm"], [468, 5, 1, "", "cosine_decay"], [469, 5, 1, "", "exponential_decay"], [470, 5, 1, "", "join_schedules"], [471, 5, 1, "", "linear_schedule"], [472, 5, 1, "", "step_decay"]], "mlx.optimizers.Optimizer": [[462, 4, 1, "", "apply_gradients"], [463, 4, 1, "", "init"], [464, 6, 1, "", "state"], [465, 4, 1, "", "update"]], "mlx.utils": [[310, 5, 1, "", "tree_flatten"], [311, 5, 1, "", "tree_map"], [312, 5, 1, "", "tree_map_with_path"], [313, 5, 1, "", "tree_reduce"], [314, 5, 1, "", "tree_unflatten"]]}, "objnames": {"0": ["cpp", "function", "C++ function"], "1": ["cpp", "functionParam", "C++ function parameter"], "2": ["cpp", "templateParam", "C++ template parameter"], "3": ["py", "class", "Python class"], "4": ["py", "method", "Python method"], "5": ["py", "function", "Python function"], "6": ["py", "property", "Python property"]}, "objtypes": {"0": "cpp:function", "1": "cpp:functionParam", "2": "cpp:templateParam", "3": "py:class", "4": "py:method", "5": "py:function", "6": "py:property"}, "terms": {"": [0, 1, 2, 4, 5, 6, 47, 51, 
62, 94, 114, 116, 141, 148, 149, 151, 152, 154, 155, 157, 158, 165, 184, 189, 191, 194, 207, 231, 237, 241, 260, 263, 264, 280, 282, 299, 300, 301, 303, 308, 324, 327, 343, 349, 356, 362, 363, 365, 369, 370, 371, 375, 383, 454, 463, 464, 476, 479, 481, 484, 485, 486, 487], "0": [0, 1, 2, 4, 5, 6, 8, 9, 14, 18, 38, 45, 46, 49, 66, 71, 75, 80, 83, 95, 98, 99, 100, 101, 102, 103, 104, 117, 118, 140, 143, 146, 159, 163, 165, 186, 188, 189, 190, 192, 209, 216, 218, 225, 232, 240, 244, 246, 247, 252, 256, 260, 275, 279, 280, 294, 296, 297, 298, 299, 300, 303, 309, 310, 312, 313, 324, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 341, 344, 345, 348, 350, 351, 355, 356, 379, 384, 386, 391, 395, 397, 399, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 413, 415, 416, 417, 418, 421, 423, 425, 426, 430, 433, 434, 436, 437, 438, 439, 445, 446, 449, 452, 455, 456, 458, 459, 460, 461, 463, 466, 467, 468, 469, 470, 471, 472, 476, 479, 480, 481, 482, 483, 484, 485, 486], "00005": 4, "0001": 391, "0005": 412, "001": 456, "00364": 4, "01": [4, 351, 418, 459], "0137595": 404, "015": 413, "0184009": 405, "02264": 403, "025": 481, "02765": 404, "0300242": 405, "044715": [341, 412], "0485873": 423, "05": [16, 172, 328, 344, 348, 350, 382], "0507": 439, "05202": 5, "06": [424, 434, 455], "0638": 430, "06450": 350, "0645099": 407, "06561": 469, "06675": 461, "07467": 382, "08": [16, 172, 422, 457, 458, 459, 460, 466], "08022": 348, "081": 472, "08415": 413, "08494": 344, "08619": 405, "08681": [357, 435], "09864": 5, "0999938": 470, "0999961": 468, "0f": 0, "1": [0, 1, 2, 3, 5, 6, 14, 18, 28, 29, 38, 46, 49, 98, 99, 100, 101, 102, 103, 104, 117, 118, 139, 143, 146, 147, 148, 150, 151, 153, 154, 155, 156, 157, 158, 159, 168, 171, 178, 184, 185, 186, 187, 189, 190, 204, 208, 217, 231, 233, 237, 241, 244, 245, 246, 252, 269, 274, 287, 293, 294, 299, 309, 312, 313, 317, 324, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 341, 342, 343, 
344, 348, 349, 350, 352, 355, 356, 379, 382, 383, 386, 390, 391, 397, 400, 402, 403, 404, 405, 406, 407, 408, 409, 410, 412, 413, 414, 416, 419, 420, 421, 422, 423, 424, 425, 426, 427, 429, 430, 432, 433, 434, 439, 440, 442, 443, 444, 446, 449, 452, 454, 455, 456, 457, 458, 459, 460, 461, 463, 466, 467, 468, 469, 470, 471, 472, 479, 480, 481, 482, 484, 485, 486, 487], "10": [0, 3, 5, 6, 196, 260, 265, 311, 324, 365, 449, 470, 472, 479, 480, 482], "100": [2, 4, 5, 421, 471, 479, 481, 483, 487], "1000": [468, 479], "10000": 386, "101": 471, "1024": [1, 5], "105361": 421, "109": 2, "10_000": 4, "10x": 461, "11": 189, "114": 2, "12": [5, 168, 470], "1212": 455, "12451": 403, "128": [265, 324], "13": 8, "14": 8, "15": [1, 8, 189, 218, 313, 479], "150594": 402, "15268": 404, "16": [1, 143, 317, 326, 348, 355, 358, 452], "1606": 413, "1607": [348, 350], "16384": 168, "16506": 405, "17": 8, "177208": 404, "1803": 344, "1908": [357, 435], "1910": 382, "191107": 402, "1985": 189, "1_000": 4, "1d": [0, 98, 102, 105, 263, 288], "1e": [0, 4, 6, 16, 172, 328, 344, 348, 350, 351, 382, 422, 424, 434, 454, 455, 456, 457, 458, 459, 460, 463, 466, 468, 469, 470, 471, 472], "1e3": 479, "1st": 237, "2": [0, 1, 2, 4, 5, 6, 38, 99, 103, 117, 118, 134, 148, 151, 153, 154, 155, 156, 157, 158, 159, 168, 178, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 198, 204, 237, 245, 250, 291, 294, 296, 297, 298, 309, 313, 317, 324, 326, 327, 331, 334, 341, 351, 355, 356, 382, 391, 400, 401, 402, 403, 404, 405, 406, 407, 408, 412, 423, 424, 426, 433, 434, 449, 452, 454, 455, 457, 458, 459, 463, 466, 479, 480, 481, 482, 483, 484, 485, 486, 487], "20": [168, 189], "200": [5, 470], "2002": 5, "2011": 457, "2012": [455, 466], "2015": [337, 458, 460], "2019": [5, 459], "2020": 5, "2021": 5, "20397": 421, "20_000": 5, "21": [5, 472], "2104": 5, "223144": 421, "223404": 403, "225": 189, "225763": 430, "2302": 461, "23607": [189, 190], "24": 8, "24264": 189, "247": 5, "25": [379, 400], "25211": 404, 
"256": [1, 2, 6, 143], "256995": 430, "28": 168, "2d": [0, 99, 103, 118, 237, 328, 337], "2nd": 237, "2x": 484, "3": [0, 1, 2, 5, 8, 100, 104, 159, 178, 185, 186, 187, 189, 190, 309, 313, 332, 335, 341, 400, 403, 405, 412, 417, 456, 461, 476, 479, 482, 484, 485], "30": 456, "3118": 484, "32": [1, 5, 6, 90, 237, 238, 317, 327, 356, 382, 479], "330": 5, "33333": 400, "348587": 423, "363207": 402, "36788": 479, "379159": 403, "380709": 407, "39": 5, "390": 481, "3d": [0, 2, 100, 104, 328, 338, 400], "3f": [2, 6, 479], "3x": 2, "4": [0, 1, 2, 5, 116, 141, 143, 159, 164, 189, 237, 238, 265, 307, 313, 317, 326, 327, 328, 348, 355, 356, 380, 381, 399, 400, 402, 403, 404, 421, 479, 480, 482, 485, 487], "4096": [479, 481, 487], "40x": 1, "41421": 189, "417497": 408, "42": 314, "437": 5, "44": 5, "447214": 190, "458835": 404, "475": 5, "48095": 402, "4d": [1, 400], "4m": 1, "5": [0, 1, 2, 4, 5, 8, 189, 217, 240, 313, 326, 328, 336, 337, 338, 341, 345, 348, 355, 395, 400, 401, 404, 405, 412, 415, 433, 445, 449, 454, 466, 468, 469, 479, 481, 482], "50": [0, 193], "500": [5, 487], "5000": 2, "510826": 421, "512": [2, 3, 5, 399, 487], "534422": 407, "539245": 421, "53947": 402, "55": 1, "5701": 455, "573409": 430, "57771": 190, "579": 5, "5f": 4, "6": [1, 2, 5, 189, 265, 385, 399, 403, 412, 413, 417, 424, 434, 438, 466, 479, 482, 485], "61278": 402, "617261": 408, "628": 5, "633": 5, "64": [0, 1, 90, 116, 141, 164, 237, 238, 307, 317, 380, 381], "64331": 405, "666329": 405, "66667": 400, "67326": 439, "676": 1, "690": 5, "6967": 404, "7": [2, 5, 189, 237, 482], "702": [341, 413], "707107": 186, "71828": 479, "74166": 189, "74597": 189, "75": 400, "75596": 430, "75787": 404, "765166": 430, "773433": 430, "776856": 403, "793615": 405, "79854": 405, "7b": 5, "7m": 1, "8": [0, 1, 2, 5, 8, 189, 237, 317, 327, 348, 356, 399, 422, 455, 456, 457, 458, 459, 460, 466, 479, 482, 485, 487], "8192": [5, 168], "84804": 189, "863726": 408, "883935": 408, "890597": 403, "894427": 190, "89613": 
402, "8gb": 5, "8x": 1, "9": [8, 189, 423, 455, 458, 459, 460, 461, 463, 469, 472, 484], "90041": 403, "912766": 403, "916291": 421, "95": 6, "982273": 407, "99": [461, 466], "995016": 402, "999": [458, 459, 460], "A": [0, 2, 5, 7, 8, 9, 68, 82, 94, 142, 143, 144, 146, 165, 178, 179, 184, 186, 187, 189, 190, 191, 194, 203, 204, 205, 210, 221, 237, 240, 241, 242, 244, 245, 246, 247, 248, 251, 252, 275, 279, 282, 299, 302, 303, 307, 308, 309, 310, 311, 312, 313, 314, 315, 324, 328, 337, 343, 344, 348, 350, 362, 366, 367, 370, 376, 377, 382, 388, 391, 399, 402, 403, 405, 413, 434, 435, 452, 454, 458, 460, 462, 463, 465, 470, 479, 480, 481, 483, 484], "AS": 163, "And": [5, 400], "As": [6, 38, 287, 324], "At": 93, "But": 487, "By": [5, 307, 370, 421, 481, 484], "For": [0, 1, 2, 5, 8, 38, 146, 163, 178, 189, 237, 314, 324, 328, 337, 341, 358, 363, 372, 375, 381, 386, 391, 400, 402, 403, 404, 405, 421, 449, 454, 476, 479, 480, 481, 482, 483, 484, 485, 486, 487], "If": [0, 1, 2, 5, 8, 15, 16, 17, 18, 26, 27, 28, 29, 78, 82, 83, 93, 95, 105, 108, 109, 110, 111, 117, 118, 121, 122, 123, 125, 126, 127, 136, 142, 145, 156, 157, 158, 161, 162, 165, 172, 183, 184, 185, 189, 194, 203, 204, 205, 207, 208, 216, 217, 221, 225, 229, 232, 233, 235, 236, 241, 245, 247, 256, 259, 273, 274, 275, 280, 284, 286, 287, 288, 291, 293, 294, 299, 300, 303, 305, 307, 311, 313, 328, 330, 331, 332, 333, 334, 335, 344, 350, 352, 363, 365, 375, 381, 383, 386, 388, 391, 400, 421, 423, 434, 456, 479, 480, 481, 483, 486, 487, 488], "In": [0, 1, 2, 5, 6, 38, 204, 237, 311, 324, 337, 344, 452, 455, 457, 458, 460, 461, 462, 478, 479, 480, 481, 483, 486, 487], "It": [2, 5, 8, 126, 165, 268, 299, 309, 313, 324, 377, 381, 462, 474, 484, 486], "Its": 324, "No": [2, 5, 186, 187], "Not": [94, 228, 479], "ON": [3, 8], "Of": 481, "On": [1, 479, 481, 483], "One": [147, 150, 156, 232, 261, 479, 481], "THE": 8, "That": 5, "The": [0, 1, 2, 3, 5, 6, 7, 8, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
29, 37, 47, 51, 61, 62, 68, 78, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 179, 180, 181, 182, 185, 186, 187, 189, 190, 191, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 212, 213, 216, 217, 218, 219, 221, 222, 223, 224, 226, 228, 229, 230, 231, 232, 233, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 263, 264, 269, 270, 271, 272, 273, 274, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 317, 319, 326, 327, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 340, 342, 343, 344, 348, 349, 350, 352, 355, 356, 358, 359, 363, 365, 369, 370, 371, 372, 375, 376, 377, 378, 380, 381, 382, 383, 386, 388, 391, 397, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 414, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 446, 449, 452, 454, 455, 456, 457, 458, 459, 460, 461, 464, 466, 467, 468, 471, 474, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488], "Then": [4, 8], "There": [1, 2, 324, 400, 479], "These": [1, 2, 94, 236, 288, 423, 487], "To": [0, 2, 3, 4, 5, 6, 8, 216, 324, 449, 454, 479, 480, 481, 485], "With": 2, "_": [1, 3, 4, 5, 312, 324, 468, 469, 470, 471, 472, 476, 479, 483, 487], "__call__": [1, 5, 6, 324, 452], "__init__": [2, 5, 6, 9, 10, 11, 30, 112, 120, 315, 324, 452], "__main__": [2, 5], "__name__": [2, 5], "_a": 2, "_ext": 2, "_f": 189, "_in": [402, 403], "_out": [402, 403], "_p": 434, "_size": [326, 327, 355, 356], "_val": 
416, "a1": 163, "a2": 163, "a_": 189, "a_max": [0, 93], "a_min": [0, 93], "a_ndim": 1, "a_shap": 1, "a_strid": 1, "a_view": 484, "ab": [0, 16, 172, 189, 299, 344, 348, 350, 357, 382, 413, 435, 479], "abil": 480, "abl": [2, 237], "about": [1, 2, 5, 6, 131, 210, 483, 487], "abov": [1, 2, 5, 237, 297, 324, 400, 459, 480, 481, 482, 483, 487], "absolut": [0, 12, 16, 172, 412, 413, 433], "acc": 313, "acceler": [2, 328], "access": [0, 5, 50, 324, 452, 463, 480, 483, 487], "accord": [0, 242, 304, 307, 378, 402, 403, 404, 405], "accordingli": 2, "accross": 8, "accumul": [313, 382], "accuraci": 6, "accustom": 5, "achiev": [324, 480], "across": [1, 2, 344, 480], "act": [2, 429], "action": 324, "activ": [2, 8, 211, 337, 397, 399, 415, 435, 445, 446, 448, 479], "actual": [5, 18, 365, 452, 483], "ad": [0, 1, 2, 4, 8, 142, 348, 452, 455, 456, 457, 458, 459, 460, 466, 480, 483, 486], "adadelta": 454, "adafactor": 454, "adagrad": 454, "adam": [454, 460, 461, 470, 471], "adamax": 454, "adamw": [454, 461], "adapt": [455, 456, 457, 480], "add": [0, 1, 2, 3, 5, 14, 38, 138, 199, 232, 237, 330, 331, 332, 333, 334, 335, 481, 487], "add_argu": 5, "add_depend": 2, "add_librari": 2, "addit": [0, 2, 5, 8, 13, 14, 142, 144, 146, 194, 328, 344, 350, 378, 382, 452, 481], "addmm": 0, "address": 2, "adjac": 337, "advanc": [5, 479], "advantag": 487, "advis": 484, "affin": [328, 344, 348, 350, 352, 381], "after": [2, 5, 6, 28, 159, 161, 164, 209, 233, 237, 328, 344, 350, 358, 359, 363, 365, 372, 375, 376, 377, 378, 399, 433, 479, 487], "after_1": 232, "after_2": 232, "after_i": 232, "after_n": 232, "afternoon": 5, "again": [5, 8, 324, 479], "against": 0, "aggreg": 378, "ago": 5, "ai": 112, "ainv": [188, 192], "albeit": 487, "algebra": 7, "algorithm": [400, 461], "alia": [96, 97, 341], "alibi": 324, "align": [184, 237, 327, 343, 349, 356], "align_corn": 400, "all": [0, 1, 2, 3, 6, 8, 16, 28, 38, 84, 85, 86, 94, 99, 100, 101, 103, 104, 112, 121, 122, 123, 140, 149, 152, 155, 158, 163, 164, 191, 204, 
232, 233, 259, 278, 307, 324, 358, 359, 363, 366, 367, 368, 373, 375, 378, 391, 399, 400, 449, 452, 474, 476, 479, 482, 483, 485, 488], "all_avg": 480, "all_reduce_grad": 480, "all_sum": 480, "allclos": [0, 1, 143], "alloc": [2, 212, 216, 217, 452], "allow": [0, 1, 2, 178, 309, 324, 377, 452, 474, 480, 482, 485], "almost": 5, "alon": [2, 484], "along": [0, 2, 26, 27, 94, 95, 108, 109, 110, 111, 121, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 163, 164, 168, 185, 189, 236, 247, 256, 259, 273, 275, 279, 287, 288, 291, 292, 293, 294, 301, 324, 342, 383, 414], "alpha": [0, 2, 14, 237, 329, 339, 409, 410, 434, 436, 439, 459, 466], "alpha_": 2, "alreadi": [2, 3, 5, 480], "also": [0, 1, 2, 5, 6, 7, 8, 11, 13, 87, 88, 89, 119, 128, 129, 133, 149, 152, 155, 158, 166, 167, 180, 181, 182, 199, 206, 222, 224, 228, 234, 237, 255, 258, 283, 307, 308, 319, 324, 362, 376, 378, 380, 381, 389, 411, 439, 441, 448, 454, 479, 480, 481, 482, 483, 484, 485, 488], "altern": 476, "alwai": [1, 83, 211, 310, 481], "am": 5, "among": 2, "amount": [5, 213, 326, 355], "amus": 5, "an": [0, 1, 2, 3, 5, 6, 8, 10, 15, 17, 30, 84, 85, 86, 91, 98, 99, 100, 101, 102, 103, 104, 120, 125, 126, 127, 136, 140, 142, 146, 159, 162, 169, 173, 183, 189, 194, 217, 218, 223, 229, 230, 232, 235, 236, 237, 238, 247, 256, 257, 259, 260, 275, 278, 285, 287, 288, 291, 292, 296, 303, 305, 306, 310, 311, 312, 313, 324, 326, 327, 336, 341, 344, 349, 350, 352, 355, 356, 358, 378, 379, 381, 383, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 412, 436, 449, 454, 455, 465, 469, 474, 476, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488], "anaconda": 480, "anchor": 434, "angl": [115, 239, 351], "angular": [145, 386], "ani": [0, 1, 2, 5, 7, 18, 94, 310, 311, 312, 313, 314, 324, 341, 358, 359, 362, 371, 381, 399, 400, 449, 471, 478, 479, 481, 483, 485, 486, 487], "anonym": 479, "anoth": [0, 93, 178, 204, 283, 304, 317, 324, 358, 479, 481, 482, 487], "anwywher": 8, "anyhow": 5, "anymor": 5, "anyth": [5, 
299, 483], "anytim": 483, "api": [1, 2, 341, 480, 481], "app": 8, "append": [5, 204, 479, 483], "appl": [2, 5, 7, 8, 487], "appli": [0, 38, 145, 146, 163, 191, 311, 312, 313, 324, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 337, 338, 339, 341, 342, 344, 345, 346, 347, 348, 350, 351, 352, 353, 354, 355, 356, 357, 359, 372, 379, 381, 382, 383, 384, 385, 387, 389, 390, 392, 393, 394, 395, 396, 397, 398, 400, 409, 410, 411, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 449, 462, 465, 471, 474, 479, 480], "applic": [3, 8], "apply_fn": 359, "apply_gradi": 454, "apply_to_modul": [324, 363], "approach": [429, 481], "appropri": [2, 479], "approx": 341, "approxim": [16, 341, 411, 412, 413], "ar": [0, 1, 2, 4, 5, 6, 7, 8, 16, 18, 82, 90, 91, 93, 94, 101, 105, 112, 118, 125, 126, 136, 140, 143, 148, 149, 151, 152, 154, 155, 157, 158, 159, 164, 165, 172, 173, 174, 175, 176, 177, 178, 179, 186, 187, 189, 190, 194, 204, 217, 231, 232, 233, 237, 238, 240, 241, 242, 247, 248, 251, 252, 259, 265, 266, 278, 279, 287, 299, 302, 303, 307, 310, 311, 317, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 344, 348, 350, 352, 365, 378, 381, 400, 421, 423, 424, 448, 452, 454, 461, 463, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487], "arang": [0, 1, 189, 247, 317, 400, 482, 484], "arbitrari": [310, 452], "arbitrarili": [1, 94, 324, 478, 481, 485], "arc": 0, "arcco": 0, "arccosh": 0, "architectur": [5, 8, 210, 324, 377, 487], "archiv": 486, "arcsin": 0, "arcsinh": 0, "arctan": 0, "arctan2": 0, "arctanh": 0, "arg": [2, 5, 10, 18, 120, 136, 265, 266], "arg1": 178, "arg2": 178, "argmax": [0, 6], "argmin": 0, "argnam": [165, 299], "argnum": [2, 165, 299, 481], "argpars": 5, "argpartit": 0, "argsort": 0, "argument": [1, 31, 65, 79, 94, 136, 165, 299, 311, 312, 313, 324, 400, 476, 480, 481, 486, 487, 488], "argumentpars": 5, "ari": [84, 85, 86], "aris": 484, "arm": 8, 
"arm64": 8, "around": 5, "arr": [0, 262, 482], "arr_0": 486, "arrai": [0, 1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 309, 324, 328, 349, 358, 365, 368, 373, 379, 400, 401, 402, 403, 404, 405, 406, 407, 408, 414, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 446, 449, 452, 455, 456, 457, 458, 459, 460, 461, 466, 467, 468, 469, 470, 471, 472, 479, 480, 481, 483, 484, 485, 487], "array_equ": [0, 16, 172], "arrayfir": 7, "arxiv": [5, 344, 348, 350, 357, 382, 413, 435, 455, 461], "as_strid": 0, "ascend": [186, 187], "ask": [5, 480], "assert": [1, 2, 143], "assign": [0, 2, 38, 452], "associ": [2, 265, 266, 483], "assum": [0, 2, 5, 90, 185, 186, 187, 190, 311, 324, 326, 327, 344, 355, 356], "astyp": [0, 1, 2, 5, 143, 358, 484], "atleast": 0, "atleast_1d": 0, "atleast_2d": 0, "atleast_3d": 0, "atol": [0, 16, 172], "atom": [1, 143], "atomic_fetch_add_explicit": 1, "atomic_output": [1, 143], "attach": 2, "attempt": 94, "attend": 378, 
"attent": [146, 363, 378, 391, 399], "attention_norm": 5, "attribut": [1, 9, 10, 11, 30, 315, 371, 452, 474], "audio": 400, "auto": [0, 2, 8], "autom": 481, "automat": [1, 2, 7, 143, 194, 480, 485, 486, 487], "autoregress": 5, "avail": [2, 4, 5, 6, 8, 10, 124, 214, 319, 487], "averag": [326, 327, 455, 456, 458, 459, 460, 480], "avgpool1d": 324, "avgpool2d": 324, "avoid": [1, 2, 370, 479], "awai": [2, 5], "awar": [479, 483], "ax": [0, 2, 15, 17, 26, 27, 79, 112, 138, 148, 149, 151, 152, 154, 155, 157, 158, 159, 171, 189, 203, 205, 207, 221, 232, 235, 259, 273, 278, 280, 284, 285, 291, 295, 300, 481], "axes_a": 0, "axes_b": 0, "axi": [0, 2, 5, 6, 15, 17, 26, 27, 28, 29, 33, 34, 35, 36, 41, 42, 43, 44, 56, 57, 58, 59, 63, 71, 74, 75, 76, 80, 95, 108, 109, 110, 111, 118, 121, 138, 142, 144, 147, 150, 153, 154, 155, 156, 157, 158, 159, 168, 185, 189, 203, 205, 207, 221, 223, 232, 233, 235, 236, 241, 247, 256, 259, 273, 274, 275, 278, 279, 280, 284, 285, 287, 288, 292, 293, 294, 295, 300, 301, 303, 326, 327, 342, 355, 356, 383, 414, 420, 422, 423, 427, 432, 434, 442, 443, 482], "axis1": [0, 46, 77, 118, 285, 294], "axis2": [0, 46, 77, 118, 285, 294], "axpbi": 2, "axpby_": 2, "axpby_gener": 2, "axpby_general_": 2, "axpby_impl": 2, "axpby_impl_acceler": 2, "b": [0, 1, 2, 3, 5, 13, 14, 16, 24, 82, 87, 88, 89, 90, 128, 129, 133, 143, 161, 163, 166, 167, 171, 172, 180, 181, 182, 185, 189, 199, 200, 202, 204, 206, 222, 224, 228, 231, 234, 237, 244, 255, 258, 283, 291, 299, 312, 313, 342, 352, 383, 400, 414, 481, 482, 483, 484, 485, 486, 487], "b1": 163, "b2": 163, "b_": [343, 349], "b_stride": 1, "ba": [458, 460], "back": [5, 112, 214, 484], "backend": [1, 8, 123, 124], "backward": [1, 479, 481], "bad": 483, "balanc": 429, "baltimor": 189, "bandwidth": [479, 480], "bar": 480, "base": [0, 2, 145, 189, 196, 198, 234, 386, 399, 452, 454, 460, 474, 476, 479, 482], "base_idx": 1, "basi": 474, "basic": [4, 260, 481], "batch": [5, 14, 90, 163, 164, 204, 245, 328, 330, 331, 332, 333, 
334, 335, 337, 338, 343, 349, 378, 383, 400, 483], "batch_idx": 1, "batch_iter": [6, 454], "batch_siz": [6, 454], "batchnorm": 324, "becaus": [5, 211, 324, 483], "been": [0, 2, 5, 212, 483], "befor": [1, 2, 5, 8, 28, 143, 233, 362, 399, 463, 480, 482, 483], "before_1": 232, "before_2": 232, "before_i": 232, "before_n": 232, "beforehand": 231, "beggin": 259, "begin": [83, 184, 213, 237, 327, 343, 349, 356, 397, 415, 426, 433, 439, 445, 446], "behav": 112, "behavior": [245, 429, 482, 483], "behaviour": [112, 183, 184], "behind": 481, "being": [281, 324], "bell": 2, "below": [2, 8, 189, 296, 298, 317, 400, 483], "bench": 2, "benchmark": [2, 479], "benefici": [337, 338, 483], "best": 480, "beta": [0, 2, 14, 116, 141, 237, 328, 344, 348, 350, 433, 454, 458, 459, 460, 461], "beta_": 2, "beta_1": [456, 458, 459, 460, 461], "beta_2": [458, 459, 460, 461], "better": [481, 487], "between": [0, 2, 7, 93, 159, 399, 422, 425, 426, 429, 470, 480, 483, 484, 487], "beyond": [259, 468, 471], "bfloat16": [2, 11, 168, 317, 484], "bfloat16_t": 2, "bia": [5, 116, 141, 142, 164, 237, 238, 311, 324, 330, 331, 332, 333, 334, 335, 343, 349, 350, 352, 363, 365, 375, 378, 381, 383, 458, 459, 460, 463, 481], "bias": [0, 116, 141, 164, 237, 238, 343, 349, 363, 375, 378], "bicub": 400, "big": [1, 479], "bigger": [5, 456], "bilinear": [1, 400], "binari": [194, 262, 263, 264, 265, 266, 301, 397, 421, 446, 479], "binary_cross_entropi": [324, 479], "bit": [0, 116, 141, 164, 180, 237, 238, 258, 307, 317, 358, 380, 381, 382], "bitwis": [0, 87, 88, 89, 180, 258], "bitwise_and": 0, "bitwise_or": 0, "bitwise_xor": 0, "block": [0, 2, 5, 90, 399], "block_masked_mm": 0, "block_siz": [0, 90], "bn": 328, "bodi": [1, 143], "bool": [0, 1, 2, 15, 16, 17, 26, 27, 33, 34, 35, 36, 41, 42, 43, 44, 56, 57, 58, 59, 63, 75, 76, 78, 80, 82, 94, 101, 108, 109, 110, 111, 123, 124, 143, 145, 164, 172, 178, 183, 184, 189, 192, 194, 203, 205, 207, 208, 214, 217, 221, 235, 238, 280, 284, 300, 328, 330, 331, 332, 333, 334, 
335, 343, 344, 348, 349, 350, 352, 358, 362, 363, 365, 370, 372, 375, 378, 381, 383, 386, 391, 399, 400, 421, 424, 456, 467], "bool_": [11, 317], "boolean": [0, 16, 82, 172, 173, 174, 175, 176, 177, 178, 200, 201, 202, 317, 374, 482], "both": [1, 2, 13, 87, 88, 89, 128, 129, 133, 166, 167, 178, 180, 181, 182, 189, 199, 206, 222, 224, 228, 234, 241, 255, 258, 283, 307, 326, 327, 348, 349, 355, 356, 454, 479, 480, 481, 485, 487], "bottom": 400, "bound": [0, 248, 251, 252, 341, 408, 479, 482, 487], "boundari": 470, "bracket": 5, "brain": 317, "break": 484, "bregler": 337, "broadcast": [0, 2, 13, 16, 87, 88, 89, 91, 93, 128, 129, 133, 162, 166, 167, 172, 180, 181, 182, 199, 204, 206, 222, 224, 228, 234, 236, 240, 241, 245, 251, 252, 255, 258, 283, 288, 304, 378], "broadcast_arrai": [0, 2], "broadcast_to": 0, "broadcasted_input": 2, "brought": 7, "btl_tcp_link": 480, "buffer": [1, 2, 211, 484], "bui": 5, "build": [3, 5, 7, 404, 452, 479], "build_ext": [2, 8], "build_shared_lib": [2, 8], "built": [1, 2, 8, 483], "bundl": 5, "byte": [51, 61, 211, 212, 213, 216, 217, 218, 317], "c": [0, 1, 2, 5, 14, 189, 326, 327, 328, 330, 331, 332, 333, 334, 335, 337, 338, 348, 349, 355, 356, 484, 485, 487], "c_": [349, 461], "c_in": [98, 99, 100, 101, 102, 103, 104], "c_j": [326, 327, 355, 356], "c_out": [98, 99, 100, 101, 102, 103, 104], "c_pad": 1, "c_t": [349, 461], "cach": [5, 8, 209, 211, 212, 216, 479], "calcul": [189, 421, 424, 430, 456], "call": [2, 3, 5, 6, 31, 126, 161, 209, 213, 324, 340, 363, 375, 380, 388, 452, 454, 463, 479, 480, 481, 483], "callabl": [94, 112, 143, 165, 179, 299, 302, 303, 307, 308, 310, 311, 312, 313, 358, 359, 362, 370, 383, 388, 399, 401, 402, 403, 404, 405, 406, 407, 408, 455, 456, 457, 458, 459, 460, 461, 466, 467, 468, 469, 470, 471, 472], "can": [1, 2, 3, 5, 7, 8, 13, 18, 65, 79, 83, 87, 88, 89, 94, 118, 119, 120, 128, 129, 133, 136, 166, 167, 180, 181, 182, 189, 199, 206, 218, 222, 224, 228, 234, 240, 241, 248, 251, 252, 255, 258, 263, 283, 294, 
299, 313, 324, 327, 340, 341, 356, 362, 375, 380, 388, 400, 423, 449, 452, 454, 462, 463, 476, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488], "cannot": [5, 93, 482, 484], "captur": [2, 3, 94, 219, 220, 324, 479], "care": [5, 483], "carri": 2, "cartesian": 208, "case": [2, 5, 121, 122, 123, 125, 126, 127, 149, 152, 153, 155, 156, 157, 158, 159, 183, 184, 185, 186, 187, 188, 190, 191, 192, 204, 257, 278, 327, 337, 356, 397, 415, 433, 439, 445, 446, 462, 463, 479, 481, 485, 486, 487, 488], "cast": [2, 37, 156, 157, 158, 194, 358, 370, 484], "caster": 2, "categor": 5, "categori": [11, 178, 317], "catlas_saxpbi": 2, "caus": [324, 479, 483], "causal": 5, "caution": 83, "cd": [3, 8], "cdf": [242, 341, 411], "cdot": [413, 422, 425, 441], "ceil": 0, "ceildiv": 1, "cell": 349, "celu": 324, "certain": [2, 372, 479], "chang": [83, 94, 268, 301, 376, 381, 400, 426, 433, 479, 484], "channel": [1, 98, 99, 100, 101, 102, 103, 104, 328, 330, 331, 332, 333, 334, 335, 337, 338], "channel_idx": 1, "charact": 310, "check": [0, 2, 8, 82, 124, 178, 186, 187, 214, 365, 481, 482], "checklist": 480, "checkout": [3, 479], "checkpoint": [399, 454], "chen": 461, "child": 377, "children": 324, "chip": 8, "choleski": 184, "choos": [5, 145, 386], "chosen": 131, "clamp": 159, "clang": 8, "clariti": 481, "class": [2, 5, 6, 9, 10, 11, 30, 112, 120, 315, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 452, 455, 456, 457, 458, 459, 460, 461, 466, 467, 474], "class_pred": 307, "classif": [404, 405], "classifi": 6, "classmethod": [380, 381], "clear": 209, "click": 8, "clip": 
[0, 309, 421, 456], "clip_threshold": 456, "clipped_grad": 309, "clone": 8, "close": [4, 7, 8, 16, 172], "closer": 311, "cmake": [3, 8], "cmake_arg": 3, "cmake_build_parallel_level": 8, "cmake_build_typ": 8, "cmake_current_list_dir": 2, "cmake_host_system_processor": 8, "cmake_library_output_directori": 2, "cmakebuild": 2, "cmakeextens": 2, "cmakelist": 2, "cmdclass": 2, "co": [0, 2, 112, 391, 481], "code": [1, 143, 479, 480, 483], "coeffici": [2, 455, 456, 458, 459, 460, 461], "col": 296, "col_contigu": 2, "cold": 8, "collect": [2, 311, 312, 478], "column": [2, 140, 169, 186, 237], "com": [8, 480], "combin": [5, 191, 313], "come": [2, 5, 480, 481], "command": [2, 3, 8, 480], "command_buff": 2, "common": [2, 454, 479, 483], "commonli": [6, 376, 449, 479], "commun": [7, 120, 123, 124], "compar": [2, 82, 479], "comparison": [16, 133, 166, 167, 181, 182, 228], "compat": [5, 241, 245, 341, 486], "compil": [0, 3, 7, 8, 119, 132, 143, 480, 481, 483], "compiled_fun": 479, "compiled_grad_fn": 479, "complet": [4, 5, 8, 217, 376, 377, 481, 487], "complex": [2, 96, 97, 154, 155, 156, 157, 158, 170, 186, 187, 253, 310, 317, 324, 377, 479, 481], "complex64": [2, 11, 317], "complex64_t": 2, "complexflo": 11, "compon": [2, 5], "compos": [7, 324, 479, 481, 485], "composit": 485, "compress": 266, "compromis": 5, "comput": [0, 1, 2, 4, 5, 6, 7, 8, 108, 109, 110, 111, 112, 116, 131, 139, 141, 145, 165, 179, 183, 184, 185, 186, 187, 188, 189, 192, 199, 207, 231, 237, 255, 273, 280, 281, 291, 299, 300, 302, 308, 324, 328, 343, 344, 348, 349, 350, 363, 376, 381, 382, 386, 399, 402, 403, 404, 405, 412, 413, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 454, 455, 456, 458, 459, 460, 461, 465, 479, 480, 481, 485, 487], "computation": 483, "compute_encod": 2, "concaten": [0, 5, 121], "concept": 452, "concis": 5, "concret": [2, 343, 349, 352, 383, 483, 487], "conda": [8, 480], "condit": [0, 2, 304, 487], "config": [2, 480], "configu": 454, "configur": [116, 141, 480], 
"confirm": 480, "confus": 6, "conj": 97, "conjug": [0, 96], "connect": 480, "consecut": [145, 237, 386], "consequ": 5, "consid": [5, 16, 82, 172, 310, 311, 312, 344, 478], "consider": 479, "const": [0, 1, 2, 424], "constant": [0, 2, 5, 8, 142, 144, 232, 324, 328, 344, 350, 382, 424, 434, 466, 468, 479, 484], "constant_valu": 232, "constitut": 311, "construct": [0, 2, 6, 45, 117, 162, 229, 292, 305], "consum": 483, "contain": [2, 5, 8, 28, 29, 68, 94, 118, 131, 153, 154, 155, 163, 164, 186, 189, 200, 201, 202, 237, 275, 304, 309, 324, 362, 364, 365, 371, 399, 430, 449, 452, 479, 480, 481], "content": [8, 362, 479], "context": 282, "contigu": [1, 2, 83, 143], "continu": [329, 409, 481], "contract": [0, 131], "contrast": 459, "contribut": 2, "contriv": [481, 487], "control": [0, 351, 476, 483], "conv": 105, "conv1d": [0, 324], "conv2d": [0, 324], "conv3d": [0, 324], "conv_gener": 0, "conv_transpose1d": 0, "conv_transpose2d": 0, "conv_transpose3d": 0, "conveni": [1, 2, 6, 178], "convent": [18, 105, 130, 131, 400, 459], "convers": 7, "convert": [0, 1, 2, 78, 84, 85, 86, 115, 159, 239, 380, 381, 483, 484, 485], "convolut": [0, 98, 99, 100, 101, 102, 103, 104, 105, 330, 331, 332, 333, 334, 335, 337, 338], "convolv": [98, 99, 100, 101, 102, 103, 104], "convtranspose1d": 324, "convtranspose2d": 324, "convtranspose3d": 324, "coordin": [0, 208], "copi": [0, 1, 2, 5, 7, 233, 274, 484], "copy_inplac": 2, "copytyp": 2, "core": [1, 2, 3, 4, 5, 6, 307, 324, 326, 327, 328, 348, 355, 356, 365, 368, 370, 373, 400, 401, 402, 403, 404, 405, 406, 407, 408, 421, 423, 430, 449, 452, 454, 479, 480, 484, 485], "corner": 400, "correct": [2, 8, 458, 459, 460, 482, 483], "correctli": 38, "correl": [101, 337], "correspond": [0, 1, 2, 15, 17, 78, 93, 116, 118, 141, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 186, 203, 205, 221, 235, 284, 291, 303, 311, 481], "cos_first": 391, "cosh": [0, 429], "cosin": [0, 19, 20, 106, 107, 422, 468, 470, 481], "cosine_decai": [454, 470], 
"cosine_similarity_loss": 324, "cost": [8, 456, 480, 483], "costli": 483, "cot": 1, "cot_index": 1, "cotan": [2, 112], "cotang": [1, 2, 112, 302], "could": [5, 324], "count": [324, 470], "counter": 476, "cours": 481, "coursera": 466, "cov": 245, "covari": [245, 328], "cover": 2, "cpp": 2, "cpu": [7, 8, 186, 187, 190, 487], "cpython": 2, "crash": [83, 479], "creat": [0, 2, 5, 8, 83, 123, 140, 169, 282, 324, 452, 454, 470, 479, 482, 484], "create_additive_causal_mask": 5, "criteria": 2, "cross": [6, 101, 421, 423], "cross_entropi": [6, 324], "crowd": 5, "cry": 5, "cubic": 400, "cummax": 0, "cummin": 0, "cumprod": 0, "cumsum": 0, "cumul": [0, 83, 108, 109, 110, 111], "current": [5, 7, 8, 83, 90, 100, 103, 104, 127, 210, 212, 237, 313, 324, 456, 480, 483], "custom": [7, 112, 143, 399], "custom_decod": 399, "custom_encod": 399, "custom_funct": 1, "custom_kernel_myexp_float": 1, "custom_tim": 2, "cvpr": 337, "cycl": 478, "d": [0, 1, 2, 5, 100, 104, 117, 118, 171, 189, 204, 208, 231, 287, 294, 296, 297, 298, 314, 332, 335, 338, 343, 349, 383, 455, 458, 460, 487], "d1": 487, "d2": 487, "d2fdx2": 481, "d_i": 352, "dampen": 467, "darwin": 2, "data": [0, 2, 6, 7, 10, 18, 125, 140, 156, 157, 162, 169, 193, 225, 229, 242, 251, 294, 296, 301, 305, 338, 401, 402, 403, 404, 405, 406, 407, 408, 479, 480, 482, 484], "dataset": [4, 480, 483], "datatyp": 51, "dbuild_shared_lib": 8, "dcmake_build_typ": 8, "ddof": [0, 75, 80, 280, 300], "deal": 479, "debug": [1, 3, 480], "debugg": 7, "decai": [456, 459, 461, 467, 468, 469, 472], "decay_r": [456, 469, 472], "decay_step": 468, "decent": 6, "decid": [311, 362], "decim": [0, 66, 260], "declar": 2, "decltyp": 1, "decod": 399, "decomposit": [183, 184, 191], "decor": [1, 112], "decoupl": 459, "deep": [328, 402, 403, 404, 405], "def": [1, 2, 4, 5, 6, 112, 143, 299, 324, 452, 479, 480, 481, 482, 483, 484, 487], "default": [1, 2, 8, 14, 15, 16, 17, 18, 26, 27, 28, 29, 82, 83, 90, 94, 95, 98, 99, 100, 101, 102, 103, 104, 112, 113, 114, 116, 117, 
118, 121, 122, 123, 125, 126, 127, 140, 141, 143, 145, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 163, 164, 165, 168, 169, 172, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 203, 205, 207, 208, 216, 217, 218, 221, 225, 229, 232, 233, 235, 237, 238, 240, 241, 242, 244, 245, 246, 247, 248, 250, 251, 252, 256, 257, 260, 267, 268, 274, 275, 278, 279, 280, 282, 284, 286, 291, 293, 294, 295, 296, 297, 298, 299, 300, 303, 305, 307, 317, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 339, 342, 343, 345, 348, 349, 351, 352, 355, 356, 358, 363, 365, 370, 372, 375, 378, 379, 380, 381, 383, 386, 391, 395, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 414, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 452, 455, 456, 457, 458, 459, 460, 461, 466, 467, 468, 476, 478, 479, 481, 484, 486, 488], "default_devic": 488, "default_stream": 488, "defin": [1, 2, 4, 5, 6, 8, 112, 126, 143, 164, 185, 189, 238, 307, 310, 484], "definit": [112, 183, 184, 245], "degre": [0, 239, 434], "delta": [426, 455], "delv": [404, 405], "demonstr": 484, "denomin": [348, 422, 455, 457, 458, 459, 460, 466], "dens": [208, 487], "depend": [0, 2, 3, 4, 8, 78, 189, 343, 349, 383, 480, 482, 486, 487], "depth": [310, 332, 335, 338, 481], "dequant": [0, 237], "deriv": [2, 481, 483], "descend": 360, "descent": [467, 479, 483], "describ": [2, 483], "descript": [2, 5, 317], "design": [1, 4, 7, 476, 487], "destin": [0, 2, 60, 127, 223, 236], "destroi": 479, "detach": 481, "detail": [1, 2, 10, 216, 324, 337, 386, 391, 400, 402, 403, 404, 405, 455, 457, 458, 460, 461, 482, 485], "determin": [0, 2, 118, 245, 313, 317, 369, 486], "dev": [2, 8], "develop": [2, 8], "developer_dir": 8, "deviat": [0, 246, 280, 402, 404, 407], "deviatoin": 0, "devic": [1, 2, 7, 8, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 
72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 121, 122, 125, 126, 127, 128, 129, 130, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 210, 217, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 244, 245, 246, 247, 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 300, 301, 304, 305, 306, 315, 487, 488], "device_info": 218, "devicetyp": 9, "df": 484, "dfdx": [481, 482], "dft": [147, 148, 149, 150, 151, 152, 156, 157, 158], "dhwc": 338, "diag": [0, 191], "diagon": [0, 45, 117, 140, 294, 296, 297, 298], "dict": [94, 136, 194, 210, 263, 264, 265, 309, 368, 373, 376, 377, 452, 454, 462, 463, 465, 478, 481, 486], "dict_kei": [311, 463], "dictionari": [5, 94, 194, 210, 263, 264, 309, 310, 313, 324, 362, 371, 376, 377, 464, 478, 486], "did": 5, "diff": 2, "differ": [7, 178, 283, 301, 433, 481], "differenti": [1, 2, 7, 329, 409], "difficult": 481, "difficulti": [402, 403], "dilat": [0, 98, 99, 100, 101, 102, 103, 104, 330, 331, 333, 334], "dim": [1, 5, 145, 146, 340, 344, 348, 350, 378, 380, 382, 386, 391, 399], "dimens": [0, 1, 2, 5, 15, 17, 26, 27, 62, 68, 78, 84, 85, 86, 94, 99, 100, 101, 103, 104, 118, 138, 145, 154, 155, 157, 158, 159, 163, 164, 171, 183, 184, 186, 187, 188, 189, 190, 191, 192, 203, 204, 205, 207, 221, 235, 236, 237, 241, 250, 280, 284, 288, 291, 295, 300, 328, 330, 331, 
332, 333, 334, 335, 337, 338, 342, 343, 344, 348, 349, 350, 378, 382, 383, 386, 399, 400, 414, 423, 479, 481], "dimension": [30, 142, 144, 147, 148, 149, 150, 151, 152, 156, 157, 158, 326, 327, 328, 330, 331, 332, 333, 334, 335, 340, 352, 355, 356, 380, 381, 391, 482, 484], "direct": [2, 5, 360, 461, 487], "directli": [2, 5, 83], "directori": [2, 5, 8], "disabl": [119, 216, 479], "disable_compil": 479, "disappoint": 5, "discard": [5, 310], "discov": 8, "discoveri": 461, "discret": [105, 147, 148, 149, 150, 151, 152, 156, 157, 158, 340, 380], "discuss": 2, "disk": 5, "dispatch": 2, "dispatchthread": [1, 2], "displai": 324, "distanc": [5, 434], "distribut": [7, 8, 240, 241, 242, 244, 245, 246, 251, 252, 352, 402, 403, 404, 405, 407, 408, 424, 427, 432, 434, 449], "diverg": 427, "divid": [0, 2, 38, 161, 237, 255, 480], "divis": [0, 128, 161, 237, 255], "divisor": [280, 300], "divmod": 0, "dloss_dw": 481, "dloss_dx": 481, "dlpack": 484, "dlvalu": 299, "dmlx_build_cpu": 8, "dmlx_build_gguf": 8, "dmlx_build_safetensor": 8, "dmlx_metal_debug": 3, "dmlx_metal_jit": 8, "do": [0, 2, 5, 8, 301, 324, 364, 375, 449, 452, 459, 479, 480, 481, 483], "doc": [2, 6, 480], "document": [2, 3, 65, 79, 143, 263, 264, 317, 479, 481, 482], "doe": [0, 2, 3, 5, 8, 211, 301, 309, 324, 479, 482, 483, 484], "doesn": [2, 324], "domain": [251, 480], "don": [1, 8, 479, 487], "done": [324, 336, 382, 479, 480, 483, 484], "dot": [183, 188, 192, 291, 310, 367, 378], "doubl": [0, 5], "doubt": 5, "down": [5, 309], "dparam": 299, "draw": 241, "drop": 362, "dropout": [324, 337, 338, 372, 399, 479], "dropout2d": 324, "dropout3d": 324, "dst": 127, "dt": 134, "dtype": [0, 1, 2, 5, 11, 18, 30, 37, 38, 78, 81, 125, 126, 140, 143, 159, 162, 169, 178, 186, 187, 189, 190, 193, 229, 242, 244, 245, 246, 248, 251, 252, 294, 296, 301, 305, 317, 370, 400, 401, 402, 403, 404, 405, 406, 407, 408, 421, 423, 430, 468, 469, 470, 471, 472, 479, 480, 481, 482, 484, 485, 486], "dtypecategori": [178, 317], "dual": 429, 
"duchi": 457, "dure": [3, 94, 336, 337, 338, 400, 484], "dx": 112, "dy": 112, "dyld": 480, "dyld_library_path": 480, "dylib": 2, "dynam": 483, "e": [2, 6, 8, 112, 134, 143, 163, 164, 179, 269, 328, 330, 331, 332, 333, 334, 335, 337, 338, 344, 348, 350, 363, 382, 419, 420, 442, 443, 448, 454, 457, 479, 483, 488], "e5": 317, "e8": 317, "each": [0, 1, 2, 68, 116, 136, 141, 145, 164, 178, 183, 184, 186, 187, 188, 191, 192, 204, 208, 232, 237, 238, 241, 256, 265, 266, 275, 292, 295, 301, 303, 304, 337, 338, 340, 343, 344, 349, 383, 386, 399, 421, 423, 476, 479, 480, 483], "eager": 483, "earli": 337, "earlier": 2, "eas": 5, "easi": [2, 324, 480], "easier": [1, 483], "edg": [93, 232, 400, 479], "edit": [8, 377], "effect": [337, 479, 483], "effici": [5, 7, 163, 337, 386, 483, 485], "eigenvalu": [186, 187], "eigenvector": 186, "einstein": [130, 131], "einsum": 131, "either": [8, 13, 65, 78, 79, 87, 88, 89, 93, 128, 129, 133, 161, 166, 167, 180, 181, 182, 189, 199, 204, 206, 222, 224, 228, 234, 255, 258, 283, 299, 327, 356, 388, 400, 404, 405, 484], "elem": [1, 143], "elem_to_loc": [1, 2], "element": [0, 1, 2, 12, 13, 19, 20, 21, 22, 23, 24, 25, 28, 70, 83, 87, 88, 89, 92, 106, 107, 108, 109, 110, 111, 116, 128, 129, 133, 134, 135, 137, 139, 140, 141, 160, 161, 164, 166, 167, 172, 173, 174, 175, 176, 177, 180, 181, 182, 195, 196, 197, 198, 199, 200, 201, 202, 206, 208, 222, 224, 226, 228, 233, 234, 237, 238, 254, 255, 256, 258, 259, 261, 269, 270, 271, 272, 276, 277, 283, 287, 289, 290, 293, 299, 301, 304, 329, 336, 337, 338, 343, 347, 349, 357, 379, 383, 386, 390, 409, 416, 417, 419, 420, 435, 436, 438, 441, 442, 443, 444, 479, 481], "elementwis": [1, 96, 97], "elif": 5, "ellipsi": 482, "elman": 383, "els": [0, 2, 5, 324, 363, 480, 483], "elsewher": [296, 482], "elu": [324, 439], "emb": [5, 340, 380, 391], "embed": [5, 307, 324, 380, 386, 391, 422], "empti": [127, 245], "enabl": [3, 5, 8, 94, 132, 467], "encod": [2, 145, 386, 391, 399, 423], "encount": [2, 481], "end": 
[118, 184, 214, 237, 259, 327, 343, 349, 356, 397, 415, 426, 433, 439, 445, 446, 468, 471], "end_axi": [0, 49, 159], "end_encod": 2, "endif": 2, "endswith": 363, "enhanc": [5, 386, 483], "enjoi": 2, "enough": [2, 483], "ensur": [0, 1, 2, 8, 143, 309, 429, 480], "ensure_row_contigu": [1, 143], "enter": 5, "entir": [15, 17, 26, 27, 203, 205, 207, 221, 235, 280, 284, 300, 337, 338], "entri": [0, 247, 337, 338], "entropi": [6, 421, 423], "enumer": 324, "environ": [8, 119, 132, 480], "ep": [4, 142, 144, 328, 344, 348, 350, 382, 422, 424, 434, 454, 455, 456, 457, 458, 459, 460, 466], "epoch": 6, "epsilon": [328, 344, 348, 350, 382, 422, 424, 455, 457, 458, 459, 460, 466], "epsilon_1": 456, "epsilon_2": 456, "equal": [0, 1, 16, 28, 82, 140, 167, 172, 182, 228, 233, 248, 275, 348, 352], "equal_nan": [0, 16, 82, 172], "equat": [130, 131], "equival": [0, 2, 31, 65, 79, 126, 129, 161, 164, 168, 287, 329, 339, 341, 345, 346, 347, 353, 354, 377, 379, 381, 384, 385, 387, 389, 392, 393, 394, 395, 396, 398], "erf": [0, 135, 479], "erfinv": 0, "error": [0, 2, 8, 123, 134, 135, 217, 218, 275, 341, 411, 412, 413, 429, 431, 481, 484], "error_norm": 4, "estim": [458, 460], "eta": 461, "etc": [2, 237, 324, 400, 480], "eval": [2, 3, 4, 5, 6, 324, 452, 454, 479, 480, 481, 483, 485], "eval_cpu": 2, "eval_fn": 6, "eval_gpu": 2, "evalu": [2, 5, 6, 7, 127, 136, 179, 302, 324, 361, 372, 452, 454, 479, 485], "even": [1, 2, 5, 94, 479, 483, 484], "evenli": [0, 193], "everi": [237, 311, 454, 472, 481], "everyth": [5, 480], "everywher": 0, "exact": [412, 413], "exactli": [2, 5, 145, 365, 481], "exampl": [0, 3, 4, 5, 6, 8, 18, 38, 112, 143, 159, 178, 186, 187, 189, 190, 282, 287, 309, 312, 313, 324, 326, 327, 328, 348, 355, 356, 363, 365, 372, 375, 400, 401, 402, 403, 404, 405, 406, 407, 408, 421, 423, 430, 449, 454, 463, 468, 469, 470, 471, 472, 476, 481, 482, 483, 484, 485, 486], "exce": 309, "exceed": 217, "except": [7, 140, 153, 154, 156, 157, 158, 344, 365, 482, 484], "exclud": [236, 288], 
"exclus": [0, 83, 89], "execut": [2, 8, 84, 85, 86, 213, 484, 487], "exist": [2, 3, 5, 363, 375], "exp": [0, 1, 139, 143, 199, 203, 242, 273, 329, 339, 390, 409, 410, 427, 439, 440, 444, 479, 487], "exp_elementwis": [1, 143], "expand_dim": 0, "expect": [2, 5, 330, 331, 332, 333, 334, 335, 336, 337, 338, 391, 399, 424, 479, 482], "expens": 399, "expensive_fun": 483, "experiment": 484, "explain": 2, "explicit": [2, 463, 476, 484], "explicitli": [163, 324, 476], "explor": 8, "expm1": 0, "exponenti": [0, 137, 139, 329, 339, 387, 409, 410, 439, 469], "exponential_decai": 454, "export": 8, "ext_modul": 2, "extend": [2, 232], "extens": [7, 194, 219, 369, 486], "extern": 484, "extra": [1, 311, 312], "extract": [0, 5, 45, 117, 118, 324, 362, 452, 480], "extras_requir": 2, "extrem": [482, 483], "ey": [0, 5, 188, 192], "f": [0, 2, 4, 6, 112, 189, 324, 349, 459, 479, 484], "f_jvp": 112, "f_t": 349, "f_vjp": 112, "f_vmap": 112, "face": 5, "factor": [2, 14, 168, 183, 184, 190, 400, 423, 469, 472], "fall": [2, 112], "fallback": 2, "fals": [0, 1, 2, 5, 15, 16, 17, 26, 27, 33, 34, 35, 36, 41, 42, 43, 44, 56, 57, 58, 59, 63, 75, 76, 80, 82, 94, 101, 108, 109, 110, 111, 123, 143, 172, 178, 183, 184, 189, 192, 194, 203, 205, 207, 208, 217, 221, 235, 280, 284, 300, 304, 307, 310, 311, 312, 313, 317, 344, 348, 350, 352, 363, 365, 375, 378, 381, 386, 391, 399, 400, 421, 424, 456, 467, 484], "famili": 5, "fan": [402, 403, 404, 405], "fan_in": [402, 403, 404, 405], "fan_out": [402, 403, 404, 405], "far": 454, "fast": [1, 7, 341, 413, 480, 487], "faster": [1, 2, 8, 129, 411, 421, 479, 481], "featur": [1, 7, 98, 99, 100, 101, 102, 103, 104, 145, 328, 343, 344, 348, 349, 350, 352, 381, 382, 383, 386, 399, 400, 479, 483], "feed": 5, "feed_forward": 5, "feedforward": [402, 403], "feel": 5, "fetch": 1, "few": [1, 2, 5, 6, 7, 8, 480, 483, 485], "ffn": 5, "ffn_norm": 5, "fft": 7, "figur": 480, "file": [5, 8, 194, 262, 263, 264, 265, 266, 365, 369, 480, 481, 486], "file_or_weight": 365, "fill": [0, 
2, 162, 230, 296, 306, 401, 402, 403, 404, 405, 407, 408], "filter": [0, 105, 330, 331, 332, 333, 334, 335, 358, 362], "filter_and_map": 324, "filter_fn": [358, 362], "final": [2, 4, 5, 6, 168, 468, 471], "find": [2, 4, 8, 480], "find_packag": 2, "finder": 8, "fine": [476, 483], "finetun": 324, "finish": 2, "finit": [0, 173, 225], "first": [0, 1, 2, 3, 4, 5, 6, 8, 118, 121, 159, 165, 178, 180, 191, 200, 202, 204, 233, 250, 258, 285, 291, 294, 299, 310, 312, 313, 324, 327, 344, 356, 400, 422, 430, 456, 458, 459, 460, 463, 479, 481, 484, 487], "first_lay": 483, "fit": [2, 237, 487], "five": 479, "fix": [2, 5, 8, 483], "flag": [2, 8, 479, 484], "flat": [163, 164, 310, 314], "flat_param": 265, "flatten": [0, 28, 29, 108, 109, 110, 111, 189, 231, 233, 236, 256, 259, 274, 287, 288, 293, 310], "flexibl": 7, "flexibli": 377, "flip": [0, 101, 105], "float": [0, 1, 2, 11, 14, 16, 18, 78, 142, 143, 144, 145, 146, 161, 162, 168, 172, 178, 189, 225, 238, 240, 244, 246, 309, 317, 328, 336, 337, 338, 344, 348, 350, 358, 370, 382, 386, 391, 397, 399, 400, 401, 402, 403, 404, 405, 407, 408, 422, 423, 424, 426, 430, 433, 434, 445, 446, 455, 456, 457, 458, 459, 460, 461, 466, 467, 468, 469, 471, 472], "float16": [1, 2, 11, 143, 168, 194, 317, 358, 483, 484], "float16_t": [1, 2], "float32": [0, 1, 2, 11, 18, 140, 143, 146, 168, 169, 178, 186, 187, 189, 190, 193, 229, 242, 244, 245, 246, 251, 252, 296, 305, 317, 400, 401, 402, 403, 404, 405, 406, 407, 408, 421, 423, 430, 468, 469, 470, 471, 472, 479, 480, 481, 482, 483, 484, 485, 486], "float64": 178, "floor": [0, 1, 161], "floor_divid": 0, "flow": [0, 281, 483], "flush": 2, "fn": [308, 311, 312, 313, 485], "follow": [1, 2, 5, 6, 7, 8, 18, 105, 116, 141, 163, 189, 232, 237, 312, 324, 412, 413, 427, 455, 456, 457, 458, 459, 460, 461, 467, 476, 479, 480, 481, 487], "foo": 480, "food": 5, "forc": [5, 6, 324, 480, 485], "forg": 8, "formal": [116, 141, 237], "format": [5, 194, 262, 263, 264, 265, 266, 484], "formul": [329, 339], "formula": 
433, "forth": 400, "forward": [1, 2, 299, 479, 483], "found": 362, "four": 328, "fourier": [147, 148, 149, 150, 151, 152, 156, 157, 158], "frac": [134, 237, 269, 326, 327, 328, 336, 337, 338, 344, 348, 350, 352, 355, 356, 382, 390, 402, 403, 404, 405, 422, 424, 426, 429, 440, 442, 443, 455, 457, 458, 459, 460, 466], "fraction": 18, "framework": [2, 7], "free": 216, "freez": [324, 375, 452], "freq": 145, "frequenc": [145, 386, 391], "frequent": [479, 483], "friend": 5, "fro": 189, "frobeniu": 189, "from": [0, 1, 2, 5, 6, 7, 83, 115, 116, 118, 121, 122, 125, 126, 127, 141, 143, 154, 155, 157, 158, 162, 163, 168, 189, 194, 204, 208, 213, 216, 230, 237, 239, 240, 241, 242, 243, 244, 248, 251, 265, 278, 281, 283, 287, 288, 293, 294, 304, 306, 310, 311, 312, 313, 314, 324, 352, 363, 365, 378, 402, 403, 404, 405, 407, 408, 424, 433, 449, 454, 478, 479, 480, 481, 483, 484, 485, 486, 487], "from_embed": 380, "from_linear": 381, "front": 2, "frozen": [324, 363, 373, 375, 381, 452], "fuction": 129, "full": [0, 1, 2, 6, 65, 79, 105, 143, 273, 376, 377, 424, 479, 480, 483], "full_turn": 391, "fulli": [2, 7, 480, 484, 487], "fun": [94, 165, 179, 299, 302, 303, 479, 482, 483, 487], "fun1": 483, "func": 383, "function": [0, 1, 2, 3, 4, 5, 6, 7, 16, 18, 83, 94, 112, 129, 134, 135, 143, 165, 172, 179, 183, 184, 186, 187, 188, 189, 190, 191, 192, 204, 218, 269, 299, 302, 303, 308, 309, 311, 312, 313, 324, 329, 339, 341, 342, 345, 346, 347, 353, 354, 357, 359, 363, 370, 375, 379, 383, 384, 385, 387, 388, 389, 390, 392, 393, 394, 395, 396, 397, 398, 399, 411, 412, 413, 414, 415, 416, 417, 419, 420, 421, 435, 440, 442, 443, 444, 445, 446, 447, 449, 454, 463, 476, 478, 480, 482, 483, 484, 486], "functool": 479, "further": [2, 8, 481], "fuse": [1, 479], "fusibl": 479, "futur": [5, 381, 482, 483], "g": [3, 8, 112, 143, 189, 237, 349, 448, 466, 467, 483, 488], "g_t": [349, 455, 457, 458, 459, 460, 461, 466, 467], "gain": [402, 403, 404, 405], "gamma": [328, 344, 348, 350, 382, 402, 403, 
404, 405], "gap": 1, "gate": [342, 343, 414], "gather": [0, 121, 163, 164], "gather_mm": [0, 164], "gather_qmm": 0, "gaurante": 301, "gaussian": [4, 341, 411, 412, 413, 424], "gaussian_nll_loss": 324, "gelu": [324, 412, 413, 479], "gelu_approx": [324, 341, 411], "gelu_fast_approx": [324, 341, 411], "geluapprox": 341, "gelufast": 341, "gener": [0, 1, 2, 3, 4, 11, 18, 101, 140, 143, 154, 155, 193, 208, 240, 245, 246, 247, 248, 251, 252, 399, 476, 479, 482, 483, 488], "general_": 2, "generate_stub": 8, "geq": [397, 446], "get": [2, 4, 6, 8, 99, 100, 101, 103, 104, 113, 114, 210, 211, 212, 213, 243, 324, 479, 481, 483, 487], "get_cache_memori": 209, "get_command_encod": 2, "get_kernel": 2, "gguf": [8, 194, 263, 486], "gh": 1, "gii": 1, "git": 8, "github": [4, 6, 8, 479], "give": [2, 5, 6, 28, 479], "given": [0, 2, 8, 15, 17, 28, 38, 83, 91, 93, 95, 108, 109, 110, 111, 116, 118, 131, 136, 138, 141, 147, 148, 149, 150, 151, 152, 156, 157, 158, 162, 163, 189, 203, 205, 207, 216, 221, 225, 227, 235, 245, 247, 248, 259, 260, 268, 273, 275, 280, 284, 286, 292, 293, 294, 296, 297, 298, 300, 315, 326, 327, 336, 355, 356, 362, 378, 422, 424, 430], "gix": 1, "gix_mult": 1, "giy_mult": 1, "global": [119, 121, 122, 123, 125, 126, 127, 132, 249, 309, 476, 479], "glorot": [402, 403], "glorot_norm": 324, "glorot_uniform": 324, "glu": [5, 324], "gm": 1, "gn": 1, "go": [2, 5, 481], "golub": 189, "good": [2, 8, 454, 479, 480, 487], "goroshin": 337, "gower": 5, "gpu": [1, 3, 7, 8, 210, 482, 487], "gputrac": [3, 219], "grad": [2, 4, 6, 299, 309, 454, 462, 479, 480, 481, 482, 483, 485], "grad_fn": [4, 479, 481], "gradient": [0, 4, 6, 112, 165, 281, 299, 308, 309, 324, 363, 376, 381, 399, 429, 452, 454, 455, 456, 458, 459, 460, 461, 462, 465, 467, 479, 480, 481, 482, 483, 484, 485], "grain": 476, "graph": [2, 5, 6, 7, 481], "great": 3, "greater": [0, 5, 28, 139, 167, 233, 309, 397, 446], "greater_equ": 0, "grep": 8, "grid": [2, 143, 208], "grid_dim": 2, "grid_grad": 1, "grid_idx": 1, 
"grid_sampl": 1, "grid_sample_grad": 1, "grid_sample_ref": 1, "grid_sample_vjp": 1, "grid_shap": 1, "grid_siz": 1, "ground": [4, 5, 423, 433], "group": [0, 1, 98, 99, 100, 101, 102, 103, 104, 116, 121, 122, 123, 125, 126, 127, 141, 146, 164, 237, 238, 301, 307, 330, 344, 380, 381, 480], "group_dim": 2, "group_siz": [0, 116, 141, 164, 237, 238, 307, 380, 381], "groupnorm": 324, "grow": 483, "gru": 324, "guid": [2, 7], "gw": 1, "h": [1, 2, 98, 99, 100, 102, 103, 104, 189, 327, 328, 331, 332, 334, 335, 337, 338, 343, 349, 356, 383, 481, 483], "h_": [327, 343, 349, 356, 383], "h_in": 1, "h_stride": 1, "h_t": [343, 349, 383], "ha": [2, 3, 5, 6, 7, 8, 78, 94, 118, 127, 153, 154, 156, 157, 158, 165, 183, 184, 186, 187, 188, 191, 192, 208, 212, 241, 328, 343, 349, 352, 383, 452, 454, 479, 482, 483, 485, 487], "had": 5, "hadamard": [0, 168], "hadamard_transform": 0, "half": [2, 18, 248, 252, 386, 483], "halv": [342, 414], "hand": [5, 481, 483], "handi": 481, "handl": [2, 324, 479], "happen": [2, 5, 142, 399, 454, 479, 483], "happi": 5, "hard": 5, "hard_shrink": [324, 345], "hard_tanh": [324, 346], "hardshrink": [324, 415], "hardswish": 324, "hardtanh": [324, 416], "hat": [116, 141, 237], "have": [0, 1, 2, 5, 8, 16, 82, 84, 85, 86, 90, 121, 154, 155, 157, 158, 164, 172, 204, 219, 241, 301, 310, 349, 378, 388, 461, 463, 478, 479, 480, 482, 483, 487], "haven": 5, "hazan": 457, "he": [5, 404, 405], "he_norm": 324, "he_uniform": 324, "head": [146, 378, 399], "header": [2, 143], "heart": 5, "heavi": 5, "height": [327, 328, 331, 332, 334, 335, 337, 338, 356], "hello": [310, 314], "help": [2, 5, 479, 487], "helper": [5, 143, 479], "henc": [0, 2, 237, 479], "hendryck": 413, "here": [2, 5, 454, 479, 481, 483, 486, 487], "hermitian": [186, 187], "hf": 349, "hg": 349, "hh": 383, "hi": [5, 349], "hidden": [343, 349, 383, 399], "hidden_dim": [6, 452, 454], "hidden_s": [343, 349, 383], "hierarchi": 317, "high": [248, 252, 324, 340, 408, 449], "high_pad_s": 0, "higher": [2, 171, 218, 430, 
481], "highli": 8, "him": 5, "hing": 425, "hinge_loss": 324, "hinton": 466, "hit": 2, "hn": 343, "ho": 349, "hold": [2, 5, 10, 11, 189, 479], "homebrew": 480, "hopkin": 189, "host": 2, "host1": 480, "host2": 480, "host_nam": [1, 2], "hostfil": 480, "hostnam": 480, "hot": 423, "hour": 5, "how": [2, 5, 6, 324, 326, 327, 330, 331, 332, 333, 334, 335, 340, 355, 356, 380, 400, 462, 479, 482, 487], "howev": [2, 112, 324, 341, 344, 463, 476, 479, 480, 483, 484], "hr": 343, "http": [344, 348, 350, 357, 382, 413, 435], "huber": 426, "huber_loss": 324, "human": [404, 405], "hundr": 8, "hurri": 5, "hutter": 459, "hyperbol": [0, 20, 22, 25, 107, 272, 290, 398, 447], "hz": 343, "i": [0, 1, 2, 3, 5, 6, 7, 8, 16, 18, 28, 37, 78, 83, 93, 99, 100, 101, 103, 104, 105, 108, 109, 110, 111, 112, 117, 118, 121, 122, 124, 125, 126, 127, 129, 136, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 161, 162, 163, 164, 168, 172, 173, 178, 179, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 194, 199, 203, 204, 208, 214, 217, 218, 232, 233, 236, 237, 238, 245, 246, 247, 257, 259, 262, 263, 264, 269, 273, 275, 280, 281, 286, 287, 288, 291, 294, 295, 299, 300, 301, 302, 303, 304, 307, 309, 310, 311, 312, 313, 317, 319, 324, 326, 327, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 341, 343, 344, 348, 349, 350, 352, 355, 356, 362, 363, 369, 371, 372, 374, 375, 377, 378, 379, 381, 382, 383, 386, 391, 397, 399, 400, 404, 405, 411, 413, 421, 422, 424, 429, 430, 433, 434, 436, 441, 446, 452, 454, 456, 459, 461, 462, 463, 468, 470, 471, 476, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488], "i386": 8, "i_n": 1, "i_nw": 1, "i_s": 1, "i_sw": 1, "i_t": 349, "iclr": [458, 459, 460], "id": [6, 8], "idea": [481, 483], "idempot": [363, 375], "ident": [0, 112, 140, 281, 324, 372], "identifi": [2, 310, 478], "idim": 6, "idiom": [6, 479], "idx": [38, 482], "ie": [375, 480], "ieee": 317, "ignor": [5, 38, 93, 94, 136, 456], "ih": 383, "ii": 1, "ij": 208, "imag": [0, 
331, 332, 334, 335, 337, 338, 400], "imagenet": [404, 405], "imaginari": 170, "immedi": [5, 358], "implement": [0, 1, 4, 6, 145, 146, 189, 340, 362, 378, 386, 388, 391, 397, 399, 400, 446, 455, 456, 457, 458, 460, 461, 462, 474, 479, 481], "impli": 301, "implicit": [476, 479, 481], "implicitli": 483, "import": [2, 3, 4, 5, 6, 8, 112, 168, 189, 265, 299, 310, 311, 312, 313, 314, 324, 326, 327, 328, 348, 355, 356, 365, 400, 421, 423, 430, 449, 452, 454, 479, 480, 481, 482, 483, 484, 485], "improv": [1, 2, 3, 5, 421, 455, 456, 457, 458, 459, 460, 466, 479, 480], "in_ax": [303, 481], "in_channel": [330, 331, 332, 333, 334, 335], "in_dim": [324, 452], "in_proj": 452, "inci": 2, "includ": [1, 2, 108, 109, 110, 111, 143, 211, 212, 217, 350, 359, 371, 381, 424, 454, 479, 481, 482, 485, 486, 488], "include_dir": 2, "inclus": [0, 41, 42, 43, 44, 108, 109, 110, 111, 159], "incom": 2, "inconveni": 479, "incorpor": 484, "incorrect": 484, "increas": 218, "increment": 18, "incur": [5, 8], "incx": 2, "independ": [120, 337, 338], "index": [0, 1, 2, 7, 9, 28, 38, 138, 140, 165, 208, 233, 287, 288, 299, 315], "indic": [0, 2, 16, 26, 27, 28, 29, 38, 163, 164, 165, 172, 173, 174, 175, 176, 177, 178, 191, 236, 275, 287, 288, 299, 372, 374, 423, 430, 470, 482], "indices_or_sect": [71, 275], "indirectli": 484, "individu": [324, 337, 338], "ineffici": [482, 483], "inexact": [11, 178], "inf": [189, 225, 378], "infer": [7, 162, 194, 294, 480], "infin": [0, 174, 176, 177, 225, 355, 356, 460], "infinit": [16, 172, 173], "info": [5, 8], "inform": [3, 5, 6, 8, 131, 210, 263, 264, 317, 324, 328, 341, 378, 481, 487], "inherit": [6, 478], "inifn": 174, "init": [324, 379, 449, 454, 468, 469, 471, 472, 480], "init_fn": [401, 402, 403, 404, 405, 406, 407, 408, 449], "init_valu": 1, "initi": [1, 3, 4, 5, 123, 313, 324, 328, 344, 348, 350, 352, 379, 382, 401, 402, 403, 404, 405, 406, 407, 408, 452, 463, 468, 469, 471, 472, 479, 480, 483], "initializer_list": 0, "inject": 0, "inlin": 0, "inner": [0, 
479], "inorm": 348, "inp": [1, 143], "inp_ndim": 1, "inp_shap": 1, "inp_strid": 1, "inplac": [2, 8], "input": [0, 1, 2, 4, 5, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 115, 117, 118, 121, 122, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 221, 222, 223, 224, 225, 226, 228, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 247, 250, 253, 254, 255, 256, 257, 258, 259, 260, 261, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 297, 298, 299, 300, 301, 303, 304, 306, 326, 327, 328, 330, 331, 332, 333, 334, 335, 337, 338, 340, 342, 343, 344, 348, 349, 350, 352, 355, 356, 378, 381, 382, 383, 386, 397, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 414, 421, 422, 424, 425, 426, 427, 429, 430, 432, 434, 446, 449, 479, 481, 482, 485, 486], "input_dil": [0, 101], "input_dim": [6, 324, 352, 381], "input_nam": [1, 143], "input_s": [343, 349, 383], "inputs1": 430, "inputs2": 430, "insert": [118, 138, 487], "insid": 479, "inspect": [3, 479, 485], "inspir": 7, "instabl": 434, "instal": 2, "instanc": [5, 38, 112, 237, 314, 324, 348, 358, 359, 360, 363, 365, 366, 367, 372, 375, 376, 377, 388, 452, 484], "instancenorm": 324, "instanti": [1, 2, 6, 483], "instantiate_axpbi": 2, "instead": [2, 8, 112, 324, 377, 391, 480, 481, 483], "int": [0, 1, 2, 5, 6, 9, 15, 17, 18, 26, 27, 28, 29, 33, 34, 35, 36, 41, 42, 43, 44, 45, 46, 49, 56, 57, 58, 59, 60, 63, 66, 68, 71, 74, 75, 76, 77, 78, 80, 83, 90, 91, 95, 98, 99, 100, 101, 102, 103, 
104, 108, 109, 110, 111, 116, 117, 118, 125, 126, 127, 131, 138, 140, 141, 145, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 162, 164, 165, 169, 178, 185, 189, 193, 203, 205, 207, 210, 211, 212, 213, 216, 217, 218, 221, 223, 229, 232, 233, 235, 236, 237, 238, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 256, 257, 259, 260, 273, 274, 275, 278, 279, 280, 284, 285, 287, 288, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 303, 305, 307, 315, 324, 326, 327, 328, 330, 331, 332, 333, 334, 335, 340, 342, 343, 344, 348, 349, 350, 352, 355, 356, 378, 380, 381, 382, 383, 386, 391, 399, 414, 422, 423, 427, 432, 434, 452, 468, 470, 471, 472], "int16": 317, "int32": [0, 1, 11, 18, 38, 159, 178, 189, 248, 317, 400, 482, 485], "int64": [11, 317], "int8": [11, 317], "int_0": 134, "integ": [0, 11, 161, 163, 164, 178, 189, 210, 232, 237, 238, 240, 247, 248, 275, 287, 291, 303, 317, 340, 370, 470, 482], "integr": [18, 287, 483], "intend": [0, 479], "interact": 399, "interest": 487, "interfac": 2, "intermedi": 484, "intern": 328, "interpol": 400, "interv": [18, 193, 248, 252], "introduc": [0, 259], "intuit": 324, "invalid": [0, 83], "invers": [0, 19, 20, 21, 22, 23, 24, 25, 135, 150, 151, 152, 153, 154, 155, 184, 188, 192], "invert": 0, "involv": [454, 479], "iogpu": 218, "ip": 480, "is_avail": 123, "is_equival": 2, "is_floating_point": 2, "is_leaf": [310, 311, 312, 313], "is_leaf_fn": 362, "isclos": 0, "isfinit": 0, "ish": 5, "ishmael": 5, "isinf": 0, "isnan": 0, "isneginf": 0, "isposinf": 0, "issu": [480, 481, 484], "issubdtyp": [11, 317], "item": [0, 2, 4, 5, 6, 311, 454, 483, 484, 485], "iter": [4, 6, 191, 311, 312, 476, 479, 483], "iterm": 8, "itertool": [5, 311], "its": [0, 1, 2, 8, 184, 204, 233, 250, 296, 308, 314, 324, 381, 454, 458, 459, 460, 480, 483, 484, 487], "itself": [2, 307, 463], "ix": 1, "ix_n": 1, "ix_nw": 1, "ix_s": 1, "ix_sw": 1, "iy_n": 1, "iy_nw": 1, "iy_s": 1, "iy_sw": 1, "j": [5, 8, 189, 337, 457, 458, 460], "j8": 2, 
"jacobian": [2, 179, 302, 485], "jain": 337, "jax": [7, 476], "jit": 143, "jmlr": 457, "jnp": 484, "john": 189, "join": 470, "join_schedul": 454, "jointli": 245, "just": [2, 6, 350, 479, 482], "jvp": [2, 112, 485], "k": [0, 5, 45, 90, 117, 140, 146, 163, 168, 293, 296, 297, 298, 326, 352, 355, 363], "k_h": [327, 356], "k_w": [327, 356], "kaim": 405, "keep": [2, 15, 17, 26, 27, 203, 205, 207, 221, 235, 280, 284, 300, 324, 362, 481, 483], "keepdim": [0, 15, 17, 26, 27, 33, 34, 35, 36, 56, 57, 58, 59, 63, 75, 76, 80, 189, 203, 205, 207, 221, 235, 273, 280, 284, 300], "kei": [1, 3, 5, 146, 210, 240, 241, 242, 244, 245, 246, 247, 248, 250, 251, 252, 310, 311, 362, 363, 375, 378, 463, 476, 478, 481], "kept": 218, "kernel": [2, 7, 8, 98, 99, 100, 101, 102, 103, 104, 143, 326, 327, 355, 356, 479, 482], "kernel_dil": [0, 101], "kernel_s": [326, 327, 330, 331, 332, 333, 334, 335, 355, 356], "key_cach": 5, "key_input_dim": 378, "key_proj": 5, "keyword": [165, 265, 266, 299, 311, 324, 476, 486, 488], "kind": 5, "kingma": [458, 460], "kl_div_loss": 324, "kname": 2, "know": [2, 5], "known": [389, 441], "kth": [0, 28, 233], "kullback": 427, "kw_onli": 2, "kwarg": [10, 120, 265, 266, 488], "l": [5, 6, 183, 184, 186, 187, 324, 326, 328, 330, 333, 343, 349, 355, 383, 433], "l1": [299, 426, 428, 429, 433], "l1_loss": 324, "l2": [426, 429, 467], "l2_loss": 324, "l_": [326, 355, 426], "la": 189, "label": [3, 4, 423, 430], "label_smooth": 423, "lack": 482, "lambd": [345, 395, 415, 445], "lambda": [311, 312, 313, 324, 345, 358, 363, 370, 395, 415, 439, 445, 455, 456, 457, 458, 459, 460, 461, 466, 467, 479, 480, 481], "languag": [1, 2], "larg": [5, 324, 378, 429, 479, 480, 483], "larger": [1, 145, 218, 386, 461], "largest": [189, 225, 293], "lasso": 299, "last": [0, 1, 5, 29, 78, 142, 144, 149, 152, 154, 155, 157, 158, 159, 163, 164, 171, 183, 184, 186, 187, 188, 190, 191, 192, 204, 213, 241, 274, 291, 301, 330, 331, 332, 333, 334, 335, 337, 338, 344, 400, 484], "latenc": 480, "later": 
[3, 8, 454], "launch": [1, 2, 123, 480, 482], "layer": [7, 142, 307, 324, 326, 327, 337, 338, 343, 344, 349, 350, 352, 355, 356, 372, 377, 380, 381, 383, 388, 399, 448, 452], "layer_s": 6, "layernorm": 324, "layout": 1, "lazi": [7, 452, 485], "lazili": [5, 324], "lceil": 90, "ld": [343, 349, 383], "ldot": [326, 327, 355, 356], "lead": [0, 18, 83, 479], "leaf": [94, 307, 310, 311, 312, 313, 362], "leaf_modul": 324, "leaki": [351, 418], "leaky_relu": 324, "leakyrelu": 324, "learn": [4, 6, 7, 328, 344, 348, 350, 379, 382, 454, 455, 456, 457, 458, 459, 460, 461, 466, 467], "learnabl": [330, 331, 332, 333, 334, 335, 388], "learning_r": [6, 454, 455, 456, 457, 458, 459, 460, 461, 463, 466, 467, 468, 469, 470, 471, 472, 479], "least": [5, 84, 85, 86, 93, 183, 184, 186, 187, 188, 190, 191, 192, 237], "leav": [2, 136, 311, 312, 313], "lectur": 466, "lecun": 337, "left": [0, 5, 145, 180, 189, 237, 259, 326, 327, 341, 355, 356, 386, 400, 412, 413, 424, 426, 434], "left_shift": 0, "leibler": 427, "len": [5, 149, 152, 155, 158, 168, 470], "length": [5, 278, 328, 330, 333, 343, 349, 383, 470], "leq": [426, 439], "less": [0, 1, 5, 28, 182, 218, 233, 386, 433], "less_equ": 0, "let": [1, 2, 4, 5, 184, 479, 481, 483, 484], "level": [0, 163, 164, 404, 405], "lfloor": [326, 327, 355, 356], "lh": [343, 349, 383], "lhs_indic": [0, 163, 164], "lhs_mask": 90, "lib": 480, "libmlx": 8, "libmlx_ext": 2, "libmpi": 480, "librari": [2, 8, 319, 324], "like": [2, 5, 7, 126, 178, 230, 306, 338, 429, 463, 465, 479, 480, 481, 483, 484, 485, 487], "likelihood": [424, 432], "limit": [0, 2, 93, 216, 217, 218, 482], "linalg": 168, "line": [5, 480, 483, 484], "linear": [0, 2, 5, 6, 7, 307, 311, 324, 329, 339, 341, 342, 351, 365, 381, 383, 384, 385, 387, 389, 400, 409, 410, 411, 412, 413, 414, 418, 437, 438, 439, 441, 449, 452, 463, 471, 479], "linear1": 5, "linear2": 5, "linear3": 5, "linear_schedul": [454, 470], "linearli": 378, "link": [2, 8], "linspac": 0, "lion": 454, "list": [1, 5, 10, 15, 17, 30, 
71, 78, 83, 84, 85, 86, 91, 94, 95, 101, 131, 136, 143, 148, 149, 151, 152, 154, 155, 157, 158, 162, 165, 179, 189, 203, 205, 207, 208, 221, 229, 232, 235, 240, 241, 242, 244, 245, 246, 248, 251, 252, 263, 273, 275, 279, 280, 284, 291, 292, 295, 299, 300, 302, 305, 310, 313, 314, 324, 363, 365, 366, 367, 368, 373, 375, 376, 377, 452, 454, 458, 459, 460, 461, 470, 478, 479, 480, 481, 483], "liter": [2, 232, 400, 404, 405, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434], "littl": 5, "liu": 5, "live": [7, 143, 487], "ll": [1, 4, 6, 426, 479, 481], "llama": 5, "llamaattent": 5, "llamaencoderlay": 5, "llm": 7, "load": [6, 7, 319, 365, 480], "load_weight": [324, 483], "loader": 6, "loader_path": 2, "loan": 189, "loc": [1, 244, 246], "local": [324, 337, 480], "locat": [0, 2, 83, 376, 377, 480, 487], "log": [0, 197, 199, 203, 353, 354, 419, 420, 421, 424, 427, 429, 432, 444], "log10": 0, "log1p": 0, "log2": 0, "log_cosh_loss": 324, "log_sigmoid": [324, 353], "log_softmax": [324, 354], "logaddexp": 0, "logarithm": [0, 195, 196, 197, 198], "logcosh": 429, "logic": [0, 2, 200, 201, 202], "logical_and": 0, "logical_not": 0, "logical_or": 0, "logist": [0, 4, 269, 413, 441], "logit": [5, 241, 421, 423, 479], "logsigmoid": 324, "logsoftmax": 324, "logsumexp": 0, "long": 5, "longer": [5, 105, 481], "look": [2, 5, 480], "lookup": 340, "loop": [5, 6, 479, 480, 481, 483], "loshchilov": 459, "loss": [4, 6, 299, 324, 454, 479, 480, 481, 483], "loss_and_grad": 324, "loss_and_grad_fn": [6, 454, 479, 481], "loss_fn": [4, 6, 454, 479, 481], "loss_grad_fn": 480, "lot": [480, 481], "low": [248, 252, 408, 449], "low_pad_s": 0, "lower": [183, 184, 186, 187, 192, 237, 248, 251, 252, 296, 408], "lr": [4, 461], "lr_schedul": [468, 469, 470, 472], "lstm": 324, "lto": 2, "lu": 5, "luckili": 483, "lvalu": 299, "m": [0, 2, 5, 8, 90, 140, 163, 168, 189, 296, 326, 327, 355, 356, 455, 479], "m1": [1, 5, 479, 481, 487], "m10": 317, "m7": 317, "m_": [458, 459, 460, 461], "m_t": [458, 
459, 460, 461], "mac": 480, "machin": [5, 7, 8, 466, 480], "maco": [8, 218], "macosx": 8, "made": [5, 319], "mai": [2, 189, 307, 337, 480, 481, 482], "main": [7, 118, 140, 143, 294, 311, 312, 324, 480], "maintain": [337, 338, 461], "major": [0, 2], "make": [1, 2, 3, 5, 6, 8, 204, 227, 268, 324, 468, 469, 471, 472, 479, 483, 485, 487], "make_shar": 2, "malloc_or_wait": 2, "man": 5, "manag": [282, 476, 480, 487], "mani": [2, 83, 275, 330, 331, 332, 333, 334, 335, 340, 380, 479, 480, 483], "manual": 324, "map": [2, 6, 38, 194, 311, 340, 358], "map_fn": [358, 362], "map_torch_to_mlx": 5, "margin": [430, 434], "margin_ranking_loss": 324, "mask": [0, 5, 90, 146, 372, 378, 482], "mask_lh": [0, 90], "mask_n": 1, "mask_nw": 1, "mask_out": [0, 90], "mask_rh": [0, 90], "mask_s": 1, "mask_sw": 1, "matadata": 194, "match": [8, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 211, 365, 400, 423, 480, 482, 484], "materi": [5, 7], "math": [5, 434, 479], "mathbf": 184, "mathcal": 352, "mathemat": 189, "mathrm": [134, 269, 348], "matmul": [0, 163, 487], "matric": [189, 190, 191], "matrix": [0, 4, 14, 45, 90, 116, 117, 140, 141, 163, 164, 168, 169, 183, 184, 186, 187, 188, 189, 190, 191, 192, 204, 208, 237, 238, 245, 380, 381, 406, 449], "matter": [5, 324], "max": [0, 1, 2, 189, 206, 329, 355, 356, 379, 409, 416, 417, 422, 424, 425, 430, 434, 436, 438, 456, 460, 479, 481, 487], "max_": [355, 356], "max_buffer_s": 210, "max_freq": 391, "max_i": 237, "max_norm": 309, "max_recommended_working_set_s": [210, 218], "max_val": 416, "maximum": [0, 6, 26, 38, 93, 108, 213, 217, 309, 324, 351, 384, 391, 412, 413, 418, 437, 452, 483], "maxpool1d": 324, "maxpool2d": 324, "maxtotalthreadsperthreadgroup": 2, "mca": 480, "md": 189, "me": 5, "mean": [0, 1, 4, 5, 6, 144, 244, 245, 246, 299, 324, 328, 344, 363, 382, 407, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 479, 481, 484], "meant": 112, "measur": 487, "mechan": 399, "medic": 338, "meet": 8, "member": 
[2, 324, 368, 373], "memori": [0, 1, 2, 7, 83, 209, 211, 212, 213, 215, 216, 217, 218, 399, 452, 456, 479, 483, 484], "memory_order_relax": 1, "memory_s": [210, 218], "memoryview": [483, 484], "merg": 479, "meshgrid": 0, "metadata": [4, 194, 263, 264], "metal": [2, 7, 143], "metal_captur": 3, "metal_kernel": 1, "metal_path": 8, "metallib": [2, 8], "method": [2, 5, 9, 10, 30, 112, 120, 307, 315, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 369, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 452, 455, 456, 457, 458, 459, 460, 461, 463, 466, 467, 474], "millisecond": [8, 479, 487], "min": [0, 2, 189, 222, 329, 379, 409, 416, 417, 436, 438], "min_freq": 391, "min_i": 237, "min_val": 416, "mind": [2, 5], "mine": 5, "minibatch": 6, "minim": 480, "minimum": [0, 27, 38, 93, 109, 391, 421, 422], "minsizerel": 8, "minu": 139, "minut": 5, "mish": 324, "miss": [365, 486], "mix": 482, "mkdir": [3, 8], "ml": 8, "mlp": [6, 324, 399, 454], "mlp_dim": [5, 399], "mlx": [1, 3, 4, 5, 6, 8, 319, 324, 449, 452, 454, 476, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487], "mlx_build_benchmark": 8, "mlx_build_cpu": 8, "mlx_build_exampl": 8, "mlx_build_gguf": 8, "mlx_build_met": [2, 8], "mlx_build_metallib": 2, "mlx_build_python_bind": 8, "mlx_build_safetensor": 8, "mlx_build_test": 8, "mlx_disable_compil": [119, 132, 479], "mlx_ext": 2, "mlx_ext_metallib": 2, "mlx_include_dir": 2, "mlx_metal_debug": [3, 8], "mlx_metal_jit": 8, "mlx_sample_extens": 2, "mlx_trace": 3, "mnist": 6, "mode": [0, 1, 2, 105, 232, 361, 372, 374, 400, 404, 405, 480], "model": [4, 6, 7, 265, 307, 308, 311, 312, 324, 358, 361, 363, 365, 369, 372, 374, 375, 376, 378, 399, 449, 452, 454, 462, 463, 465, 479, 480, 483], "modest": 2, "modif": 484, "modifi": 484, "modul": [2, 5, 6, 307, 308, 388, 399, 449, 465, 478, 479, 483], 
"moment": [5, 456, 458, 459, 460], "momentum": [328, 461, 463, 467, 479], "monei": 5, "monoton": 435, "more": [1, 2, 3, 6, 10, 78, 118, 163, 183, 184, 186, 187, 188, 191, 192, 204, 216, 217, 263, 264, 317, 324, 328, 337, 386, 391, 399, 400, 402, 403, 404, 405, 421, 476, 479, 480, 481, 482, 485, 487], "most": [2, 241, 324, 465, 479, 480, 481, 482, 483], "move": [0, 2, 223, 487], "moveaxi": 0, "mpi": 319, "mpiexec": 480, "mpirun": 480, "mse": 299, "mse_loss": 324, "mtl": 2, "mtl_capture_en": 3, "mtlcommandbuff": 2, "mu": 467, "much": [1, 2, 5, 326, 327, 355, 356, 479, 483], "multi": [7, 146, 330, 331, 332, 333, 334, 335, 482, 484], "multidimension": 208, "multiheadattent": [5, 324], "multipl": [0, 1, 8, 14, 90, 142, 144, 163, 164, 204, 224, 237, 238, 378, 391, 469, 470, 472, 479, 483, 486], "multipli": [0, 2, 38, 164, 237, 238, 336, 391, 400], "murtadha": 5, "must": [0, 1, 2, 3, 8, 90, 93, 145, 162, 164, 186, 187, 189, 240, 241, 245, 248, 251, 252, 304, 400, 484], "mx": [1, 2, 3, 4, 5, 6, 38, 96, 97, 112, 123, 126, 143, 159, 178, 186, 187, 189, 190, 194, 247, 265, 299, 309, 324, 326, 327, 328, 339, 348, 351, 355, 356, 358, 365, 369, 384, 400, 401, 402, 403, 404, 405, 406, 407, 408, 410, 418, 421, 422, 423, 427, 430, 437, 447, 449, 452, 454, 476, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488], "my": [5, 8], "my_devic": 488, "my_path": 265, "myexp": [1, 143], "myexp_strid": 1, "mymlp": 452, "n": [0, 1, 2, 5, 30, 90, 98, 99, 100, 101, 102, 103, 104, 140, 147, 149, 150, 152, 153, 156, 158, 168, 169, 245, 280, 296, 300, 326, 327, 328, 330, 331, 332, 333, 334, 335, 337, 338, 343, 349, 355, 356, 383, 400, 429, 434, 480], "n_i": [326, 327, 355, 356], "n_t": 343, "naiv": [2, 481], "naive_add": 481, "name": [1, 2, 143, 164, 194, 237, 238, 263, 264, 265, 266, 324, 344, 362, 365, 367, 480, 482, 486], "named_modul": 324, "nan": [0, 16, 82, 172, 173, 175, 225], "nan_to_num": 0, "nanobind": [2, 399], "nanobind_add_modul": 2, "nativ": 8, "natur": [0, 195, 197, 483], "nb": 2, 
"nb_domain": 2, "nb_func": 399, "nb_modul": 2, "nb_static": 2, "nbyte": 2, "nc": 328, "ndarrai": [30, 482, 483, 485], "ndhwc": [332, 335, 338], "ndim": [0, 1, 2, 159, 189, 191, 400], "ne": 1, "nearest": [1, 400], "necessari": 324, "necessarili": 293, "need": [1, 2, 5, 6, 7, 8, 82, 237, 324, 376, 377, 391, 399, 476, 480, 481, 483, 484, 485, 487], "neg": [0, 118, 159, 176, 225, 259, 294, 351, 355, 356, 378, 424, 432, 434, 482], "negat": [0, 226], "negative_slop": [351, 418], "neginf": [0, 225], "neighbor": 400, "neither": [165, 299], "nelem": 2, "nervou": 5, "nest": [78, 94, 313, 324, 452, 478, 481], "nesterov": 467, "network": [5, 7, 328, 337, 340, 402, 403, 449, 452, 466, 480], "neural": [5, 7, 340, 402, 403, 435, 449, 452, 466], "never": [5, 483], "new": [0, 2, 6, 91, 118, 223, 227, 257, 279, 295, 301, 311, 312, 370, 378, 452, 454, 465, 470, 479, 482, 483, 484], "new_tre": 312, "next": [2, 5, 6, 216], "nh": [343, 349, 383], "nhwc": [328, 331, 334], "nice": [481, 483], "nlc": [328, 330, 333], "nld": [343, 349, 383], "nlh": [343, 349, 383], "nll": [424, 432], "nll_loss": 324, "nn": [2, 5, 6, 265, 311, 324, 449, 452, 454, 463, 465, 479, 483], "nobodi": 5, "node": [94, 136, 303, 312, 313], "nois": 4, "noisi": 4, "nomins": 2, "non": [0, 1, 2, 8, 208, 373, 383, 435, 452], "none": [1, 2, 5, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 119, 121, 122, 125, 126, 127, 128, 129, 130, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 180, 181, 182, 183, 
184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 215, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 264, 265, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 303, 304, 305, 306, 307, 310, 311, 312, 313, 315, 326, 327, 341, 355, 356, 358, 362, 363, 370, 375, 378, 383, 391, 399, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 456, 474, 482], "nonlinear": [383, 479], "nonzero": 482, "noop": 375, "nor": [2, 165, 299], "norm": [5, 144, 309, 344, 434, 460, 461], "norm1": 5, "norm2": 5, "norm_first": 399, "normal": [1, 2, 4, 5, 142, 143, 144, 186, 245, 251, 324, 326, 327, 328, 344, 348, 350, 355, 356, 382, 399, 402, 404, 484, 487], "not_equ": 0, "notabl": [5, 7], "notat": [116, 141, 310, 367], "note": [0, 1, 2, 5, 8, 16, 18, 83, 90, 94, 100, 103, 104, 112, 146, 154, 155, 164, 172, 189, 211, 237, 241, 301, 307, 324, 382, 400, 454, 484, 486], "noth": [5, 324, 483], "notic": [5, 481, 486], "now": [1, 2, 5, 8, 381, 479, 480, 484], "np": [1, 5, 6, 480, 484, 485], "npy": [194, 262, 486], "npz": [5, 194, 265, 266, 365, 369, 486], "nuclear": 189, "nullopt": 0, "num": [0, 5, 193, 250], "num_class": [6, 454], "num_decoder_lay": 399, "num_embed": [340, 380], "num_encoder_lay": 399, "num_epoch": [6, 454], "num_exampl": 4, "num_featur": [4, 328], "num_group": 344, "num_head": [5, 378, 399], "num_it": 4, "num_lay": [5, 6, 454], "num_param": 324, "num_paramet": 379, "num_sampl": 241, "num_split": 0, "number": [0, 2, 11, 18, 61, 70, 94, 99, 100, 101, 103, 104, 116, 140, 141, 164, 165, 169, 179, 193, 225, 232, 237, 238, 241, 244, 246, 250, 252, 256, 259, 260, 291, 292, 296, 299, 302, 303, 307, 324, 328, 
330, 331, 332, 333, 334, 335, 337, 338, 344, 348, 378, 379, 399, 400, 402, 403, 404, 405, 468, 470, 471, 476, 479, 481, 488], "number_of_el": 0, "numer": [5, 142, 144, 189, 199, 203, 273, 328, 344, 348, 350, 382, 421, 422, 424, 434, 455, 456, 457, 458, 459, 460, 466, 479, 483], "numpi": [2, 5, 6, 7, 13, 16, 18, 87, 88, 89, 91, 128, 129, 133, 166, 167, 172, 180, 181, 182, 199, 204, 206, 222, 224, 228, 234, 255, 258, 283, 483, 485, 486], "nw": 1, "nwhc": 337, "o": [2, 8, 146, 349], "o_t": 349, "obj": 263, "object": [3, 10, 30, 50, 78, 94, 143, 178, 265, 303, 310, 311, 312, 313, 317, 337, 399, 478], "observ": 5, "occupi": [116, 141, 164, 237, 238], "occur": 484, "odim": 6, "odot": [343, 349], "off": [5, 8, 483], "offer": 429, "offset": [0, 1, 2, 5, 46, 83, 118, 142, 145, 294], "often": 338, "ok": [365, 481], "okai": [479, 483], "old": 5, "omit": [458, 460, 480], "onc": [2, 8, 479], "one": [0, 2, 5, 8, 38, 78, 84, 93, 99, 100, 101, 103, 104, 138, 140, 142, 144, 145, 189, 197, 204, 238, 241, 278, 283, 317, 375, 400, 423, 480, 487], "ones": [0, 2, 5, 230, 265, 296, 376, 377, 454, 480, 482], "ones_lik": 0, "onli": [1, 2, 5, 7, 8, 82, 90, 99, 100, 101, 103, 104, 186, 187, 189, 218, 237, 245, 301, 324, 362, 363, 365, 370, 372, 375, 376, 377, 452, 479, 480, 481, 486, 487], "onlin": 457, "op": [1, 2, 231, 301, 363, 483], "open": [3, 8, 18, 248, 252], "openmpi": 480, "oper": [3, 5, 7, 9, 37, 84, 85, 86, 101, 146, 163, 164, 234, 236, 273, 281, 288, 315, 324, 399, 461, 479, 480, 481, 482, 483, 484, 485, 487, 488], "operand": [130, 131, 163], "opportun": 479, "opt": [462, 480], "optim": [1, 3, 4, 6, 7, 376, 479, 480, 481, 483], "option": [0, 3, 5, 14, 15, 17, 18, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 83, 84, 85, 86, 90, 94, 95, 98, 99, 100, 101, 102, 103, 104, 105, 108, 109, 110, 111, 112, 116, 117, 118, 121, 122, 123, 125, 126, 127, 140, 
141, 142, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 162, 163, 164, 165, 169, 176, 177, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 203, 205, 207, 208, 217, 221, 225, 229, 232, 233, 235, 237, 238, 240, 241, 242, 244, 245, 246, 247, 248, 250, 251, 252, 256, 257, 259, 273, 274, 275, 278, 279, 280, 284, 286, 287, 291, 293, 294, 295, 296, 297, 298, 299, 300, 303, 305, 307, 310, 311, 312, 313, 326, 327, 328, 330, 331, 332, 333, 334, 335, 343, 349, 352, 355, 356, 358, 362, 363, 365, 370, 375, 378, 380, 381, 383, 386, 391, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 455, 456, 457, 458, 459, 460, 461, 463, 466, 467, 468, 476, 479, 486, 488], "ord": 189, "order": [0, 1, 28, 83, 101, 131, 186, 187, 189, 233, 237, 293, 324, 344, 376, 388, 463, 479, 481], "ordinari": 171, "org": [344, 348, 350, 357, 382, 413, 435], "origin": [5, 118, 309, 328, 371, 402, 403, 404, 405, 455, 456, 457, 458, 460, 461, 484], "orthonorm": 168, "ostream": 2, "ostringstream": 2, "other": [0, 2, 5, 7, 178, 189, 324, 364, 452, 461, 479, 480, 482, 483, 485], "other_input": 324, "otherwis": [18, 101, 123, 217, 247, 307, 310, 311, 312, 313, 363, 365, 375, 397, 399, 400, 415, 421, 426, 433, 445, 446, 483, 484], "our": [1, 2, 5, 6, 388, 455, 456, 457, 458, 460, 461, 480], "out": [0, 1, 2, 8, 90, 143, 326, 327, 337, 338, 355, 356, 372, 479, 480, 481, 482], "out_ax": [303, 481], "out_channel": [330, 331, 332, 333, 334, 335], "out_dim": [324, 452], "out_dtyp": 2, "out_idx": 2, "out_mask": 90, "out_proj": [5, 452], "out_ptr": 2, "out_shap": [1, 2], "outer": [0, 479, 483], "outlier": 429, "output": [0, 1, 2, 5, 8, 15, 16, 17, 18, 28, 83, 90, 91, 94, 96, 97, 108, 109, 110, 111, 112, 130, 140, 142, 143, 144, 145, 146, 153, 156, 157, 158, 162, 163, 165, 168, 169, 172, 189, 193, 203, 205, 207, 208, 221, 225, 229, 230, 233, 235, 236, 240, 241, 242, 244, 245, 246, 248, 251, 252, 265, 266, 273, 
278, 280, 284, 288, 294, 296, 299, 300, 301, 302, 303, 304, 305, 306, 326, 327, 328, 330, 331, 332, 333, 334, 335, 348, 352, 355, 356, 378, 381, 397, 399, 400, 402, 403, 404, 405, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 446, 449, 479, 480, 481, 482, 483, 484, 485, 486, 487], "output_dim": [6, 324, 352, 381], "output_directori": 2, "output_dtyp": [1, 143], "output_fil": 5, "output_nam": [1, 143], "output_shap": [1, 143], "outsid": [143, 159], "over": [0, 2, 5, 6, 15, 17, 26, 27, 28, 29, 98, 99, 100, 101, 102, 103, 104, 108, 109, 110, 111, 149, 152, 155, 158, 171, 189, 191, 193, 203, 205, 207, 221, 233, 235, 261, 273, 274, 280, 284, 291, 293, 300, 328, 330, 331, 332, 333, 334, 335, 344, 350, 382, 423, 468, 471, 480, 481], "overal": 2, "overhead": [479, 483, 487], "overlap": 1, "overload": 18, "overrid": [2, 132], "overview": 3, "overwrit": 5, "own": [8, 484], "owndata": 484, "p": [8, 240, 324, 336, 337, 338, 434, 458, 460], "pack": [164, 237, 238], "packag": [2, 4, 6, 8, 319, 449, 480], "package_data": 2, "pad": [0, 1, 98, 99, 100, 101, 102, 103, 104, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 326, 327, 330, 331, 332, 333, 334, 335, 355, 356], "pad_valu": 0, "pad_width": [0, 232], "padding_hi": 0, "padding_lo": 0, "page": 485, "pain": 5, "pair": [0, 2, 232, 365, 386], "pairwis": 434, "pan": 5, "paper": [328, 391, 455, 456, 457, 458, 460, 461], "parallel": [480, 487], "param": [299, 324, 449, 481], "paramet": [0, 1, 2, 4, 5, 6, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 37, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 
179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 216, 217, 218, 219, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 348, 349, 350, 351, 352, 355, 356, 358, 359, 362, 363, 365, 370, 371, 372, 375, 376, 377, 378, 379, 380, 381, 382, 383, 386, 388, 391, 395, 397, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 414, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 446, 448, 449, 452, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 465, 466, 467, 468, 469, 470, 471, 472, 474, 479, 480, 481, 483], "parameter_scal": 456, "parametr": [379, 436], "pars": 5, "parse_arg": 5, "parser": 5, "part": [1, 2, 170, 253, 481, 482], "partial": [376, 377, 479, 483], "particip": [121, 122, 125, 126, 127], "particular": [237, 344], "particularli": 479, "partit": [0, 28], "pass": [1, 2, 5, 6, 65, 79, 231, 232, 299, 308, 310, 311, 312, 324, 363, 375, 376, 377, 388, 479, 480, 483], "password": 480, "path": [3, 8, 131, 219, 265, 266, 307, 312, 365, 480], "pattern": [324, 483], "peak": [213, 215], "penalti": 467, "pep": 484, "per": [5, 6, 116, 141, 164, 237, 238, 307, 328, 344, 348, 350, 382, 474, 479, 480, 483], "perceptron": 7, "perf_count": 479, "perfectli": 483, "perform": [0, 1, 2, 3, 5, 7, 14, 90, 101, 108, 109, 110, 111, 127, 130, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 163, 164, 168, 186, 187, 204, 238, 260, 273, 287, 324, 344, 399, 
404, 405, 454, 479, 480, 482, 483, 487], "perhap": [2, 5], "perm": 6, "permtuat": 247, "permut": [0, 6], "persist": 8, "pg": 189, "phi": [341, 411], "physic": 480, "pi": [134, 341, 391, 412, 481], "pick": 2, "pip": [2, 8], "pipelin": 2, "pixel": 337, "place": [2, 5, 38, 259, 260, 307, 480, 483, 484], "placehold": 479, "plai": [2, 5], "plain": 388, "plan": [2, 479], "platform": 8, "plu": [0, 197], "point": [0, 2, 4, 5, 8, 83, 161, 238, 317], "pointer": 2, "pool": [326, 327, 355, 356, 487], "popul": 2, "portion": 336, "posinf": [0, 225], "posit": [0, 5, 28, 118, 145, 159, 165, 177, 183, 184, 223, 225, 233, 245, 259, 294, 299, 311, 324, 330, 331, 332, 333, 334, 335, 378, 386, 391, 424, 434], "possibl": [275, 340, 380, 479, 480, 482, 487], "possibli": [5, 14, 90, 163, 204, 309], "postur": 5, "potenti": 217, "power": [0, 481, 484], "practic": [2, 479], "pre": [8, 146, 421], "preced": 344, "precis": [0, 2, 5, 139, 146, 324, 341, 382, 421, 462, 479], "preclud": 324, "pred": [425, 429], "predic": [307, 370], "predict": [421, 424, 425, 426, 427, 428, 429, 431, 432, 433], "prefix": [303, 310], "prelu": 324, "prepar": [2, 5], "prepend": [3, 204], "preprint": [5, 455, 461], "preprocessor": 8, "present": 1, "preserv": [257, 481], "press": [5, 189], "pressur": 2, "pretti": [479, 483], "prevent": [281, 434, 484], "previou": [216, 217, 218], "primal": [1, 2, 112, 179, 302], "primit": 481, "print": [1, 2, 4, 5, 6, 8, 309, 310, 311, 312, 314, 324, 476, 479, 480, 481, 482, 483, 484, 485], "prior": [236, 287, 288], "priorit": 481, "privat": 2, "prng": [240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 476], "prob": 421, "probabl": [8, 248, 336, 337, 338, 381, 421, 423, 427, 487], "problem": [4, 6, 324], "process": [5, 101, 105, 120, 121, 122, 123, 125, 126, 127, 311, 312, 338, 340, 399, 478, 480], "processor": 8, "prod": [0, 1], "produc": [0, 2, 8, 94, 378, 449], "product": [0, 2, 14, 83, 110, 171, 179, 185, 204, 231, 235, 291, 302, 378, 485], "profil": 3, "program": 
[213, 480], "programmat": 377, "project": [3, 5, 378], "project_source_dir": 2, "promot": 2, "promote_typ": 2, "promoted_dtyp": 2, "prompt": 5, "propag": [481, 482], "properti": [31, 38, 47, 51, 61, 62, 68, 70, 371, 374, 464, 481], "proportion": 309, "protocol": 484, "provid": [0, 2, 5, 83, 116, 141, 165, 247, 259, 291, 299, 311, 313, 319, 324, 358, 363, 365, 375, 376, 377, 380, 381, 399, 400, 448, 452, 480, 486, 488], "pseudo": 476, "pth": 5, "public": [2, 324], "pun": 0, "pure": [1, 324, 454], "purpos": [1, 189], "purs": 5, "push": 2, "push_back": 2, "put": [0, 1, 6, 236, 479, 480], "put_along_axi": 0, "py": [2, 5, 8, 480], "pypi": 8, "python": [1, 3, 5, 50, 68, 78, 136, 310, 311, 312, 313, 314, 452, 462, 463, 465, 478, 480, 481, 484], "python_requir": 2, "pytorch": [5, 7, 341, 344, 481], "pytorch_compat": 344, "q": [146, 190], "qualifi": 480, "quantiz": [0, 116, 141, 164, 194, 238, 380, 381], "quantized_matmul": 0, "quantizedembed": 324, "quantizedlinear": 324, "quarter": 5, "queri": [5, 146, 218, 378], "query_input_dim": 378, "query_proj": 5, "question": [5, 483], "queue": 3, "quick": [2, 7], "quit": [481, 484], "quotient": [0, 128, 129, 161], "r": [2, 5, 190, 299, 337, 343], "r_t": 343, "race": 487, "radian": [0, 115], "rag": 5, "rain": 5, "rais": [0, 5, 189, 217, 234, 275, 365], "ram": 5, "random": [1, 2, 3, 4, 5, 6, 7, 143, 326, 327, 328, 348, 355, 356, 365, 372, 479, 481, 487, 488], "randomli": [4, 5, 247, 336, 337, 338], "rang": [0, 2, 3, 4, 5, 6, 8, 18, 159, 163, 193, 403, 405, 412, 413, 454, 468, 469, 470, 471, 472, 476, 479, 481, 483, 487], "rank": [0, 125, 126, 127, 430, 480], "rate": [4, 454, 455, 456, 457, 458, 459, 460, 461, 466, 467], "rather": [2, 481, 487], "ratio": [0, 24], "rceil": 90, "re": [6, 8, 449], "readabl": 3, "readi": 2, "real": [0, 153, 154, 155, 156, 157, 158, 183, 184, 186, 187], "realli": 350, "reason": [1, 5, 482], "reboot": 8, "receiv": [125, 126, 307, 470, 484], "reciproc": [0, 261], "reclaim": 216, "recommend": [8, 217, 461], 
"recompil": [94, 479], "record": [3, 213, 483], "recreat": [314, 454], "rectifi": [351, 384, 385, 404, 405, 418, 437, 438], "recurr": [343, 349, 383], "recurs": [324, 362, 363, 368, 373, 375, 452], "recv": 126, "redirect": 2, "reduc": [0, 1, 8, 15, 17, 26, 27, 122, 203, 205, 207, 221, 235, 280, 284, 300, 313, 328, 399, 429], "reduct": [15, 17, 122, 203, 205, 221, 235, 313, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 480], "redund": 481, "refer": [189, 348, 357, 371, 402, 403, 404, 405, 413, 435, 482], "reflect": [371, 479, 482, 484], "regard": 341, "regardless": [83, 146], "regist": [2, 6], "register_librari": 2, "regress": [7, 429], "regular": [38, 337, 435, 459, 479, 482], "regularli": 2, "reimplement": 2, "rel": [16, 172, 456, 479], "relative_step": 456, "relax": 217, "relev": 2, "reli": [1, 2], "relu": [324, 379, 399, 436, 449], "relu6": 324, "remain": [0, 5, 218, 299, 312, 336, 337, 338, 480], "remaind": [0, 129], "remov": [0, 118, 204, 241, 278, 423], "rep": [0, 292], "repeat": [0, 292], "repeatedli": 4, "repetit": 256, "replac": [0, 5, 225, 376, 377, 399, 433], "replai": 3, "repli": 5, "repo": [4, 6, 8, 479], "report": [211, 217], "repres": [2, 5, 120, 123, 164, 430, 434, 484], "represent": [5, 237, 301, 310, 314], "request": 2, "requir": [1, 2, 5, 324, 480, 483, 484], "requires_grad": 481, "rerun": [479, 483], "rescal": 309, "research": 7, "reset": 215, "reset_peak_memori": 213, "reshap": [0, 5, 189, 400, 482], "resid": 218, "resolv": 2, "resourc": 2, "respect": [2, 4, 6, 142, 144, 163, 164, 165, 237, 299, 311, 324, 328, 341, 344, 348, 350, 452, 481, 485], "respons": 2, "rest": [5, 145, 311, 312, 386], "restart": 8, "restor": 259, "result": [0, 5, 14, 18, 38, 78, 83, 94, 142, 144, 164, 189, 204, 238, 245, 256, 279, 311, 312, 313, 391, 421, 479, 481, 484], "resum": 5, "return": [0, 1, 2, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 37, 50, 68, 78, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 
94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 212, 216, 217, 218, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 260, 261, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 324, 343, 349, 358, 359, 360, 362, 363, 364, 365, 366, 367, 368, 372, 373, 375, 376, 377, 383, 401, 402, 403, 404, 405, 406, 407, 408, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 449, 452, 462, 478, 479, 480, 481, 482, 483, 484, 486, 487], "return_metadata": 194, "revers": [0, 2, 41, 42, 43, 44, 83, 108, 109, 110, 111, 295, 391], "rf": 8, "rfft": 153, "rfft2": 154, "rfftn": 155, "rfloor": [326, 327, 355, 356], "rho": 455, "rhs_indic": [0, 163, 164], "rhs_mask": 90, "right": [0, 1, 2, 8, 237, 258, 259, 326, 327, 341, 355, 356, 400, 412, 413, 424, 426, 434], "right_shift": 0, "rm": [5, 8, 144, 456], "rmsnorm": [5, 324], "rmsprop": 454, "rnn": [324, 343], "roadcast": 248, "robust": 429, "roform": [5, 386], "roll": 0, "root": [0, 5, 144, 261, 276, 382], "rope": [5, 324], "rosetta": 8, "rotari": [5, 145, 386], "rotat": [145, 386], "round": [0, 237], "routin": 2, "row": [0, 1, 2, 83, 140, 143, 169, 237, 296], "row_contigu": 2, "rpath": 2, "rsqrt": 0, "rtol": [0, 16, 172], "rule": [2, 
454], "run": [1, 2, 3, 5, 6, 7, 8, 9, 143, 231, 315, 328, 358, 455, 456, 458, 459, 460, 479, 480, 483, 487, 488], "runtim": [5, 123, 319, 479, 480], "runtime_error": 2, "safetensor": [8, 194, 264, 365, 369, 454, 483, 486], "sai": [2, 5, 449, 483], "said": 5, "sake": 481, "same": [0, 2, 5, 8, 16, 38, 82, 91, 94, 99, 100, 101, 103, 104, 105, 121, 142, 144, 153, 156, 157, 158, 164, 165, 172, 179, 232, 241, 259, 260, 301, 302, 304, 312, 324, 327, 328, 336, 344, 348, 356, 380, 401, 402, 403, 404, 405, 406, 407, 408, 423, 434, 452, 462, 476, 479, 480, 482, 487], "sampl": [2, 4, 5, 193, 240, 241, 242, 244, 245, 248, 251, 252, 402, 403, 404, 405, 407, 408, 424, 430, 434, 476, 479], "sat": 5, "save": [3, 5, 7, 194, 219, 237, 263, 264, 265, 266, 369, 483], "save_gguf": 486, "save_safetensor": [369, 454, 486], "save_weight": 324, "savez": [5, 369, 486], "savez_compress": 486, "saw": [5, 481], "scalar": [0, 2, 13, 14, 16, 30, 50, 78, 82, 87, 88, 89, 90, 91, 93, 128, 129, 133, 161, 162, 165, 166, 167, 168, 172, 180, 181, 182, 193, 199, 200, 201, 202, 204, 206, 222, 224, 225, 228, 232, 234, 240, 248, 251, 252, 255, 258, 263, 283, 299, 301, 304, 308, 434, 481, 483, 485], "scale": [0, 2, 5, 14, 116, 141, 142, 144, 145, 146, 164, 168, 237, 238, 244, 246, 309, 337, 338, 350, 378, 386, 387, 391, 400, 439, 456], "scale_arr": 2, "scale_factor": 400, "scale_paramet": 456, "scatter": 0, "scatter_add": 0, "scatter_max": 0, "scatter_min": 0, "scatter_prod": 0, "schedul": [2, 217, 454, 468, 469, 470, 471, 472, 474, 487], "schema": 3, "scipi": 168, "scope": 324, "score": [5, 146, 430], "sdk": 8, "se": 1, "second": [5, 8, 118, 178, 180, 200, 202, 204, 258, 285, 294, 299, 327, 356, 422, 430, 456, 458, 459, 460, 481, 487], "second_layer_a": 483, "second_layer_b": 483, "secret": 5, "section": [1, 5, 8, 275, 434, 479, 480, 481], "see": [1, 2, 5, 6, 8, 10, 11, 32, 33, 34, 35, 36, 39, 40, 41, 42, 43, 44, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 
77, 79, 80, 81, 189, 216, 263, 264, 307, 317, 324, 328, 329, 337, 339, 341, 345, 346, 347, 353, 354, 361, 379, 380, 381, 384, 385, 386, 387, 389, 391, 392, 393, 394, 395, 396, 398, 400, 402, 403, 404, 405, 411, 412, 413, 439, 479, 480, 481, 482, 485, 487], "seed": 243, "seen": 484, "select": [0, 3, 8, 186, 187, 293, 304, 358, 362, 370], "self": [5, 6, 9, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 112, 315, 324, 435, 452], "selu": 324, "semant": [13, 87, 88, 89, 91, 128, 129, 133, 166, 167, 180, 181, 182, 199, 204, 206, 222, 224, 228, 234, 255, 258, 283, 487], "semi": [183, 184, 245], "send": 480, "sennrich": 5, "sensit": 429, "sentencepiec": 5, "separ": [5, 65, 79, 344, 430], "sequenc": [5, 15, 17, 33, 34, 56, 57, 58, 59, 63, 71, 74, 75, 76, 80, 83, 91, 101, 125, 138, 143, 148, 149, 151, 152, 154, 155, 157, 158, 162, 203, 205, 207, 221, 229, 235, 240, 241, 242, 244, 245, 246, 248, 251, 252, 257, 273, 275, 278, 280, 284, 291, 292, 295, 300, 305, 328, 330, 333, 343, 349, 383, 399, 476, 487], "sequenti": [324, 449], "seri": 8, "serial": 454, "set": [2, 5, 6, 8, 94, 112, 119, 121, 122, 123, 125, 126, 127, 132, 142, 145, 210, 216, 217, 218, 267, 268, 282, 341, 350, 352, 361, 363, 370, 371, 372, 375, 376, 381, 386, 397, 422, 434, 446, 452, 454, 456, 463, 476, 481, 483], "set_data": 2, "set_default_devic": 2, "set_dtyp": 324, "set_input_arrai": 2, "set_memory_limit": 216, "set_output_arrai": 2, "setbyt": 2, "setcomputepipelinest": 2, "setup": [2, 4, 6, 8, 479], "sever": [5, 8, 98, 99, 100, 101, 102, 103, 104, 265, 266, 479, 486], "sgd": [4, 6, 454, 461, 463, 468, 469, 472, 479], "shade": [1, 2], "shall": 5, "shape": [0, 2, 3, 5, 6, 65, 82, 83, 90, 91, 94, 98, 99, 100, 101, 102, 103, 104, 118, 121, 125, 126, 143, 146, 147, 150, 153, 156, 157, 158, 162, 163, 168, 179, 188, 192, 204, 229, 230, 240, 241, 242, 244, 245, 246, 248, 251, 252, 
257, 259, 301, 302, 304, 305, 306, 324, 326, 327, 328, 330, 331, 332, 333, 334, 335, 337, 338, 343, 348, 349, 352, 355, 356, 365, 383, 401, 402, 403, 404, 405, 406, 407, 408, 423, 434, 454, 479, 481, 482, 485, 487], "shapeless": [0, 94], "share": [7, 116, 141, 164, 237, 238, 301, 480], "shazeer": 5, "shift": [0, 180, 258, 259, 328], "shop": 5, "should": [1, 2, 4, 5, 6, 8, 83, 118, 121, 142, 143, 144, 146, 179, 209, 218, 219, 236, 237, 288, 294, 299, 302, 307, 310, 324, 330, 331, 332, 333, 334, 335, 337, 338, 372, 378, 388, 423, 425, 430, 452, 478, 479, 480, 481, 483, 484, 488], "show": [8, 317, 479], "shown": 2, "shuffl": 6, "side": [0, 232, 326, 327, 355, 356, 479], "sigma": [341, 342, 343, 349, 390, 402, 403, 404, 405, 413, 414, 419, 440, 441], "sigmoid": [0, 5, 324, 353, 389, 413, 419, 421, 441], "sign": [0, 16, 172, 317, 461], "signal": [105, 400], "signatur": [1, 143], "signedinteg": [11, 178], "signific": 237, "silent": [156, 157, 158], "silicon": [2, 5, 7, 8, 487], "silu": 324, "simd": 1, "simd_sum": 1, "simdgroup": 1, "simdgroup_s": 1, "similar": [5, 164, 178, 311, 376, 377, 378, 422, 484, 486], "similarli": [2, 8, 204, 481, 483], "simpl": [2, 5, 6, 324, 340, 448, 454, 479, 480, 481, 483], "simple_axpbi": 2, "simple_tim": 2, "simplest": [2, 324, 480], "simpli": [2, 5, 8, 339, 351, 384, 410, 418, 437, 447, 452, 479, 480, 481], "simplic": 0, "simultan": 1, "sin": [0, 112, 391, 481, 485], "sinc": [1, 2, 5, 6, 164, 213, 452, 461, 470, 484, 487], "sine": [0, 21, 22, 271, 272, 481], "sing": 189, "singer": 457, "singl": [2, 6, 136, 179, 194, 208, 232, 302, 327, 356, 479, 482, 486], "singleton": [0, 15, 17, 26, 27, 123, 203, 204, 205, 207, 221, 235, 280, 284, 300], "singular": [189, 191], "sinh": 0, "sinusoid": 391, "sinusoidalpositionalencod": 324, "size": [0, 1, 2, 5, 6, 51, 68, 90, 99, 100, 103, 104, 116, 138, 141, 142, 143, 144, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 162, 164, 168, 169, 178, 185, 189, 212, 217, 218, 237, 238, 241, 257, 275, 
278, 301, 307, 324, 326, 327, 330, 331, 332, 333, 334, 335, 340, 348, 355, 356, 380, 381, 400, 456, 480, 483, 484], "size_in_megabyt": 218, "size_t": [0, 2], "sizeof": 2, "skip": [3, 83], "slice": [0, 482], "slice_s": 0, "slice_upd": 0, "slight": [5, 483], "slightli": [386, 487], "slope": 351, "slot": 480, "slow": 479, "slowli": 5, "small": [5, 139, 142, 144, 328, 344, 350, 382, 424, 429, 434, 479, 480, 487], "smaller": [0, 8, 233, 461, 479], "smallest": 189, "smile": 5, "smooth": [423, 433, 466], "smooth_l1_loss": 324, "sned": 127, "snippet": 480, "so": [1, 2, 5, 8, 165, 168, 299, 336, 400, 454, 479, 480, 483, 487], "softmax": [0, 5, 146, 324, 354, 420, 423], "softmin": 324, "softplu": [324, 357, 435], "softshrink": 324, "softsign": 324, "solv": 324, "some": [0, 2, 4, 5, 6, 363, 375, 454, 463, 479, 481, 483], "someon": 5, "someth": [4, 5, 482], "sonoma": 8, "soon": 5, "sort": [0, 28, 29, 233, 293], "sourc": [0, 1, 2, 3, 60, 125, 126, 143, 223, 295, 480], "space": [0, 2, 193, 421, 432], "spars": [0, 208], "spatial": [99, 100, 101, 103, 104, 326, 344, 355, 400], "speak": [5, 189], "special": 2, "specif": [1, 2, 8, 480, 481], "specifi": [0, 2, 18, 37, 99, 100, 101, 103, 104, 118, 154, 155, 162, 165, 185, 189, 193, 223, 229, 236, 241, 256, 285, 287, 288, 291, 294, 295, 299, 303, 305, 328, 397, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 446, 480, 481, 487], "speed": [1, 2], "spent": 5, "split": [0, 342, 344, 414], "splittabl": 476, "sqrt": [0, 5, 134, 146, 168, 328, 341, 344, 348, 350, 352, 382, 391, 402, 403, 404, 405, 412, 455, 457, 458, 459, 466, 479], "squar": [0, 4, 5, 144, 169, 188, 192, 261, 276, 299, 311, 324, 382, 431, 433, 455, 456, 458, 459, 460, 481, 484], "squeez": [0, 400, 479], "src": [0, 125, 126], "ssh": 480, "stabil": [142, 144, 328, 344, 348, 350, 382, 421, 422, 424, 455, 456, 457, 458, 459, 460, 466], "stabl": [199, 203, 273, 429], "stable_abi": 2, "stack": [0, 479], "standard": [0, 1, 6, 50, 78, 204, 242, 246, 280, 399, 
402, 404, 407, 480, 485], "starmap": [5, 311], "start": [0, 1, 2, 4, 5, 7, 8, 18, 145, 193, 219, 275, 313, 479, 482, 487], "start_axi": [0, 49, 159], "start_captur": 3, "state": [5, 6, 324, 343, 349, 383, 454, 463, 476, 479], "static": 8, "static_cast": 2, "std": [0, 2, 407], "step": [0, 3, 5, 6, 18, 324, 343, 349, 383, 456, 463, 468, 470, 471, 472, 479, 480], "step_decai": 454, "step_siz": 472, "still": [5, 8, 189, 479, 483], "stochast": [457, 458, 460, 467, 483], "stood": 5, "stop": [0, 2, 5, 18, 193, 220, 281, 481, 482], "stop_captur": 3, "stop_gradi": [0, 481], "storag": 83, "store": 5, "str": [2, 105, 130, 131, 143, 165, 186, 187, 189, 194, 208, 210, 219, 262, 263, 264, 265, 266, 299, 310, 314, 358, 359, 362, 363, 365, 367, 369, 375, 400, 404, 405, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434], "straight": 5, "strang": 5, "stream": [2, 7, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 114, 115, 116, 117, 118, 121, 122, 125, 126, 127, 128, 129, 130, 133, 134, 135, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 244, 245, 246, 247, 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 
297, 298, 300, 301, 304, 305, 306, 480, 487], "streamcontext": 282, "streamordevic": [0, 2], "street": 5, "strength": [461, 467], "strict": [123, 166, 181, 363, 365, 375], "strictli": [189, 218], "stride": [0, 2, 83, 98, 99, 100, 101, 102, 103, 104, 326, 327, 330, 331, 332, 333, 334, 335, 355, 356, 386, 482], "string": [0, 2, 131, 143, 210, 232, 484, 486], "structur": [2, 462, 481], "stub": 8, "style": [2, 13, 16, 87, 88, 89, 128, 129, 133, 166, 167, 172, 180, 181, 182, 199, 204, 206, 222, 224, 228, 234, 255, 258, 283], "su": 5, "sub": [0, 6, 118, 250, 294, 307], "subarrai": [118, 275], "subclass": 452, "subdivid": 1, "subdtyp": 178, "subgradi": 457, "sublinear": 456, "submodul": [5, 6, 324, 359, 363, 364, 375, 377], "subscript": [130, 131], "subsect": 5, "subsequ": 454, "subset": [324, 362], "substanti": 8, "subtract": [0, 38], "subtyp": [178, 317], "sudo": [8, 218], "sum": [0, 2, 4, 13, 111, 122, 171, 189, 203, 273, 291, 294, 324, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 480, 482, 484], "sum_": [189, 326, 327, 429], "sum_i": 420, "sum_j": [442, 443], "summat": [130, 131], "super": [5, 6, 324, 452], "superset": [311, 462], "support": [1, 2, 5, 7, 8, 16, 90, 100, 103, 104, 146, 159, 168, 172, 183, 184, 186, 187, 188, 190, 191, 192, 194, 204, 237, 245, 480, 481, 482, 484, 486], "suppos": [481, 487], "sure": [2, 3, 5, 8, 324, 479], "surpass": [404, 405], "surpris": 5, "sw": 1, "swap": [0, 105, 217, 285, 377], "swapax": [0, 112], "swiglu": 5, "swish": [389, 441], "switch": 8, "symbol": 461, "symmetr": [99, 100, 103, 104, 183, 184, 186, 187], "symmetri": [186, 187], "synchron": [2, 479], "syntax": [38, 482], "synthet": 4, "sysctl": 218, "system": [5, 8, 210, 211, 212, 218], "t": [0, 1, 2, 5, 8, 134, 143, 146, 164, 183, 184, 238, 299, 324, 326, 343, 349, 355, 383, 455, 456, 457, 458, 459, 460, 461, 466, 467, 479, 481, 487], "tabl": [1, 189, 317, 340], "take": [0, 2, 5, 6, 87, 88, 89, 94, 163, 165, 179, 206, 222, 230, 238, 288, 299, 302, 303, 
306, 312, 313, 378, 421, 476, 480, 481, 482, 486, 487, 488], "take_along_axi": [0, 482], "taken": [118, 287, 294], "talk": 480, "tan": 0, "tangent": [0, 2, 23, 24, 25, 112, 179, 289, 290, 398, 447], "tangent_i": 2, "tangent_x": 2, "tanh": [0, 324, 341, 343, 349, 357, 383, 412, 435], "target": [2, 299, 421, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 479], "target_include_directori": 2, "target_link_librari": 2, "target_link_opt": 2, "target_sourc": 2, "task": [217, 429], "tau": 467, "tcp": 480, "tell": [5, 479, 484], "temp": 5, "templat": [0, 1, 2, 143], "ten": [481, 483], "tend": 461, "tensor": [194, 291, 326, 327, 355, 356, 434, 484], "tensordot": 0, "term": [2, 424, 455, 456, 457, 458, 459, 460, 466], "termin": 8, "test": [6, 8, 480], "test_imag": 6, "test_label": 6, "text": [5, 326, 327, 341, 343, 349, 355, 356, 357, 383, 390, 397, 402, 403, 404, 405, 412, 415, 416, 417, 424, 425, 426, 429, 430, 433, 435, 436, 439, 440, 445, 446, 456, 461], "textrm": [237, 341, 342, 411, 414], "tf": 484, "tgp_size": 2, "th": [108, 109, 110, 111, 117, 140, 186, 470], "than": [1, 2, 5, 78, 105, 118, 129, 145, 163, 166, 167, 181, 182, 183, 184, 186, 187, 188, 191, 192, 204, 216, 218, 309, 311, 386, 397, 400, 430, 433, 446, 456, 461, 479, 481, 487], "thank": 483, "thei": [1, 2, 4, 5, 8, 16, 105, 164, 172, 388, 425, 452, 461, 478, 479, 480, 483, 485, 486, 487], "them": [0, 2, 5, 121, 324, 363, 375, 480, 487], "themselv": [2, 479], "thi": [0, 1, 2, 5, 6, 8, 15, 16, 17, 18, 26, 27, 28, 29, 83, 112, 132, 143, 163, 164, 168, 172, 179, 183, 184, 186, 187, 188, 189, 190, 191, 192, 199, 203, 204, 205, 207, 209, 211, 218, 221, 233, 235, 241, 268, 273, 274, 275, 280, 284, 287, 293, 300, 309, 312, 313, 324, 336, 337, 338, 342, 343, 349, 359, 360, 362, 363, 366, 367, 368, 373, 375, 376, 377, 378, 381, 383, 397, 402, 403, 404, 405, 412, 413, 414, 421, 429, 446, 452, 463, 478, 479, 480, 481, 483, 484, 486], "thing": [2, 5, 480], "third": 185, "thompson": 337, "those": [2, 5, 324], 
"though": [2, 5, 479, 483, 484], "thousand": 483, "thread": [1, 2], "thread_index_in_simdgroup": 1, "thread_position_in_grid": [1, 2, 143], "threadgroup": [1, 2, 143], "threads_per_simdgroup": 1, "three": [5, 86, 400], "threefri": 476, "threshold": [397, 426, 433, 446], "through": [1, 2, 281, 399, 461, 479, 481, 484], "throw": [2, 94, 123], "thu": [5, 324], "thumb": 454, "tic": 479, "tieleman": 466, "tile": [0, 146], "time": [2, 5, 8, 217, 292, 324, 326, 327, 343, 349, 355, 356, 383, 479, 481, 483, 487], "timeit": [479, 481], "titl": 2, "tmp": [1, 143], "to_quant": 307, "to_stream": 2, "toc": 479, "togeth": [0, 1, 2, 6, 237, 311, 312, 480], "tok_embed": 5, "token": [5, 340, 380], "told": 5, "toler": [0, 16, 172], "too": [178, 479, 483], "took": 5, "tool": 8, "top": [2, 293, 352, 400], "topk": 0, "torch": [5, 484], "torch_weight": 5, "total": [218, 481], "total_norm": 309, "tpi": 479, "trace": [0, 3, 479], "trace_fil": 3, "tracer": 376, "track": [2, 324, 328], "track_running_stat": 328, "trade": 483, "tradit": [5, 145, 337, 338, 386], "train": [5, 6, 324, 328, 336, 337, 338, 361, 363, 375, 402, 403], "train_imag": [6, 454], "train_label": [6, 454], "trainabl": [6, 308, 324, 452], "trainable_paramet": [324, 362, 463], "transform": [1, 5, 7, 112, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 168, 308, 324, 328, 344, 350, 352, 362, 363, 375, 381, 386, 482], "transformerencod": 265, "transit": 470, "translat": [142, 350], "transpos": [0, 5, 31, 102, 103, 104, 164, 238, 333, 334, 335], "treat": [0, 2, 154, 155, 157, 158, 287, 400, 479], "tree": [7, 94, 136, 165, 299, 303, 310, 311, 312, 313, 314, 462, 463, 465, 474, 481], "tree_flatten": [265, 311, 314, 324, 454], "tree_map": [312, 324, 480], "tree_unflatten": [5, 454], "trembl": 5, "tri": 0, "triangl": [186, 187, 296], "triangular": [183, 184, 192], "tril": 0, "trilinear": 400, "triplet": 434, "triplet_loss": 324, "triu": 0, "true": [0, 1, 2, 4, 5, 16, 41, 42, 43, 44, 82, 94, 108, 109, 110, 111, 143, 145, 
164, 172, 178, 183, 184, 189, 194, 208, 217, 238, 273, 304, 307, 310, 311, 312, 313, 317, 324, 328, 330, 331, 332, 333, 334, 335, 343, 344, 348, 349, 350, 352, 362, 363, 365, 372, 375, 381, 383, 386, 391, 399, 400, 421, 429, 456], "truncat": [147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 251], "truth": [4, 423, 433], "try": [2, 8], "tupl": [0, 30, 65, 68, 79, 95, 99, 100, 101, 103, 104, 125, 129, 131, 136, 138, 179, 186, 189, 190, 191, 232, 237, 257, 259, 278, 299, 302, 310, 311, 312, 313, 314, 326, 327, 331, 332, 334, 335, 355, 356, 365, 367, 388, 400, 456, 458, 459, 460, 461, 478, 481], "tutori": 2, "twice": 487, "two": [0, 2, 13, 14, 16, 24, 82, 85, 87, 88, 89, 90, 118, 128, 133, 148, 151, 157, 163, 164, 166, 167, 172, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191, 192, 199, 204, 206, 222, 224, 228, 231, 285, 313, 327, 342, 349, 356, 414, 422, 479, 480, 481, 482, 487], "txt": 2, "type": [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 37, 68, 78, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 210, 216, 217, 218, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 260, 261, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 283, 284, 285, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 309, 
310, 313, 324, 370, 399, 401, 402, 403, 404, 405, 406, 407, 408, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 479, 482], "type_nam": 2, "type_to_nam": 2, "typenam": [0, 1, 2], "typic": [0, 146, 340, 454, 479, 483], "u": [1, 2, 183, 186, 187, 191, 352, 377, 474, 483], "u_": 455, "u_t": 455, "uint": [1, 2, 143], "uint16": [11, 317], "uint3": 1, "uint32": [11, 26, 27, 28, 29, 241, 317], "uint64": [11, 317], "uint8": [11, 317], "ultra": 5, "unabl": 8, "unam": 8, "unari": 479, "unchang": [145, 281, 386], "uncheck": 8, "uncompress": 265, "undefin": [0, 28, 112, 183, 184, 233, 245, 482], "under": [2, 189], "underli": [2, 301], "understand": [5, 402, 403], "unexpect": [2, 18], "unfreez": [324, 363], "unfrozen": 375, "unifi": 7, "uniform": [3, 324, 352, 365, 403, 405, 449, 476, 479, 481, 487], "uniformli": 252, "unintend": 0, "union": [18, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 84, 85, 86, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 176, 177, 178, 186, 187, 210, 263, 282], "uniqu": [2, 476], "unique_ptr": 2, "unit": [329, 339, 341, 342, 343, 351, 384, 385, 387, 389, 402, 403, 404, 405, 409, 410, 411, 412, 413, 414, 418, 437, 438, 439, 441], "unittest": 8, "univers": 189, "unless": [5, 16, 172, 189, 452], "unlik": [5, 16, 172, 337, 338, 371], "unnecessari": [2, 5], "unnorm": [241, 421, 423], "unscal": 456, "unsign": [164, 237, 238, 317], "unsignedinteg": 11, "unspecifi": [15, 17, 18, 26, 27, 28, 29, 95, 108, 109, 110, 111, 162, 203, 205, 207, 221, 229, 233, 235, 256, 273, 274, 280, 284, 287, 293, 294, 300, 305, 488], "unsqueez": 5, "unsupport": 194, "until": [2, 483, 485], "unus": 2, "up": [1, 2, 5, 112, 479], "upcast": 2, "updat": [0, 1, 2, 4, 5, 6, 38, 94, 307, 311, 313, 328, 358, 359, 365, 370, 371, 372, 377, 454, 456, 459, 461, 462, 463, 467, 468, 469, 470, 471, 472, 479, 480, 483], "update_modul": 324, 
"uplo": [186, 187], "upon": [5, 311, 312], "upper": [183, 184, 186, 187, 192, 237, 248, 251, 252, 408], "upsampl": 324, "us": [0, 3, 4, 5, 6, 7, 8, 18, 38, 83, 112, 116, 119, 121, 122, 125, 126, 127, 129, 141, 143, 145, 159, 164, 180, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 204, 211, 212, 213, 216, 218, 237, 238, 256, 257, 258, 259, 286, 310, 313, 317, 319, 324, 327, 337, 340, 341, 343, 349, 352, 356, 358, 362, 369, 376, 378, 380, 381, 383, 386, 391, 399, 400, 404, 405, 412, 413, 422, 449, 452, 454, 455, 456, 458, 459, 460, 461, 462, 463, 476, 478, 479, 480, 481, 482, 485, 487], "usag": [112, 399, 479], "user": [2, 5, 324], "usual": [340, 380, 478, 483], "util": [1, 2, 5, 7, 8, 265, 324, 454, 480], "v": [5, 105, 146, 186, 324, 363, 484], "v_": [455, 457, 458, 459, 460, 466, 467], "v_t": [455, 457, 458, 459, 460, 466, 467], "val": [0, 30, 162], "valid": [6, 105, 159, 303, 310, 363, 375, 478], "valid_parameter_filt": 358, "valu": [0, 1, 4, 5, 11, 12, 16, 18, 26, 27, 50, 78, 82, 93, 140, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 162, 172, 185, 189, 191, 193, 210, 218, 225, 232, 236, 240, 241, 242, 244, 245, 246, 248, 251, 252, 259, 263, 287, 288, 299, 303, 308, 310, 311, 312, 313, 317, 327, 329, 336, 337, 338, 339, 345, 348, 352, 356, 362, 378, 379, 395, 397, 399, 401, 421, 422, 423, 424, 425, 426, 428, 429, 430, 431, 432, 433, 446, 452, 456, 459, 468, 469, 471, 472, 481], "value_and_grad": [6, 112, 324, 376, 452, 454, 465, 479, 481, 484, 485], "value_and_grad_fn": 483, "value_cach": 5, "value_dim": 378, "value_input_dim": 378, "value_output_dim": 378, "value_proj": 5, "valueerror": [189, 365, 481], "values_hat": 5, "van": 189, "var": [0, 328, 344, 348, 350, 424], "variabl": [8, 94, 119, 132, 165, 179, 299, 302, 303, 480], "varianc": [0, 280, 300, 328, 344, 424], "variant": [5, 433, 460], "variou": 189, "vector": [0, 2, 4, 7, 171, 179, 189, 287, 302, 303, 340, 423, 485], "verbos": [1, 143], "veri": [5, 378, 480, 483, 487], 
"verifi": [4, 8], "versa": 259, "version": [2, 8, 116, 141, 199, 203, 237, 273, 303, 476, 481, 482], "versu": 479, "via": [8, 112, 462, 465, 480, 483, 484], "vice": 259, "video": 338, "view": [0, 3, 83, 484], "virtual": 2, "vjp": [2, 112, 485], "vmap": [2, 112, 481, 483, 485], "vmap_add": 481, "vocab_s": 5, "vocabulari": [340, 380], "void": [1, 2], "vt": 191, "w": [0, 1, 4, 99, 100, 103, 104, 116, 141, 164, 186, 237, 238, 299, 312, 327, 328, 331, 332, 334, 335, 337, 338, 352, 356, 454, 467, 481], "w1": [5, 309], "w2": [5, 309], "w3": 5, "w_": [327, 343, 349, 356, 383, 455, 456, 457, 458, 459, 460, 461, 466, 467], "w_1": 237, "w_g": 237, "w_i": [116, 141, 237], "w_in": 1, "w_q": 237, "w_star": 4, "w_stride": 1, "w_t": [455, 457, 458, 459, 460, 461, 466, 467], "wa": [5, 83, 125, 126, 480, 483], "wai": [2, 5, 8, 324, 400, 479, 480, 481, 482], "wait": [2, 5, 217], "walk": 5, "walkthrough": 2, "walsh": 168, "want": [1, 5, 480, 481, 487], "warm": [2, 479], "warmup": [470, 471], "warmup_init": 456, "watch": [5, 479], "wd": 461, "we": [0, 1, 2, 4, 5, 6, 116, 125, 126, 141, 164, 237, 238, 324, 340, 380, 388, 459, 461, 476, 478, 479, 480, 481, 483, 487], "weight": [0, 4, 98, 99, 100, 101, 102, 103, 104, 142, 144, 311, 324, 365, 369, 380, 381, 421, 423, 452, 456, 459, 461, 463, 467, 481, 483], "weight_decai": [456, 459, 461, 467], "weight_fil": 5, "weights_fp16": 483, "well": [5, 324, 363, 375, 378, 483], "wen": 5, "went": 5, "were": [5, 487], "wet": 5, "what": [2, 5, 311], "whatsoev": 5, "whc": 337, "when": [0, 1, 2, 5, 7, 8, 94, 101, 112, 127, 183, 184, 186, 187, 188, 189, 191, 192, 194, 330, 331, 332, 333, 334, 335, 400, 404, 405, 421, 427, 433, 452, 454, 470, 476, 479, 480, 487], "where": [0, 6, 140, 172, 184, 237, 299, 303, 326, 327, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 341, 343, 344, 348, 349, 350, 352, 355, 356, 362, 379, 382, 383, 397, 404, 405, 410, 411, 413, 424, 430, 436, 439, 441, 446, 463, 480, 481, 482], "wherea": 481, "whether": [143, 164, 
186, 187, 192, 238, 343, 349, 362, 378, 383, 421, 424, 430], "which": [0, 1, 2, 5, 6, 7, 8, 18, 37, 83, 94, 101, 118, 121, 122, 125, 126, 127, 136, 145, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 165, 173, 174, 175, 176, 177, 179, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 194, 208, 219, 237, 241, 242, 256, 257, 259, 262, 263, 264, 265, 266, 278, 279, 287, 294, 299, 302, 303, 307, 327, 337, 338, 341, 356, 358, 362, 386, 421, 423, 426, 430, 433, 449, 462, 463, 476, 479, 480, 481, 482, 483, 487, 488], "while": [2, 3, 5, 8, 257, 386, 483, 484], "whistl": 2, "who": 5, "whose": [140, 307, 308], "why": 5, "wide": 483, "width": [327, 328, 331, 332, 334, 335, 337, 338, 356, 380, 381], "window": [8, 326, 327, 355, 356], "wipe": 8, "wire": 218, "wired_limit_mb": 218, "wise": [0, 2, 12, 13, 19, 20, 21, 22, 23, 24, 25, 87, 88, 89, 92, 106, 107, 128, 129, 133, 134, 135, 137, 139, 160, 161, 166, 167, 172, 180, 181, 182, 195, 196, 197, 198, 199, 200, 201, 202, 206, 222, 224, 226, 228, 234, 254, 255, 258, 261, 269, 270, 271, 272, 276, 277, 283, 289, 290, 329, 337, 338, 347, 357, 379, 390, 409, 416, 417, 419, 420, 435, 436, 438, 441, 442, 443, 444, 479], "wish": 8, "with_logit": 421, "within": [0, 3, 28, 172], "without": [1, 5, 7, 281, 378, 448, 478, 479, 480, 483, 484, 487], "wk": 5, "wl": 2, "wo": 5, "word": 0, "work": [2, 3, 5, 217, 479, 480, 481, 482, 483], "workhors": 324, "world": [314, 480], "worri": [1, 483], "would": [2, 5, 400, 480, 482, 483, 484, 487], "wq": 5, "wrap": [112, 324], "write": [0, 1, 2, 5, 324, 484], "written": 2, "wrt": 308, "wv": 5, "x": [0, 1, 2, 4, 5, 6, 38, 90, 112, 121, 122, 126, 127, 134, 139, 142, 143, 144, 164, 168, 169, 189, 238, 242, 247, 260, 265, 269, 297, 298, 304, 311, 313, 324, 326, 327, 328, 329, 339, 341, 342, 344, 348, 350, 351, 352, 355, 356, 357, 358, 379, 382, 384, 390, 391, 397, 400, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 433, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 
447, 452, 454, 461, 479, 480, 481, 482, 483, 484, 485, 487], "x1": 422, "x2": 422, "x86_64": 8, "x_1": [422, 430], "x_2": [422, 430], "x_cast": 2, "x_grad": 1, "x_i": [420, 442, 443], "x_j": [442, 443], "x_offset": 2, "x_ptr": 2, "x_shape": 1, "x_stride": 2, "x_t": [343, 349, 383], "x_view": 484, "xcode": 8, "xcodeproj": 3, "xcrun": 8, "xf": 349, "xg": 349, "xi": 349, "xn": 343, "xo": 349, "xor": 89, "xr": 343, "xy": [0, 208], "xz": 343, "x\u00b2": 484, "y": [0, 2, 4, 5, 6, 38, 112, 168, 304, 324, 328, 337, 344, 348, 350, 352, 382, 425, 430, 433, 454, 457, 479, 480, 481, 483, 484], "y_": [425, 429], "y_cast": 2, "y_hat": 324, "y_offset": 2, "y_ptr": 2, "y_stride": 2, "ye": 5, "year": 5, "yet": [5, 189, 324, 452, 463, 481, 482, 483, 485], "yield": [5, 6, 476], "you": [2, 3, 5, 6, 7, 8, 218, 324, 391, 399, 449, 476, 479, 480, 481, 482, 484, 486, 487], "your": [2, 5, 8, 452, 481, 483], "z": [2, 343, 479, 483], "z_t": 343, "zeiler": 455, "zero": [0, 140, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 185, 208, 215, 296, 297, 298, 306, 324, 326, 327, 336, 337, 338, 365, 401, 402, 403, 404, 405, 406, 407, 408, 449, 454, 456, 482], "zero_grad": 481, "zeros_lik": 0, "zhang": 5, "zip": [5, 6], "zip_saf": 2}, "titles": ["Operations", "Custom Metal Kernels", "Custom Extensions in MLX", "Metal Debugger", "Linear Regression", "LLM inference", "Multi-Layer Perceptron", "MLX", "Build and Install", "mlx.core.Device", "mlx.core.Dtype", "mlx.core.DtypeCategory", "mlx.core.abs", "mlx.core.add", "mlx.core.addmm", "mlx.core.all", "mlx.core.allclose", "mlx.core.any", "mlx.core.arange", "mlx.core.arccos", "mlx.core.arccosh", "mlx.core.arcsin", "mlx.core.arcsinh", "mlx.core.arctan", "mlx.core.arctan2", "mlx.core.arctanh", "mlx.core.argmax", "mlx.core.argmin", "mlx.core.argpartition", "mlx.core.argsort", "mlx.core.array", "mlx.core.array.T", "mlx.core.array.abs", "mlx.core.array.all", "mlx.core.array.any", "mlx.core.array.argmax", "mlx.core.array.argmin", 
"mlx.core.array.astype", "mlx.core.array.at", "mlx.core.array.conj", "mlx.core.array.cos", "mlx.core.array.cummax", "mlx.core.array.cummin", "mlx.core.array.cumprod", "mlx.core.array.cumsum", "mlx.core.array.diag", "mlx.core.array.diagonal", "mlx.core.array.dtype", "mlx.core.array.exp", "mlx.core.array.flatten", "mlx.core.array.item", "mlx.core.array.itemsize", "mlx.core.array.log", "mlx.core.array.log10", "mlx.core.array.log1p", "mlx.core.array.log2", "mlx.core.array.logsumexp", "mlx.core.array.max", "mlx.core.array.mean", "mlx.core.array.min", "mlx.core.array.moveaxis", "mlx.core.array.nbytes", "mlx.core.array.ndim", "mlx.core.array.prod", "mlx.core.array.reciprocal", "mlx.core.array.reshape", "mlx.core.array.round", "mlx.core.array.rsqrt", "mlx.core.array.shape", "mlx.core.array.sin", "mlx.core.array.size", "mlx.core.array.split", "mlx.core.array.sqrt", "mlx.core.array.square", "mlx.core.array.squeeze", "mlx.core.array.std", "mlx.core.array.sum", "mlx.core.array.swapaxes", "mlx.core.array.tolist", "mlx.core.array.transpose", "mlx.core.array.var", "mlx.core.array.view", "mlx.core.array_equal", "mlx.core.as_strided", "mlx.core.atleast_1d", "mlx.core.atleast_2d", "mlx.core.atleast_3d", "mlx.core.bitwise_and", "mlx.core.bitwise_or", "mlx.core.bitwise_xor", "mlx.core.block_masked_mm", "mlx.core.broadcast_to", "mlx.core.ceil", "mlx.core.clip", "mlx.core.compile", "mlx.core.concatenate", "mlx.core.conj", "mlx.core.conjugate", "mlx.core.conv1d", "mlx.core.conv2d", "mlx.core.conv3d", "mlx.core.conv_general", "mlx.core.conv_transpose1d", "mlx.core.conv_transpose2d", "mlx.core.conv_transpose3d", "mlx.core.convolve", "mlx.core.cos", "mlx.core.cosh", "mlx.core.cummax", "mlx.core.cummin", "mlx.core.cumprod", "mlx.core.cumsum", "mlx.core.custom_function", "mlx.core.default_device", "mlx.core.default_stream", "mlx.core.degrees", "mlx.core.dequantize", "mlx.core.diag", "mlx.core.diagonal", "mlx.core.disable_compile", "mlx.core.distributed.Group", 
"mlx.core.distributed.all_gather", "mlx.core.distributed.all_sum", "mlx.core.distributed.init", "mlx.core.distributed.is_available", "mlx.core.distributed.recv", "mlx.core.distributed.recv_like", "mlx.core.distributed.send", "mlx.core.divide", "mlx.core.divmod", "mlx.core.einsum", "mlx.core.einsum_path", "mlx.core.enable_compile", "mlx.core.equal", "mlx.core.erf", "mlx.core.erfinv", "mlx.core.eval", "mlx.core.exp", "mlx.core.expand_dims", "mlx.core.expm1", "mlx.core.eye", "mlx.core.fast.affine_quantize", "mlx.core.fast.layer_norm", "mlx.core.fast.metal_kernel", "mlx.core.fast.rms_norm", "mlx.core.fast.rope", "mlx.core.fast.scaled_dot_product_attention", "mlx.core.fft.fft", "mlx.core.fft.fft2", "mlx.core.fft.fftn", "mlx.core.fft.ifft", "mlx.core.fft.ifft2", "mlx.core.fft.ifftn", "mlx.core.fft.irfft", "mlx.core.fft.irfft2", "mlx.core.fft.irfftn", "mlx.core.fft.rfft", "mlx.core.fft.rfft2", "mlx.core.fft.rfftn", "mlx.core.flatten", "mlx.core.floor", "mlx.core.floor_divide", "mlx.core.full", "mlx.core.gather_mm", "mlx.core.gather_qmm", "mlx.core.grad", "mlx.core.greater", "mlx.core.greater_equal", "mlx.core.hadamard_transform", "mlx.core.identity", "mlx.core.imag", "mlx.core.inner", "mlx.core.isclose", "mlx.core.isfinite", "mlx.core.isinf", "mlx.core.isnan", "mlx.core.isneginf", "mlx.core.isposinf", "mlx.core.issubdtype", "mlx.core.jvp", "mlx.core.left_shift", "mlx.core.less", "mlx.core.less_equal", "mlx.core.linalg.cholesky", "mlx.core.linalg.cholesky_inv", "mlx.core.linalg.cross", "mlx.core.linalg.eigh", "mlx.core.linalg.eigvalsh", "mlx.core.linalg.inv", "mlx.core.linalg.norm", "mlx.core.linalg.qr", "mlx.core.linalg.svd", "mlx.core.linalg.tri_inv", "mlx.core.linspace", "mlx.core.load", "mlx.core.log", "mlx.core.log10", "mlx.core.log1p", "mlx.core.log2", "mlx.core.logaddexp", "mlx.core.logical_and", "mlx.core.logical_not", "mlx.core.logical_or", "mlx.core.logsumexp", "mlx.core.matmul", "mlx.core.max", "mlx.core.maximum", "mlx.core.mean", "mlx.core.meshgrid", 
"mlx.core.metal.clear_cache", "mlx.core.metal.device_info", "mlx.core.metal.get_active_memory", "mlx.core.metal.get_cache_memory", "mlx.core.metal.get_peak_memory", "mlx.core.metal.is_available", "mlx.core.metal.reset_peak_memory", "mlx.core.metal.set_cache_limit", "mlx.core.metal.set_memory_limit", "mlx.core.metal.set_wired_limit", "mlx.core.metal.start_capture", "mlx.core.metal.stop_capture", "mlx.core.min", "mlx.core.minimum", "mlx.core.moveaxis", "mlx.core.multiply", "mlx.core.nan_to_num", "mlx.core.negative", "mlx.core.new_stream", "mlx.core.not_equal", "mlx.core.ones", "mlx.core.ones_like", "mlx.core.outer", "mlx.core.pad", "mlx.core.partition", "mlx.core.power", "mlx.core.prod", "mlx.core.put_along_axis", "mlx.core.quantize", "mlx.core.quantized_matmul", "mlx.core.radians", "mlx.core.random.bernoulli", "mlx.core.random.categorical", "mlx.core.random.gumbel", "mlx.core.random.key", "mlx.core.random.laplace", "mlx.core.random.multivariate_normal", "mlx.core.random.normal", "mlx.core.random.permutation", "mlx.core.random.randint", "mlx.core.random.seed", "mlx.core.random.split", "mlx.core.random.truncated_normal", "mlx.core.random.uniform", "mlx.core.real", "mlx.core.reciprocal", "mlx.core.remainder", "mlx.core.repeat", "mlx.core.reshape", "mlx.core.right_shift", "mlx.core.roll", "mlx.core.round", "mlx.core.rsqrt", "mlx.core.save", "mlx.core.save_gguf", "mlx.core.save_safetensors", "mlx.core.savez", "mlx.core.savez_compressed", "mlx.core.set_default_device", "mlx.core.set_default_stream", "mlx.core.sigmoid", "mlx.core.sign", "mlx.core.sin", "mlx.core.sinh", "mlx.core.softmax", "mlx.core.sort", "mlx.core.split", "mlx.core.sqrt", "mlx.core.square", "mlx.core.squeeze", "mlx.core.stack", "mlx.core.std", "mlx.core.stop_gradient", "mlx.core.stream", "mlx.core.subtract", "mlx.core.sum", "mlx.core.swapaxes", "mlx.core.synchronize", "mlx.core.take", "mlx.core.take_along_axis", "mlx.core.tan", "mlx.core.tanh", "mlx.core.tensordot", "mlx.core.tile", "mlx.core.topk", 
"mlx.core.trace", "mlx.core.transpose", "mlx.core.tri", "mlx.core.tril", "mlx.core.triu", "mlx.core.value_and_grad", "mlx.core.var", "mlx.core.view", "mlx.core.vjp", "mlx.core.vmap", "mlx.core.where", "mlx.core.zeros", "mlx.core.zeros_like", "mlx.nn.quantize", "mlx.nn.value_and_grad", "mlx.optimizers.clip_grad_norm", "mlx.utils.tree_flatten", "mlx.utils.tree_map", "mlx.utils.tree_map_with_path", "mlx.utils.tree_reduce", "mlx.utils.tree_unflatten", "mlx.core.Stream", "Array", "Data Types", "Devices and Streams", "Distributed Communication", "Fast", "FFT", "Linear Algebra", "Metal", "Neural Networks", "mlx.nn.ALiBi", "mlx.nn.AvgPool1d", "mlx.nn.AvgPool2d", "mlx.nn.BatchNorm", "mlx.nn.CELU", "mlx.nn.Conv1d", "mlx.nn.Conv2d", "mlx.nn.Conv3d", "mlx.nn.ConvTranspose1d", "mlx.nn.ConvTranspose2d", "mlx.nn.ConvTranspose3d", "mlx.nn.Dropout", "mlx.nn.Dropout2d", "mlx.nn.Dropout3d", "mlx.nn.ELU", "mlx.nn.Embedding", "mlx.nn.GELU", "mlx.nn.GLU", "mlx.nn.GRU", "mlx.nn.GroupNorm", "mlx.nn.HardShrink", "mlx.nn.HardTanh", "mlx.nn.Hardswish", "mlx.nn.InstanceNorm", "mlx.nn.LSTM", "mlx.nn.LayerNorm", "mlx.nn.LeakyReLU", "mlx.nn.Linear", "mlx.nn.LogSigmoid", "mlx.nn.LogSoftmax", "mlx.nn.MaxPool1d", "mlx.nn.MaxPool2d", "mlx.nn.Mish", "mlx.nn.Module.apply", "mlx.nn.Module.apply_to_modules", "mlx.nn.Module.children", "mlx.nn.Module.eval", "mlx.nn.Module.filter_and_map", "mlx.nn.Module.freeze", "mlx.nn.Module.leaf_modules", "mlx.nn.Module.load_weights", "mlx.nn.Module.modules", "mlx.nn.Module.named_modules", "mlx.nn.Module.parameters", "mlx.nn.Module.save_weights", "mlx.nn.Module.set_dtype", "mlx.nn.Module.state", "mlx.nn.Module.train", "mlx.nn.Module.trainable_parameters", "mlx.nn.Module.training", "mlx.nn.Module.unfreeze", "mlx.nn.Module.update", "mlx.nn.Module.update_modules", "mlx.nn.MultiHeadAttention", "mlx.nn.PReLU", "mlx.nn.QuantizedEmbedding", "mlx.nn.QuantizedLinear", "mlx.nn.RMSNorm", "mlx.nn.RNN", "mlx.nn.ReLU", "mlx.nn.ReLU6", "mlx.nn.RoPE", "mlx.nn.SELU", 
"mlx.nn.Sequential", "mlx.nn.SiLU", "mlx.nn.Sigmoid", "mlx.nn.SinusoidalPositionalEncoding", "mlx.nn.Softmax", "mlx.nn.Softmin", "mlx.nn.Softplus", "mlx.nn.Softshrink", "mlx.nn.Softsign", "mlx.nn.Step", "mlx.nn.Tanh", "mlx.nn.Transformer", "mlx.nn.Upsample", "mlx.nn.init.constant", "mlx.nn.init.glorot_normal", "mlx.nn.init.glorot_uniform", "mlx.nn.init.he_normal", "mlx.nn.init.he_uniform", "mlx.nn.init.identity", "mlx.nn.init.normal", "mlx.nn.init.uniform", "mlx.nn.celu", "mlx.nn.elu", "mlx.nn.gelu", "mlx.nn.gelu_approx", "mlx.nn.gelu_fast_approx", "mlx.nn.glu", "mlx.nn.hard_shrink", "mlx.nn.hard_tanh", "mlx.nn.hardswish", "mlx.nn.leaky_relu", "mlx.nn.log_sigmoid", "mlx.nn.log_softmax", "mlx.nn.losses.binary_cross_entropy", "mlx.nn.losses.cosine_similarity_loss", "mlx.nn.losses.cross_entropy", "mlx.nn.losses.gaussian_nll_loss", "mlx.nn.losses.hinge_loss", "mlx.nn.losses.huber_loss", "mlx.nn.losses.kl_div_loss", "mlx.nn.losses.l1_loss", "mlx.nn.losses.log_cosh_loss", "mlx.nn.losses.margin_ranking_loss", "mlx.nn.losses.mse_loss", "mlx.nn.losses.nll_loss", "mlx.nn.losses.smooth_l1_loss", "mlx.nn.losses.triplet_loss", "mlx.nn.mish", "mlx.nn.prelu", "mlx.nn.relu", "mlx.nn.relu6", "mlx.nn.selu", "mlx.nn.sigmoid", "mlx.nn.silu", "mlx.nn.softmax", "mlx.nn.softmin", "mlx.nn.softplus", "mlx.nn.softshrink", "mlx.nn.step", "mlx.nn.tanh", "Functions", "Initializers", "Layers", "Loss Functions", "Module", "Operations", "Optimizers", "mlx.optimizers.AdaDelta", "mlx.optimizers.Adafactor", "mlx.optimizers.Adagrad", "mlx.optimizers.Adam", "mlx.optimizers.AdamW", "mlx.optimizers.Adamax", "mlx.optimizers.Lion", "mlx.optimizers.Optimizer.apply_gradients", "mlx.optimizers.Optimizer.init", "mlx.optimizers.Optimizer.state", "mlx.optimizers.Optimizer.update", "mlx.optimizers.RMSprop", "mlx.optimizers.SGD", "mlx.optimizers.cosine_decay", "mlx.optimizers.exponential_decay", "mlx.optimizers.join_schedules", "mlx.optimizers.linear_schedule", "mlx.optimizers.step_decay", "Common Optimizers", 
"Optimizer", "Schedulers", "Random", "Transforms", "Tree Utils", "Compilation", "Distributed Communication", "Function Transforms", "Indexing Arrays", "Lazy Evaluation", "Conversion to NumPy and Other Frameworks", "Quick Start Guide", "Saving and Loading Arrays", "Unified Memory", "Using Streams"], "titleterms": {"A": 487, "In": 482, "The": 324, "ab": [12, 32], "adadelta": 455, "adafactor": 456, "adagrad": 457, "adam": 458, "adamax": 460, "adamw": 459, "add": 13, "addmm": 14, "affine_quant": 141, "algebra": 322, "alibi": 325, "all": [5, 15, 33, 480], "all_gath": 121, "all_sum": 122, "allclos": 16, "ani": [17, 34], "api": [7, 8], "appli": 358, "apply_gradi": 462, "apply_to_modul": 359, "arang": 18, "arcco": 19, "arccosh": 20, "arcsin": 21, "arcsinh": 22, "arctan": 23, "arctan2": 24, "arctanh": 25, "argmax": [26, 35], "argmin": [27, 36], "argpartit": 28, "argsort": 29, "arrai": [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 316, 482, 486], "array_equ": 82, "as_strid": 83, "astyp": 37, "atleast_1d": 84, "atleast_2d": 85, "atleast_3d": 86, "attent": 5, "automat": 481, "avgpool1d": 326, "avgpool2d": 327, "back": 2, "basic": [479, 485], "batchnorm": 328, "benchmark": 5, "bernoulli": 240, "binari": 8, "binary_cross_entropi": 421, "bind": 2, "bitwise_and": 87, "bitwise_or": 88, "bitwise_xor": 89, "block_masked_mm": 90, "broadcast_to": 91, "build": [2, 8], "c": [7, 8], "categor": 241, "ceil": 92, "celu": [329, 409], "children": 360, "choleski": 183, "cholesky_inv": 184, "class": 324, "clear_cach": 209, "clip": 93, "clip_grad_norm": 309, "cmake": 2, "co": [40, 106], "code": [2, 5], "common": 473, "commun": [319, 480], "compil": [94, 479], "complex": 1, "comput": 483, "concaten": 95, "conj": [39, 96], "conjug": 97, "constant": 401, "conv1d": [98, 330], "conv2d": [99, 331], "conv3d": [100, 332], "conv_gener": 101, 
"conv_transpose1d": 102, "conv_transpose2d": 103, "conv_transpose3d": 104, "convers": 484, "convert": 5, "convolv": 105, "convtranspose1d": 333, "convtranspose2d": 334, "convtranspose3d": 335, "core": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 315], "cosh": 107, "cosine_decai": 468, "cosine_similarity_loss": 422, "cpu": 2, "cross": 185, "cross_entropi": 423, "cummax": [41, 108], "cummin": [42, 109], "cumprod": [43, 110], "cumsum": [44, 111], "custom": [1, 2], "custom_funct": 112, "data": 317, "debug": 479, "debugg": 3, "default_devic": 113, "default_stream": 114, "degre": 115, "dequant": 116, "devic": [9, 318], "device_info": 210, 
"diag": [45, 117], "diagon": [46, 118], "differ": 482, "differenti": 481, "disable_compil": 119, "distribut": [120, 121, 122, 123, 124, 125, 126, 127, 319, 480], "divid": 128, "divmod": 129, "download": [2, 5], "dropout": 336, "dropout2d": 337, "dropout3d": 338, "dtype": [10, 47], "dtypecategori": 11, "eigh": 186, "eigvalsh": 187, "einsum": 130, "einsum_path": 131, "elu": [339, 410], "embed": 340, "enable_compil": 132, "encod": 5, "end": 2, "equal": 133, "erf": 134, "erfinv": 135, "eval": [136, 361], "evalu": 483, "exampl": [1, 2, 7, 479, 480, 487], "exp": [48, 137], "expand_dim": 138, "expm1": 139, "exponential_decai": 469, "extens": 2, "ey": 140, "fast": [141, 142, 143, 144, 145, 146, 320], "fft": [147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 321], "fft2": 148, "fftn": 149, "filter_and_map": 362, "flatten": [49, 159], "floor": 160, "floor_divid": 161, "format": 486, "found": 8, "framework": 484, "freez": 363, "from": [8, 482], "full": [5, 162], "function": [448, 451, 479, 481, 485], "further": 7, "gather_mm": 163, "gather_qmm": 164, "gaussian_nll_loss": 424, "gelu": [341, 411], "gelu_approx": 412, "gelu_fast_approx": 413, "gener": 5, "get": 480, "get_active_memori": 211, "get_cache_memori": 212, "get_peak_memori": 213, "glorot_norm": 402, "glorot_uniform": 403, "glu": [342, 414], "gpu": 2, "grad": [165, 324], "graph": [479, 483, 485], "greater": 166, "greater_equ": 167, "grid": 1, "group": 120, "groupnorm": 344, "gru": 343, "guid": 485, "gumbel": 242, "hadamard_transform": 168, "hard_shrink": 415, "hard_tanh": 416, "hardshrink": 345, "hardswish": [347, 417], "hardtanh": 346, "he_norm": 404, "he_uniform": 405, "hinge_loss": 425, "host": 480, "huber_loss": 426, "ident": [169, 406], "ifft": 150, "ifft2": 151, "ifftn": 152, "imag": 170, "implement": [2, 5], "index": 482, "infer": 5, "init": [123, 401, 402, 403, 404, 405, 406, 407, 408, 463], "initi": 449, "inner": 171, "inspect": 324, "instal": [7, 8, 480], "instancenorm": 348, "introduc": 2, "inv": 
188, "irfft": 153, "irfft2": 154, "irfftn": 155, "is_avail": [124, 214], "isclos": 172, "isfinit": 173, "isinf": 174, "isnan": 175, "isneginf": 176, "isposinf": 177, "issubdtyp": 178, "item": 50, "items": 51, "jax": 484, "join_schedul": 470, "jvp": 179, "kei": 243, "kernel": 1, "kl_div_loss": 427, "l1_loss": 428, "laplac": 244, "layer": [5, 6, 450], "layer_norm": 142, "layernorm": 350, "lazi": 483, "leaf_modul": 364, "leaky_relu": 418, "leakyrelu": 351, "left_shift": 180, "less": 181, "less_equ": 182, "linalg": [183, 184, 185, 186, 187, 188, 189, 190, 191, 192], "linear": [4, 322, 352], "linear_schedul": 471, "linspac": 193, "lion": 461, "llm": 5, "load": [5, 194, 454, 486], "load_weight": 365, "log": [52, 195], "log10": [53, 196], "log1p": [54, 197], "log2": [55, 198], "log_cosh_loss": 429, "log_sigmoid": 419, "log_softmax": 420, "logaddexp": 199, "logical_and": 200, "logical_not": 201, "logical_or": 202, "logsigmoid": 353, "logsoftmax": 354, "logsumexp": [56, 203], "loss": [421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 451], "lstm": 349, "margin_ranking_loss": 430, "matmul": 204, "max": [57, 205], "maximum": 206, "maxpool1d": 355, "maxpool2d": 356, "mean": [58, 207], "memori": 487, "meshgrid": 208, "metal": [1, 3, 8, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 323], "metal_kernel": 143, "min": [59, 221], "minim": 8, "minimum": 222, "mish": [357, 435], "mlx": [2, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 
141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472], "model": 5, "modul": [324, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 452], "moveaxi": [60, 223], "mpi": 480, "mse_loss": 431, "multi": 6, "multiheadattent": 378, "multipli": 224, "multivariate_norm": 245, "named_modul": 367, "nan_to_num": 225, "nbyte": 61, "ndim": 62, "neg": 226, "network": 324, "neural": 324, "new_stream": 227, "nll_loss": 432, "nn": [307, 
308, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447], "norm": 189, "normal": [246, 407], "not_equ": 228, "numpi": [482, 484], "ones": 229, "ones_lik": 230, "onli": 483, "oper": [0, 2, 453], "optim": [309, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474], "option": 8, "other": 484, "outer": 231, "pad": 232, "paramet": [324, 368], "partit": 233, "perceptron": 6, "permut": 247, "place": 482, "power": 234, "prelu": [379, 436], "primit": 2, "prod": [63, 235], "pure": 479, "put": 5, "put_along_axi": 236, "python": [2, 7, 8], "pytorch": 484, "qr": 190, "quantiz": [237, 307], "quantized_matmul": 238, "quantizedembed": 380, "quantizedlinear": 381, "quick": [324, 485], "radian": 239, "randint": 248, "random": [240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 476], "read": 7, "real": 253, "reciproc": [64, 254], "recv": 125, "recv_lik": 126, "reduc": 480, "refer": 7, "regress": 4, "relu": [384, 437], "relu6": [385, 438], "remaind": 255, "remot": 480, "repeat": 256, "requir": 8, "reset_peak_memori": 215, "reshap": [65, 257], "result": 2, "rfft": 156, "rfft2": 157, "rfftn": 158, "right_shift": 258, "rms_norm": 144, "rmsnorm": 382, "rmsprop": 466, "rnn": 383, "roll": 259, "rope": [145, 386], "round": [66, 260], "rsqrt": [67, 261], "sampl": 1, "save": [262, 454, 486], "save_gguf": 263, "save_safetensor": 264, "save_weight": 369, "savez": 265, 
"savez_compress": 266, "scaled_dot_product_attent": 146, "schedul": 475, "script": [2, 5], "seed": 249, "selu": [387, 439], "send": 127, "sequenti": 388, "serial": 486, "set": 480, "set_cache_limit": 216, "set_default_devic": 267, "set_default_stream": 268, "set_dtyp": 370, "set_memory_limit": 217, "set_wired_limit": 218, "setuptool": 2, "sgd": 467, "shape": [1, 68], "shell": 8, "sigmoid": [269, 390, 440], "sign": 270, "silu": [389, 441], "simpl": [1, 487], "sin": [69, 271], "sinh": 272, "sinusoidalpositionalencod": 391, "size": [8, 70], "smooth_l1_loss": 433, "softmax": [273, 392, 442], "softmin": [393, 443], "softplu": [394, 444], "softshrink": [395, 445], "softsign": 396, "sort": 274, "sourc": 8, "specifi": 488, "speedup": 479, "split": [71, 250, 275], "sqrt": [72, 276], "squar": [73, 277], "squeez": [74, 278], "stack": 279, "start": [324, 480, 485], "start_captur": 219, "state": [371, 464], "std": [75, 280], "step": [397, 446], "step_decai": 472, "stop_captur": 220, "stop_gradi": 281, "stream": [282, 315, 318, 488], "stride": 1, "subtract": 283, "sum": [76, 284], "support": 317, "svd": 191, "swapax": [77, 285], "synchron": 286, "t": 31, "take": 287, "take_along_axi": 288, "tan": 289, "tanh": [290, 398, 447], "tensordot": 291, "tensorflow": 484, "tile": 292, "togeth": 5, "tolist": 78, "topk": 293, "trace": 294, "train": [372, 374, 479, 480], "trainable_paramet": 373, "transform": [2, 399, 477, 479, 481, 483, 485], "transpos": [79, 295], "tree": 478, "tree_flatten": 310, "tree_map": 311, "tree_map_with_path": 312, "tree_reduc": 313, "tree_unflatten": 314, "tri": 296, "tri_inv": 192, "tril": 297, "triplet_loss": 434, "triu": 298, "troubleshoot": 8, "truncated_norm": 251, "tune": 480, "type": 317, "unfreez": 375, "unifi": 487, "uniform": [252, 408], "up": 480, "updat": [324, 376, 465, 482], "update_modul": 377, "upsampl": 400, "us": [1, 2, 483, 488], "usag": [2, 7], "util": [310, 311, 312, 313, 314, 478], "valu": 324, "value_and_grad": [299, 308], "var": [80, 300], 
"vector": 481, "view": [81, 301], "vjp": [1, 302], "vmap": 303, "weight": 5, "what": 483, "when": 483, "where": 304, "why": 483, "workflow": 3, "x86": 8, "xcode": 3, "you": 483, "zero": 305, "zeros_lik": 306}})
\ No newline at end of file
+Search.setIndex({"alltitles": {"A Simple Example": [[488, "a-simple-example"]], "Array": [[315, null]], "Attention layer": [[5, "attention-layer"]], "Automatic Differentiation": [[482, "automatic-differentiation"]], "Automatic Vectorization": [[482, "automatic-vectorization"]], "Basics": [[486, "basics"]], "Basics of Compile": [[480, "basics-of-compile"]], "Binary Size Minimization": [[8, "binary-size-minimization"]], "Binding to Python": [[2, "binding-to-python"]], "Build Options": [[8, "id3"]], "Build Requirements": [[8, "build-requirements"]], "Build and Install": [[8, null]], "Build from source": [[8, "build-from-source"]], "Building and Binding": [[2, "building-and-binding"]], "Building with CMake": [[2, "building-with-cmake"]], "Building with setuptools": [[2, "building-with-setuptools"]], "C++ API": [[8, "c-api"]], "C++ API Reference": [[7, null]], "Common Optimizers": [[474, null]], "Compilation": [[480, null]], "Compiling Training Graphs": [[480, "compiling-training-graphs"]], "Complex Example": [[1, "complex-example"]], "Conversion to NumPy and Other Frameworks": [[485, null]], "Converting the weights": [[5, "converting-the-weights"]], "Custom Extensions in MLX": [[2, null]], "Custom Metal Kernels": [[1, null]], "Data Types": [[316, null]], "Debugging": [[480, "debugging"]], "Devices and Streams": [[317, null]], "Differences from NumPy": [[483, "differences-from-numpy"]], "Distributed Communication": [[318, null], [481, null]], "Download the code": [[2, null], [5, null]], "Encoder layer": [[5, "encoder-layer"]], "Example Speedup": [[480, "example-speedup"]], "Examples": [[7, null]], "FFT": [[320, null]], "Fast": [[319, null]], "Full model": [[5, "full-model"]], "Function Transforms": [[482, null]], "Function and Graph Transformations": [[486, "function-and-graph-transformations"]], "Functions": [[449, null]], "Further Reading": [[7, null]], "Generation": [[5, "generation"]], "Getting Started": [[481, "getting-started"]], "Grid Sample VJP": [[1, 
"grid-sample-vjp"]], "Implementing the CPU Back-end": [[2, "implementing-the-cpu-back-end"]], "Implementing the GPU Back-end": [[2, "implementing-the-gpu-back-end"]], "Implementing the Primitive": [[2, "implementing-the-primitive"]], "Implementing the model": [[5, "implementing-the-model"]], "In Place Updates": [[483, "in-place-updates"]], "Indexing Arrays": [[483, null]], "Initializers": [[450, null]], "Inspecting Modules": [[323, "inspecting-modules"]], "Install": [[7, null]], "Installing MPI": [[481, "installing-mpi"]], "Introducing the Example": [[2, "introducing-the-example"]], "JAX": [[485, "jax"]], "LLM inference": [[5, null]], "Layers": [[451, null]], "Lazy Evaluation": [[484, null]], "Linear Algebra": [[321, null]], "Linear Regression": [[4, null]], "Loss Functions": [[452, null]], "MLX": [[7, null]], "Metal": [[322, null]], "Metal Debugger": [[3, null]], "Metal not found": [[8, "metal-not-found"]], "Module": [[453, null]], "Multi-Layer Perceptron": [[6, null]], "Neural Networks": [[323, null]], "Only Compute What You Use": [[484, "only-compute-what-you-use"]], "Operations": [[0, null], [2, "operations"], [454, null]], "Operations and Primitives": [[2, "operations-and-primitives"]], "Optimizer": [[475, null]], "Optimizers": [[455, null]], "Parameters": [[323, "parameters"]], "Primitive Transforms": [[2, "primitive-transforms"]], "Primitives": [[2, "primitives"]], "Pure Functions": [[480, "pure-functions"]], "Putting it all together": [[5, "putting-it-all-together"]], "PyTorch": [[485, "pytorch"]], "Python API": [[8, "python-api"]], "Python API Reference": [[7, null]], "Python Installation": [[8, "python-installation"]], "Quick Start Guide": [[486, null]], "Quick Start with Neural Networks": [[323, "quick-start-with-neural-networks"]], "Random": [[477, null]], "Results": [[2, "results"]], "Saving and Loading": [[455, "saving-and-loading"]], "Saving and Loading Arrays": [[487, null]], "Schedulers": [[476, null]], "Scripts": [[2, "scripts"], [5, "scripts"]], 
"Serialization Formats": [[487, "id1"]], "Setting up Remote Hosts": [[481, "setting-up-remote-hosts"]], "Simple Example": [[1, "simple-example"]], "Specifying the Stream": [[489, "specifying-the-stream"]], "Supported Data Types": [[316, "id2"]], "TensorFlow": [[485, "tensorflow"]], "The Module Class": [[323, "the-module-class"]], "Training Example": [[481, "training-example"]], "Transformations with Compile": [[480, "transformations-with-compile"]], "Transforming Compute Graphs": [[484, "transforming-compute-graphs"]], "Transforms": [[478, null]], "Tree Utils": [[479, null]], "Troubleshooting": [[8, "troubleshooting"], [8, "id2"]], "Tuning All Reduce": [[481, "tuning-all-reduce"]], "Unified Memory": [[488, null]], "Updating the Parameters": [[323, "updating-the-parameters"]], "Usage": [[2, "usage"], [7, null]], "Using Shape/Strides": [[1, "using-shape-strides"]], "Using Streams": [[489, null]], "Using the Primitive": [[2, "using-the-primitive"]], "Value and Grad": [[323, "value-and-grad"]], "Weight loading and benchmarking": [[5, "weight-loading-and-benchmarking"]], "When to Evaluate": [[484, "when-to-evaluate"]], "Why Lazy Evaluation": [[484, "why-lazy-evaluation"]], "Xcode Workflow": [[3, "xcode-workflow"]], "mlx.core.Device": [[9, null]], "mlx.core.Dtype": [[10, null]], "mlx.core.DtypeCategory": [[11, null]], "mlx.core.Stream": [[314, null]], "mlx.core.abs": [[12, null]], "mlx.core.add": [[13, null]], "mlx.core.addmm": [[14, null]], "mlx.core.all": [[15, null]], "mlx.core.allclose": [[16, null]], "mlx.core.any": [[17, null]], "mlx.core.arange": [[18, null]], "mlx.core.arccos": [[19, null]], "mlx.core.arccosh": [[20, null]], "mlx.core.arcsin": [[21, null]], "mlx.core.arcsinh": [[22, null]], "mlx.core.arctan": [[23, null]], "mlx.core.arctan2": [[24, null]], "mlx.core.arctanh": [[25, null]], "mlx.core.argmax": [[26, null]], "mlx.core.argmin": [[27, null]], "mlx.core.argpartition": [[28, null]], "mlx.core.argsort": [[29, null]], "mlx.core.array": [[30, null]], 
"mlx.core.array.T": [[31, null]], "mlx.core.array.abs": [[32, null]], "mlx.core.array.all": [[33, null]], "mlx.core.array.any": [[34, null]], "mlx.core.array.argmax": [[35, null]], "mlx.core.array.argmin": [[36, null]], "mlx.core.array.astype": [[37, null]], "mlx.core.array.at": [[38, null]], "mlx.core.array.conj": [[39, null]], "mlx.core.array.cos": [[40, null]], "mlx.core.array.cummax": [[41, null]], "mlx.core.array.cummin": [[42, null]], "mlx.core.array.cumprod": [[43, null]], "mlx.core.array.cumsum": [[44, null]], "mlx.core.array.diag": [[45, null]], "mlx.core.array.diagonal": [[46, null]], "mlx.core.array.dtype": [[47, null]], "mlx.core.array.exp": [[48, null]], "mlx.core.array.flatten": [[49, null]], "mlx.core.array.item": [[50, null]], "mlx.core.array.itemsize": [[51, null]], "mlx.core.array.log": [[52, null]], "mlx.core.array.log10": [[53, null]], "mlx.core.array.log1p": [[54, null]], "mlx.core.array.log2": [[55, null]], "mlx.core.array.logsumexp": [[56, null]], "mlx.core.array.max": [[57, null]], "mlx.core.array.mean": [[58, null]], "mlx.core.array.min": [[59, null]], "mlx.core.array.moveaxis": [[60, null]], "mlx.core.array.nbytes": [[61, null]], "mlx.core.array.ndim": [[62, null]], "mlx.core.array.prod": [[63, null]], "mlx.core.array.reciprocal": [[64, null]], "mlx.core.array.reshape": [[65, null]], "mlx.core.array.round": [[66, null]], "mlx.core.array.rsqrt": [[67, null]], "mlx.core.array.shape": [[68, null]], "mlx.core.array.sin": [[69, null]], "mlx.core.array.size": [[70, null]], "mlx.core.array.split": [[71, null]], "mlx.core.array.sqrt": [[72, null]], "mlx.core.array.square": [[73, null]], "mlx.core.array.squeeze": [[74, null]], "mlx.core.array.std": [[75, null]], "mlx.core.array.sum": [[76, null]], "mlx.core.array.swapaxes": [[77, null]], "mlx.core.array.tolist": [[78, null]], "mlx.core.array.transpose": [[79, null]], "mlx.core.array.var": [[80, null]], "mlx.core.array.view": [[81, null]], "mlx.core.array_equal": [[82, null]], "mlx.core.as_strided": 
[[83, null]], "mlx.core.atleast_1d": [[84, null]], "mlx.core.atleast_2d": [[85, null]], "mlx.core.atleast_3d": [[86, null]], "mlx.core.bitwise_and": [[87, null]], "mlx.core.bitwise_or": [[88, null]], "mlx.core.bitwise_xor": [[89, null]], "mlx.core.block_masked_mm": [[90, null]], "mlx.core.broadcast_to": [[91, null]], "mlx.core.ceil": [[92, null]], "mlx.core.clip": [[93, null]], "mlx.core.compile": [[94, null]], "mlx.core.concatenate": [[95, null]], "mlx.core.conj": [[96, null]], "mlx.core.conjugate": [[97, null]], "mlx.core.conv1d": [[98, null]], "mlx.core.conv2d": [[99, null]], "mlx.core.conv3d": [[100, null]], "mlx.core.conv_general": [[101, null]], "mlx.core.conv_transpose1d": [[102, null]], "mlx.core.conv_transpose2d": [[103, null]], "mlx.core.conv_transpose3d": [[104, null]], "mlx.core.convolve": [[105, null]], "mlx.core.cos": [[106, null]], "mlx.core.cosh": [[107, null]], "mlx.core.cummax": [[108, null]], "mlx.core.cummin": [[109, null]], "mlx.core.cumprod": [[110, null]], "mlx.core.cumsum": [[111, null]], "mlx.core.custom_function": [[112, null]], "mlx.core.default_device": [[113, null]], "mlx.core.default_stream": [[114, null]], "mlx.core.degrees": [[115, null]], "mlx.core.dequantize": [[116, null]], "mlx.core.diag": [[117, null]], "mlx.core.diagonal": [[118, null]], "mlx.core.disable_compile": [[119, null]], "mlx.core.distributed.Group": [[120, null]], "mlx.core.distributed.all_gather": [[121, null]], "mlx.core.distributed.all_sum": [[122, null]], "mlx.core.distributed.init": [[123, null]], "mlx.core.distributed.is_available": [[124, null]], "mlx.core.distributed.recv": [[125, null]], "mlx.core.distributed.recv_like": [[126, null]], "mlx.core.distributed.send": [[127, null]], "mlx.core.divide": [[128, null]], "mlx.core.divmod": [[129, null]], "mlx.core.einsum": [[130, null]], "mlx.core.einsum_path": [[131, null]], "mlx.core.enable_compile": [[132, null]], "mlx.core.equal": [[133, null]], "mlx.core.erf": [[134, null]], "mlx.core.erfinv": [[135, null]], 
"mlx.core.eval": [[136, null]], "mlx.core.exp": [[137, null]], "mlx.core.expand_dims": [[138, null]], "mlx.core.expm1": [[139, null]], "mlx.core.eye": [[140, null]], "mlx.core.fast.layer_norm": [[141, null]], "mlx.core.fast.metal_kernel": [[142, null]], "mlx.core.fast.rms_norm": [[143, null]], "mlx.core.fast.rope": [[144, null]], "mlx.core.fast.scaled_dot_product_attention": [[145, null]], "mlx.core.fft.fft": [[146, null]], "mlx.core.fft.fft2": [[147, null]], "mlx.core.fft.fftn": [[148, null]], "mlx.core.fft.ifft": [[149, null]], "mlx.core.fft.ifft2": [[150, null]], "mlx.core.fft.ifftn": [[151, null]], "mlx.core.fft.irfft": [[152, null]], "mlx.core.fft.irfft2": [[153, null]], "mlx.core.fft.irfftn": [[154, null]], "mlx.core.fft.rfft": [[155, null]], "mlx.core.fft.rfft2": [[156, null]], "mlx.core.fft.rfftn": [[157, null]], "mlx.core.flatten": [[158, null]], "mlx.core.floor": [[159, null]], "mlx.core.floor_divide": [[160, null]], "mlx.core.full": [[161, null]], "mlx.core.gather_mm": [[162, null]], "mlx.core.gather_qmm": [[163, null]], "mlx.core.grad": [[164, null]], "mlx.core.greater": [[165, null]], "mlx.core.greater_equal": [[166, null]], "mlx.core.hadamard_transform": [[167, null]], "mlx.core.identity": [[168, null]], "mlx.core.imag": [[169, null]], "mlx.core.inner": [[170, null]], "mlx.core.isclose": [[171, null]], "mlx.core.isfinite": [[172, null]], "mlx.core.isinf": [[173, null]], "mlx.core.isnan": [[174, null]], "mlx.core.isneginf": [[175, null]], "mlx.core.isposinf": [[176, null]], "mlx.core.issubdtype": [[177, null]], "mlx.core.jvp": [[178, null]], "mlx.core.left_shift": [[179, null]], "mlx.core.less": [[180, null]], "mlx.core.less_equal": [[181, null]], "mlx.core.linalg.cholesky": [[182, null]], "mlx.core.linalg.cholesky_inv": [[183, null]], "mlx.core.linalg.cross": [[184, null]], "mlx.core.linalg.eigh": [[185, null]], "mlx.core.linalg.eigvalsh": [[186, null]], "mlx.core.linalg.inv": [[187, null]], "mlx.core.linalg.norm": [[188, null]], "mlx.core.linalg.qr": 
[[189, null]], "mlx.core.linalg.svd": [[190, null]], "mlx.core.linalg.tri_inv": [[191, null]], "mlx.core.linspace": [[192, null]], "mlx.core.load": [[193, null]], "mlx.core.log": [[194, null]], "mlx.core.log10": [[195, null]], "mlx.core.log1p": [[196, null]], "mlx.core.log2": [[197, null]], "mlx.core.logaddexp": [[198, null]], "mlx.core.logical_and": [[199, null]], "mlx.core.logical_not": [[200, null]], "mlx.core.logical_or": [[201, null]], "mlx.core.logsumexp": [[202, null]], "mlx.core.matmul": [[203, null]], "mlx.core.max": [[204, null]], "mlx.core.maximum": [[205, null]], "mlx.core.mean": [[206, null]], "mlx.core.meshgrid": [[207, null]], "mlx.core.metal.clear_cache": [[208, null]], "mlx.core.metal.device_info": [[209, null]], "mlx.core.metal.get_active_memory": [[210, null]], "mlx.core.metal.get_cache_memory": [[211, null]], "mlx.core.metal.get_peak_memory": [[212, null]], "mlx.core.metal.is_available": [[213, null]], "mlx.core.metal.reset_peak_memory": [[214, null]], "mlx.core.metal.set_cache_limit": [[215, null]], "mlx.core.metal.set_memory_limit": [[216, null]], "mlx.core.metal.set_wired_limit": [[217, null]], "mlx.core.metal.start_capture": [[218, null]], "mlx.core.metal.stop_capture": [[219, null]], "mlx.core.min": [[220, null]], "mlx.core.minimum": [[221, null]], "mlx.core.moveaxis": [[222, null]], "mlx.core.multiply": [[223, null]], "mlx.core.nan_to_num": [[224, null]], "mlx.core.negative": [[225, null]], "mlx.core.new_stream": [[226, null]], "mlx.core.not_equal": [[227, null]], "mlx.core.ones": [[228, null]], "mlx.core.ones_like": [[229, null]], "mlx.core.outer": [[230, null]], "mlx.core.pad": [[231, null]], "mlx.core.partition": [[232, null]], "mlx.core.power": [[233, null]], "mlx.core.prod": [[234, null]], "mlx.core.put_along_axis": [[235, null]], "mlx.core.quantize": [[236, null]], "mlx.core.quantized_matmul": [[237, null]], "mlx.core.radians": [[238, null]], "mlx.core.random.bernoulli": [[239, null]], "mlx.core.random.categorical": [[240, null]], 
"mlx.core.random.gumbel": [[241, null]], "mlx.core.random.key": [[242, null]], "mlx.core.random.laplace": [[243, null]], "mlx.core.random.multivariate_normal": [[244, null]], "mlx.core.random.normal": [[245, null]], "mlx.core.random.permutation": [[246, null]], "mlx.core.random.randint": [[247, null]], "mlx.core.random.seed": [[248, null]], "mlx.core.random.split": [[249, null]], "mlx.core.random.truncated_normal": [[250, null]], "mlx.core.random.uniform": [[251, null]], "mlx.core.real": [[252, null]], "mlx.core.reciprocal": [[253, null]], "mlx.core.remainder": [[254, null]], "mlx.core.repeat": [[255, null]], "mlx.core.reshape": [[256, null]], "mlx.core.right_shift": [[257, null]], "mlx.core.roll": [[258, null]], "mlx.core.round": [[259, null]], "mlx.core.rsqrt": [[260, null]], "mlx.core.save": [[261, null]], "mlx.core.save_gguf": [[262, null]], "mlx.core.save_safetensors": [[263, null]], "mlx.core.savez": [[264, null]], "mlx.core.savez_compressed": [[265, null]], "mlx.core.set_default_device": [[266, null]], "mlx.core.set_default_stream": [[267, null]], "mlx.core.sigmoid": [[268, null]], "mlx.core.sign": [[269, null]], "mlx.core.sin": [[270, null]], "mlx.core.sinh": [[271, null]], "mlx.core.softmax": [[272, null]], "mlx.core.sort": [[273, null]], "mlx.core.split": [[274, null]], "mlx.core.sqrt": [[275, null]], "mlx.core.square": [[276, null]], "mlx.core.squeeze": [[277, null]], "mlx.core.stack": [[278, null]], "mlx.core.std": [[279, null]], "mlx.core.stop_gradient": [[280, null]], "mlx.core.stream": [[281, null]], "mlx.core.subtract": [[282, null]], "mlx.core.sum": [[283, null]], "mlx.core.swapaxes": [[284, null]], "mlx.core.synchronize": [[285, null]], "mlx.core.take": [[286, null]], "mlx.core.take_along_axis": [[287, null]], "mlx.core.tan": [[288, null]], "mlx.core.tanh": [[289, null]], "mlx.core.tensordot": [[290, null]], "mlx.core.tile": [[291, null]], "mlx.core.topk": [[292, null]], "mlx.core.trace": [[293, null]], "mlx.core.transpose": [[294, null]], 
"mlx.core.tri": [[295, null]], "mlx.core.tril": [[296, null]], "mlx.core.triu": [[297, null]], "mlx.core.value_and_grad": [[298, null]], "mlx.core.var": [[299, null]], "mlx.core.view": [[300, null]], "mlx.core.vjp": [[301, null]], "mlx.core.vmap": [[302, null]], "mlx.core.where": [[303, null]], "mlx.core.zeros": [[304, null]], "mlx.core.zeros_like": [[305, null]], "mlx.nn.ALiBi": [[324, null]], "mlx.nn.AvgPool1d": [[325, null]], "mlx.nn.AvgPool2d": [[326, null]], "mlx.nn.AvgPool3d": [[327, null]], "mlx.nn.BatchNorm": [[328, null]], "mlx.nn.CELU": [[329, null]], "mlx.nn.Conv1d": [[330, null]], "mlx.nn.Conv2d": [[331, null]], "mlx.nn.Conv3d": [[332, null]], "mlx.nn.ConvTranspose1d": [[333, null]], "mlx.nn.ConvTranspose2d": [[334, null]], "mlx.nn.ConvTranspose3d": [[335, null]], "mlx.nn.Dropout": [[336, null]], "mlx.nn.Dropout2d": [[337, null]], "mlx.nn.Dropout3d": [[338, null]], "mlx.nn.ELU": [[339, null]], "mlx.nn.Embedding": [[340, null]], "mlx.nn.GELU": [[341, null]], "mlx.nn.GLU": [[342, null]], "mlx.nn.GRU": [[343, null]], "mlx.nn.GroupNorm": [[344, null]], "mlx.nn.HardShrink": [[345, null]], "mlx.nn.HardTanh": [[346, null]], "mlx.nn.Hardswish": [[347, null]], "mlx.nn.InstanceNorm": [[348, null]], "mlx.nn.LSTM": [[349, null]], "mlx.nn.LayerNorm": [[350, null]], "mlx.nn.LeakyReLU": [[351, null]], "mlx.nn.Linear": [[352, null]], "mlx.nn.LogSigmoid": [[353, null]], "mlx.nn.LogSoftmax": [[354, null]], "mlx.nn.MaxPool1d": [[355, null]], "mlx.nn.MaxPool2d": [[356, null]], "mlx.nn.MaxPool3d": [[357, null]], "mlx.nn.Mish": [[358, null]], "mlx.nn.Module.apply": [[359, null]], "mlx.nn.Module.apply_to_modules": [[360, null]], "mlx.nn.Module.children": [[361, null]], "mlx.nn.Module.eval": [[362, null]], "mlx.nn.Module.filter_and_map": [[363, null]], "mlx.nn.Module.freeze": [[364, null]], "mlx.nn.Module.leaf_modules": [[365, null]], "mlx.nn.Module.load_weights": [[366, null]], "mlx.nn.Module.modules": [[367, null]], "mlx.nn.Module.named_modules": [[368, null]], 
"mlx.nn.Module.parameters": [[369, null]], "mlx.nn.Module.save_weights": [[370, null]], "mlx.nn.Module.set_dtype": [[371, null]], "mlx.nn.Module.state": [[372, null]], "mlx.nn.Module.train": [[373, null]], "mlx.nn.Module.trainable_parameters": [[374, null]], "mlx.nn.Module.training": [[375, null]], "mlx.nn.Module.unfreeze": [[376, null]], "mlx.nn.Module.update": [[377, null]], "mlx.nn.Module.update_modules": [[378, null]], "mlx.nn.MultiHeadAttention": [[379, null]], "mlx.nn.PReLU": [[380, null]], "mlx.nn.QuantizedEmbedding": [[381, null]], "mlx.nn.QuantizedLinear": [[382, null]], "mlx.nn.RMSNorm": [[383, null]], "mlx.nn.RNN": [[384, null]], "mlx.nn.ReLU": [[385, null]], "mlx.nn.ReLU6": [[386, null]], "mlx.nn.RoPE": [[387, null]], "mlx.nn.SELU": [[388, null]], "mlx.nn.Sequential": [[389, null]], "mlx.nn.SiLU": [[390, null]], "mlx.nn.Sigmoid": [[391, null]], "mlx.nn.SinusoidalPositionalEncoding": [[392, null]], "mlx.nn.Softmax": [[393, null]], "mlx.nn.Softmin": [[394, null]], "mlx.nn.Softplus": [[395, null]], "mlx.nn.Softshrink": [[396, null]], "mlx.nn.Softsign": [[397, null]], "mlx.nn.Step": [[398, null]], "mlx.nn.Tanh": [[399, null]], "mlx.nn.Transformer": [[400, null]], "mlx.nn.Upsample": [[401, null]], "mlx.nn.celu": [[410, null]], "mlx.nn.elu": [[411, null]], "mlx.nn.gelu": [[412, null]], "mlx.nn.gelu_approx": [[413, null]], "mlx.nn.gelu_fast_approx": [[414, null]], "mlx.nn.glu": [[415, null]], "mlx.nn.hard_shrink": [[416, null]], "mlx.nn.hard_tanh": [[417, null]], "mlx.nn.hardswish": [[418, null]], "mlx.nn.init.constant": [[402, null]], "mlx.nn.init.glorot_normal": [[403, null]], "mlx.nn.init.glorot_uniform": [[404, null]], "mlx.nn.init.he_normal": [[405, null]], "mlx.nn.init.he_uniform": [[406, null]], "mlx.nn.init.identity": [[407, null]], "mlx.nn.init.normal": [[408, null]], "mlx.nn.init.uniform": [[409, null]], "mlx.nn.leaky_relu": [[419, null]], "mlx.nn.log_sigmoid": [[420, null]], "mlx.nn.log_softmax": [[421, null]], "mlx.nn.losses.binary_cross_entropy": 
[[422, null]], "mlx.nn.losses.cosine_similarity_loss": [[423, null]], "mlx.nn.losses.cross_entropy": [[424, null]], "mlx.nn.losses.gaussian_nll_loss": [[425, null]], "mlx.nn.losses.hinge_loss": [[426, null]], "mlx.nn.losses.huber_loss": [[427, null]], "mlx.nn.losses.kl_div_loss": [[428, null]], "mlx.nn.losses.l1_loss": [[429, null]], "mlx.nn.losses.log_cosh_loss": [[430, null]], "mlx.nn.losses.margin_ranking_loss": [[431, null]], "mlx.nn.losses.mse_loss": [[432, null]], "mlx.nn.losses.nll_loss": [[433, null]], "mlx.nn.losses.smooth_l1_loss": [[434, null]], "mlx.nn.losses.triplet_loss": [[435, null]], "mlx.nn.mish": [[436, null]], "mlx.nn.prelu": [[437, null]], "mlx.nn.quantize": [[306, null]], "mlx.nn.relu": [[438, null]], "mlx.nn.relu6": [[439, null]], "mlx.nn.selu": [[440, null]], "mlx.nn.sigmoid": [[441, null]], "mlx.nn.silu": [[442, null]], "mlx.nn.softmax": [[443, null]], "mlx.nn.softmin": [[444, null]], "mlx.nn.softplus": [[445, null]], "mlx.nn.softshrink": [[446, null]], "mlx.nn.step": [[447, null]], "mlx.nn.tanh": [[448, null]], "mlx.nn.value_and_grad": [[307, null]], "mlx.optimizers.AdaDelta": [[456, null]], "mlx.optimizers.Adafactor": [[457, null]], "mlx.optimizers.Adagrad": [[458, null]], "mlx.optimizers.Adam": [[459, null]], "mlx.optimizers.AdamW": [[460, null]], "mlx.optimizers.Adamax": [[461, null]], "mlx.optimizers.Lion": [[462, null]], "mlx.optimizers.Optimizer.apply_gradients": [[463, null]], "mlx.optimizers.Optimizer.init": [[464, null]], "mlx.optimizers.Optimizer.state": [[465, null]], "mlx.optimizers.Optimizer.update": [[466, null]], "mlx.optimizers.RMSprop": [[467, null]], "mlx.optimizers.SGD": [[468, null]], "mlx.optimizers.clip_grad_norm": [[308, null]], "mlx.optimizers.cosine_decay": [[469, null]], "mlx.optimizers.exponential_decay": [[470, null]], "mlx.optimizers.join_schedules": [[471, null]], "mlx.optimizers.linear_schedule": [[472, null]], "mlx.optimizers.step_decay": [[473, null]], "mlx.utils.tree_flatten": [[309, null]], 
"mlx.utils.tree_map": [[310, null]], "mlx.utils.tree_map_with_path": [[311, null]], "mlx.utils.tree_reduce": [[312, null]], "mlx.utils.tree_unflatten": [[313, null]], "x86 Shell": [[8, "x86-shell"]]}, "docnames": ["cpp/ops", "dev/custom_metal_kernels", "dev/extensions", "dev/metal_debugger", "examples/linear_regression", "examples/llama-inference", "examples/mlp", "index", "install", "python/_autosummary/mlx.core.Device", "python/_autosummary/mlx.core.Dtype", "python/_autosummary/mlx.core.DtypeCategory", "python/_autosummary/mlx.core.abs", "python/_autosummary/mlx.core.add", "python/_autosummary/mlx.core.addmm", "python/_autosummary/mlx.core.all", "python/_autosummary/mlx.core.allclose", "python/_autosummary/mlx.core.any", "python/_autosummary/mlx.core.arange", "python/_autosummary/mlx.core.arccos", "python/_autosummary/mlx.core.arccosh", "python/_autosummary/mlx.core.arcsin", "python/_autosummary/mlx.core.arcsinh", "python/_autosummary/mlx.core.arctan", "python/_autosummary/mlx.core.arctan2", "python/_autosummary/mlx.core.arctanh", "python/_autosummary/mlx.core.argmax", "python/_autosummary/mlx.core.argmin", "python/_autosummary/mlx.core.argpartition", "python/_autosummary/mlx.core.argsort", "python/_autosummary/mlx.core.array", "python/_autosummary/mlx.core.array.T", "python/_autosummary/mlx.core.array.abs", "python/_autosummary/mlx.core.array.all", "python/_autosummary/mlx.core.array.any", "python/_autosummary/mlx.core.array.argmax", "python/_autosummary/mlx.core.array.argmin", "python/_autosummary/mlx.core.array.astype", "python/_autosummary/mlx.core.array.at", "python/_autosummary/mlx.core.array.conj", "python/_autosummary/mlx.core.array.cos", "python/_autosummary/mlx.core.array.cummax", "python/_autosummary/mlx.core.array.cummin", "python/_autosummary/mlx.core.array.cumprod", "python/_autosummary/mlx.core.array.cumsum", "python/_autosummary/mlx.core.array.diag", "python/_autosummary/mlx.core.array.diagonal", "python/_autosummary/mlx.core.array.dtype", 
"python/_autosummary/mlx.core.array.exp", "python/_autosummary/mlx.core.array.flatten", "python/_autosummary/mlx.core.array.item", "python/_autosummary/mlx.core.array.itemsize", "python/_autosummary/mlx.core.array.log", "python/_autosummary/mlx.core.array.log10", "python/_autosummary/mlx.core.array.log1p", "python/_autosummary/mlx.core.array.log2", "python/_autosummary/mlx.core.array.logsumexp", "python/_autosummary/mlx.core.array.max", "python/_autosummary/mlx.core.array.mean", "python/_autosummary/mlx.core.array.min", "python/_autosummary/mlx.core.array.moveaxis", "python/_autosummary/mlx.core.array.nbytes", "python/_autosummary/mlx.core.array.ndim", "python/_autosummary/mlx.core.array.prod", "python/_autosummary/mlx.core.array.reciprocal", "python/_autosummary/mlx.core.array.reshape", "python/_autosummary/mlx.core.array.round", "python/_autosummary/mlx.core.array.rsqrt", "python/_autosummary/mlx.core.array.shape", "python/_autosummary/mlx.core.array.sin", "python/_autosummary/mlx.core.array.size", "python/_autosummary/mlx.core.array.split", "python/_autosummary/mlx.core.array.sqrt", "python/_autosummary/mlx.core.array.square", "python/_autosummary/mlx.core.array.squeeze", "python/_autosummary/mlx.core.array.std", "python/_autosummary/mlx.core.array.sum", "python/_autosummary/mlx.core.array.swapaxes", "python/_autosummary/mlx.core.array.tolist", "python/_autosummary/mlx.core.array.transpose", "python/_autosummary/mlx.core.array.var", "python/_autosummary/mlx.core.array.view", "python/_autosummary/mlx.core.array_equal", "python/_autosummary/mlx.core.as_strided", "python/_autosummary/mlx.core.atleast_1d", "python/_autosummary/mlx.core.atleast_2d", "python/_autosummary/mlx.core.atleast_3d", "python/_autosummary/mlx.core.bitwise_and", "python/_autosummary/mlx.core.bitwise_or", "python/_autosummary/mlx.core.bitwise_xor", "python/_autosummary/mlx.core.block_masked_mm", "python/_autosummary/mlx.core.broadcast_to", "python/_autosummary/mlx.core.ceil", 
"python/_autosummary/mlx.core.clip", "python/_autosummary/mlx.core.compile", "python/_autosummary/mlx.core.concatenate", "python/_autosummary/mlx.core.conj", "python/_autosummary/mlx.core.conjugate", "python/_autosummary/mlx.core.conv1d", "python/_autosummary/mlx.core.conv2d", "python/_autosummary/mlx.core.conv3d", "python/_autosummary/mlx.core.conv_general", "python/_autosummary/mlx.core.conv_transpose1d", "python/_autosummary/mlx.core.conv_transpose2d", "python/_autosummary/mlx.core.conv_transpose3d", "python/_autosummary/mlx.core.convolve", "python/_autosummary/mlx.core.cos", "python/_autosummary/mlx.core.cosh", "python/_autosummary/mlx.core.cummax", "python/_autosummary/mlx.core.cummin", "python/_autosummary/mlx.core.cumprod", "python/_autosummary/mlx.core.cumsum", "python/_autosummary/mlx.core.custom_function", "python/_autosummary/mlx.core.default_device", "python/_autosummary/mlx.core.default_stream", "python/_autosummary/mlx.core.degrees", "python/_autosummary/mlx.core.dequantize", "python/_autosummary/mlx.core.diag", "python/_autosummary/mlx.core.diagonal", "python/_autosummary/mlx.core.disable_compile", "python/_autosummary/mlx.core.distributed.Group", "python/_autosummary/mlx.core.distributed.all_gather", "python/_autosummary/mlx.core.distributed.all_sum", "python/_autosummary/mlx.core.distributed.init", "python/_autosummary/mlx.core.distributed.is_available", "python/_autosummary/mlx.core.distributed.recv", "python/_autosummary/mlx.core.distributed.recv_like", "python/_autosummary/mlx.core.distributed.send", "python/_autosummary/mlx.core.divide", "python/_autosummary/mlx.core.divmod", "python/_autosummary/mlx.core.einsum", "python/_autosummary/mlx.core.einsum_path", "python/_autosummary/mlx.core.enable_compile", "python/_autosummary/mlx.core.equal", "python/_autosummary/mlx.core.erf", "python/_autosummary/mlx.core.erfinv", "python/_autosummary/mlx.core.eval", "python/_autosummary/mlx.core.exp", "python/_autosummary/mlx.core.expand_dims", 
"python/_autosummary/mlx.core.expm1", "python/_autosummary/mlx.core.eye", "python/_autosummary/mlx.core.fast.layer_norm", "python/_autosummary/mlx.core.fast.metal_kernel", "python/_autosummary/mlx.core.fast.rms_norm", "python/_autosummary/mlx.core.fast.rope", "python/_autosummary/mlx.core.fast.scaled_dot_product_attention", "python/_autosummary/mlx.core.fft.fft", "python/_autosummary/mlx.core.fft.fft2", "python/_autosummary/mlx.core.fft.fftn", "python/_autosummary/mlx.core.fft.ifft", "python/_autosummary/mlx.core.fft.ifft2", "python/_autosummary/mlx.core.fft.ifftn", "python/_autosummary/mlx.core.fft.irfft", "python/_autosummary/mlx.core.fft.irfft2", "python/_autosummary/mlx.core.fft.irfftn", "python/_autosummary/mlx.core.fft.rfft", "python/_autosummary/mlx.core.fft.rfft2", "python/_autosummary/mlx.core.fft.rfftn", "python/_autosummary/mlx.core.flatten", "python/_autosummary/mlx.core.floor", "python/_autosummary/mlx.core.floor_divide", "python/_autosummary/mlx.core.full", "python/_autosummary/mlx.core.gather_mm", "python/_autosummary/mlx.core.gather_qmm", "python/_autosummary/mlx.core.grad", "python/_autosummary/mlx.core.greater", "python/_autosummary/mlx.core.greater_equal", "python/_autosummary/mlx.core.hadamard_transform", "python/_autosummary/mlx.core.identity", "python/_autosummary/mlx.core.imag", "python/_autosummary/mlx.core.inner", "python/_autosummary/mlx.core.isclose", "python/_autosummary/mlx.core.isfinite", "python/_autosummary/mlx.core.isinf", "python/_autosummary/mlx.core.isnan", "python/_autosummary/mlx.core.isneginf", "python/_autosummary/mlx.core.isposinf", "python/_autosummary/mlx.core.issubdtype", "python/_autosummary/mlx.core.jvp", "python/_autosummary/mlx.core.left_shift", "python/_autosummary/mlx.core.less", "python/_autosummary/mlx.core.less_equal", "python/_autosummary/mlx.core.linalg.cholesky", "python/_autosummary/mlx.core.linalg.cholesky_inv", "python/_autosummary/mlx.core.linalg.cross", "python/_autosummary/mlx.core.linalg.eigh", 
"python/_autosummary/mlx.core.linalg.eigvalsh", "python/_autosummary/mlx.core.linalg.inv", "python/_autosummary/mlx.core.linalg.norm", "python/_autosummary/mlx.core.linalg.qr", "python/_autosummary/mlx.core.linalg.svd", "python/_autosummary/mlx.core.linalg.tri_inv", "python/_autosummary/mlx.core.linspace", "python/_autosummary/mlx.core.load", "python/_autosummary/mlx.core.log", "python/_autosummary/mlx.core.log10", "python/_autosummary/mlx.core.log1p", "python/_autosummary/mlx.core.log2", "python/_autosummary/mlx.core.logaddexp", "python/_autosummary/mlx.core.logical_and", "python/_autosummary/mlx.core.logical_not", "python/_autosummary/mlx.core.logical_or", "python/_autosummary/mlx.core.logsumexp", "python/_autosummary/mlx.core.matmul", "python/_autosummary/mlx.core.max", "python/_autosummary/mlx.core.maximum", "python/_autosummary/mlx.core.mean", "python/_autosummary/mlx.core.meshgrid", "python/_autosummary/mlx.core.metal.clear_cache", "python/_autosummary/mlx.core.metal.device_info", "python/_autosummary/mlx.core.metal.get_active_memory", "python/_autosummary/mlx.core.metal.get_cache_memory", "python/_autosummary/mlx.core.metal.get_peak_memory", "python/_autosummary/mlx.core.metal.is_available", "python/_autosummary/mlx.core.metal.reset_peak_memory", "python/_autosummary/mlx.core.metal.set_cache_limit", "python/_autosummary/mlx.core.metal.set_memory_limit", "python/_autosummary/mlx.core.metal.set_wired_limit", "python/_autosummary/mlx.core.metal.start_capture", "python/_autosummary/mlx.core.metal.stop_capture", "python/_autosummary/mlx.core.min", "python/_autosummary/mlx.core.minimum", "python/_autosummary/mlx.core.moveaxis", "python/_autosummary/mlx.core.multiply", "python/_autosummary/mlx.core.nan_to_num", "python/_autosummary/mlx.core.negative", "python/_autosummary/mlx.core.new_stream", "python/_autosummary/mlx.core.not_equal", "python/_autosummary/mlx.core.ones", "python/_autosummary/mlx.core.ones_like", "python/_autosummary/mlx.core.outer", 
"python/_autosummary/mlx.core.pad", "python/_autosummary/mlx.core.partition", "python/_autosummary/mlx.core.power", "python/_autosummary/mlx.core.prod", "python/_autosummary/mlx.core.put_along_axis", "python/_autosummary/mlx.core.quantize", "python/_autosummary/mlx.core.quantized_matmul", "python/_autosummary/mlx.core.radians", "python/_autosummary/mlx.core.random.bernoulli", "python/_autosummary/mlx.core.random.categorical", "python/_autosummary/mlx.core.random.gumbel", "python/_autosummary/mlx.core.random.key", "python/_autosummary/mlx.core.random.laplace", "python/_autosummary/mlx.core.random.multivariate_normal", "python/_autosummary/mlx.core.random.normal", "python/_autosummary/mlx.core.random.permutation", "python/_autosummary/mlx.core.random.randint", "python/_autosummary/mlx.core.random.seed", "python/_autosummary/mlx.core.random.split", "python/_autosummary/mlx.core.random.truncated_normal", "python/_autosummary/mlx.core.random.uniform", "python/_autosummary/mlx.core.real", "python/_autosummary/mlx.core.reciprocal", "python/_autosummary/mlx.core.remainder", "python/_autosummary/mlx.core.repeat", "python/_autosummary/mlx.core.reshape", "python/_autosummary/mlx.core.right_shift", "python/_autosummary/mlx.core.roll", "python/_autosummary/mlx.core.round", "python/_autosummary/mlx.core.rsqrt", "python/_autosummary/mlx.core.save", "python/_autosummary/mlx.core.save_gguf", "python/_autosummary/mlx.core.save_safetensors", "python/_autosummary/mlx.core.savez", "python/_autosummary/mlx.core.savez_compressed", "python/_autosummary/mlx.core.set_default_device", "python/_autosummary/mlx.core.set_default_stream", "python/_autosummary/mlx.core.sigmoid", "python/_autosummary/mlx.core.sign", "python/_autosummary/mlx.core.sin", "python/_autosummary/mlx.core.sinh", "python/_autosummary/mlx.core.softmax", "python/_autosummary/mlx.core.sort", "python/_autosummary/mlx.core.split", "python/_autosummary/mlx.core.sqrt", "python/_autosummary/mlx.core.square", 
"python/_autosummary/mlx.core.squeeze", "python/_autosummary/mlx.core.stack", "python/_autosummary/mlx.core.std", "python/_autosummary/mlx.core.stop_gradient", "python/_autosummary/mlx.core.stream", "python/_autosummary/mlx.core.subtract", "python/_autosummary/mlx.core.sum", "python/_autosummary/mlx.core.swapaxes", "python/_autosummary/mlx.core.synchronize", "python/_autosummary/mlx.core.take", "python/_autosummary/mlx.core.take_along_axis", "python/_autosummary/mlx.core.tan", "python/_autosummary/mlx.core.tanh", "python/_autosummary/mlx.core.tensordot", "python/_autosummary/mlx.core.tile", "python/_autosummary/mlx.core.topk", "python/_autosummary/mlx.core.trace", "python/_autosummary/mlx.core.transpose", "python/_autosummary/mlx.core.tri", "python/_autosummary/mlx.core.tril", "python/_autosummary/mlx.core.triu", "python/_autosummary/mlx.core.value_and_grad", "python/_autosummary/mlx.core.var", "python/_autosummary/mlx.core.view", "python/_autosummary/mlx.core.vjp", "python/_autosummary/mlx.core.vmap", "python/_autosummary/mlx.core.where", "python/_autosummary/mlx.core.zeros", "python/_autosummary/mlx.core.zeros_like", "python/_autosummary/mlx.nn.quantize", "python/_autosummary/mlx.nn.value_and_grad", "python/_autosummary/mlx.optimizers.clip_grad_norm", "python/_autosummary/mlx.utils.tree_flatten", "python/_autosummary/mlx.utils.tree_map", "python/_autosummary/mlx.utils.tree_map_with_path", "python/_autosummary/mlx.utils.tree_reduce", "python/_autosummary/mlx.utils.tree_unflatten", "python/_autosummary/stream_class", "python/array", "python/data_types", "python/devices_and_streams", "python/distributed", "python/fast", "python/fft", "python/linalg", "python/metal", "python/nn", "python/nn/_autosummary/mlx.nn.ALiBi", "python/nn/_autosummary/mlx.nn.AvgPool1d", "python/nn/_autosummary/mlx.nn.AvgPool2d", "python/nn/_autosummary/mlx.nn.AvgPool3d", "python/nn/_autosummary/mlx.nn.BatchNorm", "python/nn/_autosummary/mlx.nn.CELU", "python/nn/_autosummary/mlx.nn.Conv1d", 
"python/nn/_autosummary/mlx.nn.Conv2d", "python/nn/_autosummary/mlx.nn.Conv3d", "python/nn/_autosummary/mlx.nn.ConvTranspose1d", "python/nn/_autosummary/mlx.nn.ConvTranspose2d", "python/nn/_autosummary/mlx.nn.ConvTranspose3d", "python/nn/_autosummary/mlx.nn.Dropout", "python/nn/_autosummary/mlx.nn.Dropout2d", "python/nn/_autosummary/mlx.nn.Dropout3d", "python/nn/_autosummary/mlx.nn.ELU", "python/nn/_autosummary/mlx.nn.Embedding", "python/nn/_autosummary/mlx.nn.GELU", "python/nn/_autosummary/mlx.nn.GLU", "python/nn/_autosummary/mlx.nn.GRU", "python/nn/_autosummary/mlx.nn.GroupNorm", "python/nn/_autosummary/mlx.nn.HardShrink", "python/nn/_autosummary/mlx.nn.HardTanh", "python/nn/_autosummary/mlx.nn.Hardswish", "python/nn/_autosummary/mlx.nn.InstanceNorm", "python/nn/_autosummary/mlx.nn.LSTM", "python/nn/_autosummary/mlx.nn.LayerNorm", "python/nn/_autosummary/mlx.nn.LeakyReLU", "python/nn/_autosummary/mlx.nn.Linear", "python/nn/_autosummary/mlx.nn.LogSigmoid", "python/nn/_autosummary/mlx.nn.LogSoftmax", "python/nn/_autosummary/mlx.nn.MaxPool1d", "python/nn/_autosummary/mlx.nn.MaxPool2d", "python/nn/_autosummary/mlx.nn.MaxPool3d", "python/nn/_autosummary/mlx.nn.Mish", "python/nn/_autosummary/mlx.nn.Module.apply", "python/nn/_autosummary/mlx.nn.Module.apply_to_modules", "python/nn/_autosummary/mlx.nn.Module.children", "python/nn/_autosummary/mlx.nn.Module.eval", "python/nn/_autosummary/mlx.nn.Module.filter_and_map", "python/nn/_autosummary/mlx.nn.Module.freeze", "python/nn/_autosummary/mlx.nn.Module.leaf_modules", "python/nn/_autosummary/mlx.nn.Module.load_weights", "python/nn/_autosummary/mlx.nn.Module.modules", "python/nn/_autosummary/mlx.nn.Module.named_modules", "python/nn/_autosummary/mlx.nn.Module.parameters", "python/nn/_autosummary/mlx.nn.Module.save_weights", "python/nn/_autosummary/mlx.nn.Module.set_dtype", "python/nn/_autosummary/mlx.nn.Module.state", "python/nn/_autosummary/mlx.nn.Module.train", "python/nn/_autosummary/mlx.nn.Module.trainable_parameters", 
"python/nn/_autosummary/mlx.nn.Module.training", "python/nn/_autosummary/mlx.nn.Module.unfreeze", "python/nn/_autosummary/mlx.nn.Module.update", "python/nn/_autosummary/mlx.nn.Module.update_modules", "python/nn/_autosummary/mlx.nn.MultiHeadAttention", "python/nn/_autosummary/mlx.nn.PReLU", "python/nn/_autosummary/mlx.nn.QuantizedEmbedding", "python/nn/_autosummary/mlx.nn.QuantizedLinear", "python/nn/_autosummary/mlx.nn.RMSNorm", "python/nn/_autosummary/mlx.nn.RNN", "python/nn/_autosummary/mlx.nn.ReLU", "python/nn/_autosummary/mlx.nn.ReLU6", "python/nn/_autosummary/mlx.nn.RoPE", "python/nn/_autosummary/mlx.nn.SELU", "python/nn/_autosummary/mlx.nn.Sequential", "python/nn/_autosummary/mlx.nn.SiLU", "python/nn/_autosummary/mlx.nn.Sigmoid", "python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding", "python/nn/_autosummary/mlx.nn.Softmax", "python/nn/_autosummary/mlx.nn.Softmin", "python/nn/_autosummary/mlx.nn.Softplus", "python/nn/_autosummary/mlx.nn.Softshrink", "python/nn/_autosummary/mlx.nn.Softsign", "python/nn/_autosummary/mlx.nn.Step", "python/nn/_autosummary/mlx.nn.Tanh", "python/nn/_autosummary/mlx.nn.Transformer", "python/nn/_autosummary/mlx.nn.Upsample", "python/nn/_autosummary/mlx.nn.init.constant", "python/nn/_autosummary/mlx.nn.init.glorot_normal", "python/nn/_autosummary/mlx.nn.init.glorot_uniform", "python/nn/_autosummary/mlx.nn.init.he_normal", "python/nn/_autosummary/mlx.nn.init.he_uniform", "python/nn/_autosummary/mlx.nn.init.identity", "python/nn/_autosummary/mlx.nn.init.normal", "python/nn/_autosummary/mlx.nn.init.uniform", "python/nn/_autosummary_functions/mlx.nn.celu", "python/nn/_autosummary_functions/mlx.nn.elu", "python/nn/_autosummary_functions/mlx.nn.gelu", "python/nn/_autosummary_functions/mlx.nn.gelu_approx", "python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx", "python/nn/_autosummary_functions/mlx.nn.glu", "python/nn/_autosummary_functions/mlx.nn.hard_shrink", "python/nn/_autosummary_functions/mlx.nn.hard_tanh", 
"python/nn/_autosummary_functions/mlx.nn.hardswish", "python/nn/_autosummary_functions/mlx.nn.leaky_relu", "python/nn/_autosummary_functions/mlx.nn.log_sigmoid", "python/nn/_autosummary_functions/mlx.nn.log_softmax", "python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy", "python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss", "python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy", "python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss", "python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss", "python/nn/_autosummary_functions/mlx.nn.losses.huber_loss", "python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss", "python/nn/_autosummary_functions/mlx.nn.losses.l1_loss", "python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss", "python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss", "python/nn/_autosummary_functions/mlx.nn.losses.mse_loss", "python/nn/_autosummary_functions/mlx.nn.losses.nll_loss", "python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss", "python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss", "python/nn/_autosummary_functions/mlx.nn.mish", "python/nn/_autosummary_functions/mlx.nn.prelu", "python/nn/_autosummary_functions/mlx.nn.relu", "python/nn/_autosummary_functions/mlx.nn.relu6", "python/nn/_autosummary_functions/mlx.nn.selu", "python/nn/_autosummary_functions/mlx.nn.sigmoid", "python/nn/_autosummary_functions/mlx.nn.silu", "python/nn/_autosummary_functions/mlx.nn.softmax", "python/nn/_autosummary_functions/mlx.nn.softmin", "python/nn/_autosummary_functions/mlx.nn.softplus", "python/nn/_autosummary_functions/mlx.nn.softshrink", "python/nn/_autosummary_functions/mlx.nn.step", "python/nn/_autosummary_functions/mlx.nn.tanh", "python/nn/functions", "python/nn/init", "python/nn/layers", "python/nn/losses", "python/nn/module", "python/ops", "python/optimizers", "python/optimizers/_autosummary/mlx.optimizers.AdaDelta", 
"python/optimizers/_autosummary/mlx.optimizers.Adafactor", "python/optimizers/_autosummary/mlx.optimizers.Adagrad", "python/optimizers/_autosummary/mlx.optimizers.Adam", "python/optimizers/_autosummary/mlx.optimizers.AdamW", "python/optimizers/_autosummary/mlx.optimizers.Adamax", "python/optimizers/_autosummary/mlx.optimizers.Lion", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.init", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.state", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.update", "python/optimizers/_autosummary/mlx.optimizers.RMSprop", "python/optimizers/_autosummary/mlx.optimizers.SGD", "python/optimizers/_autosummary/mlx.optimizers.cosine_decay", "python/optimizers/_autosummary/mlx.optimizers.exponential_decay", "python/optimizers/_autosummary/mlx.optimizers.join_schedules", "python/optimizers/_autosummary/mlx.optimizers.linear_schedule", "python/optimizers/_autosummary/mlx.optimizers.step_decay", "python/optimizers/common_optimizers", "python/optimizers/optimizer", "python/optimizers/schedulers", "python/random", "python/transforms", "python/tree_utils", "usage/compile", "usage/distributed", "usage/function_transforms", "usage/indexing", "usage/lazy_evaluation", "usage/numpy", "usage/quick_start", "usage/saving_and_loading", "usage/unified_memory", "usage/using_streams"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1}, "filenames": ["cpp/ops.rst", "dev/custom_metal_kernels.rst", "dev/extensions.rst", "dev/metal_debugger.rst", "examples/linear_regression.rst", "examples/llama-inference.rst", "examples/mlp.rst", "index.rst", "install.rst", 
"python/_autosummary/mlx.core.Device.rst", "python/_autosummary/mlx.core.Dtype.rst", "python/_autosummary/mlx.core.DtypeCategory.rst", "python/_autosummary/mlx.core.abs.rst", "python/_autosummary/mlx.core.add.rst", "python/_autosummary/mlx.core.addmm.rst", "python/_autosummary/mlx.core.all.rst", "python/_autosummary/mlx.core.allclose.rst", "python/_autosummary/mlx.core.any.rst", "python/_autosummary/mlx.core.arange.rst", "python/_autosummary/mlx.core.arccos.rst", "python/_autosummary/mlx.core.arccosh.rst", "python/_autosummary/mlx.core.arcsin.rst", "python/_autosummary/mlx.core.arcsinh.rst", "python/_autosummary/mlx.core.arctan.rst", "python/_autosummary/mlx.core.arctan2.rst", "python/_autosummary/mlx.core.arctanh.rst", "python/_autosummary/mlx.core.argmax.rst", "python/_autosummary/mlx.core.argmin.rst", "python/_autosummary/mlx.core.argpartition.rst", "python/_autosummary/mlx.core.argsort.rst", "python/_autosummary/mlx.core.array.rst", "python/_autosummary/mlx.core.array.T.rst", "python/_autosummary/mlx.core.array.abs.rst", "python/_autosummary/mlx.core.array.all.rst", "python/_autosummary/mlx.core.array.any.rst", "python/_autosummary/mlx.core.array.argmax.rst", "python/_autosummary/mlx.core.array.argmin.rst", "python/_autosummary/mlx.core.array.astype.rst", "python/_autosummary/mlx.core.array.at.rst", "python/_autosummary/mlx.core.array.conj.rst", "python/_autosummary/mlx.core.array.cos.rst", "python/_autosummary/mlx.core.array.cummax.rst", "python/_autosummary/mlx.core.array.cummin.rst", "python/_autosummary/mlx.core.array.cumprod.rst", "python/_autosummary/mlx.core.array.cumsum.rst", "python/_autosummary/mlx.core.array.diag.rst", "python/_autosummary/mlx.core.array.diagonal.rst", "python/_autosummary/mlx.core.array.dtype.rst", "python/_autosummary/mlx.core.array.exp.rst", "python/_autosummary/mlx.core.array.flatten.rst", "python/_autosummary/mlx.core.array.item.rst", "python/_autosummary/mlx.core.array.itemsize.rst", 
"python/_autosummary/mlx.core.array.log.rst", "python/_autosummary/mlx.core.array.log10.rst", "python/_autosummary/mlx.core.array.log1p.rst", "python/_autosummary/mlx.core.array.log2.rst", "python/_autosummary/mlx.core.array.logsumexp.rst", "python/_autosummary/mlx.core.array.max.rst", "python/_autosummary/mlx.core.array.mean.rst", "python/_autosummary/mlx.core.array.min.rst", "python/_autosummary/mlx.core.array.moveaxis.rst", "python/_autosummary/mlx.core.array.nbytes.rst", "python/_autosummary/mlx.core.array.ndim.rst", "python/_autosummary/mlx.core.array.prod.rst", "python/_autosummary/mlx.core.array.reciprocal.rst", "python/_autosummary/mlx.core.array.reshape.rst", "python/_autosummary/mlx.core.array.round.rst", "python/_autosummary/mlx.core.array.rsqrt.rst", "python/_autosummary/mlx.core.array.shape.rst", "python/_autosummary/mlx.core.array.sin.rst", "python/_autosummary/mlx.core.array.size.rst", "python/_autosummary/mlx.core.array.split.rst", "python/_autosummary/mlx.core.array.sqrt.rst", "python/_autosummary/mlx.core.array.square.rst", "python/_autosummary/mlx.core.array.squeeze.rst", "python/_autosummary/mlx.core.array.std.rst", "python/_autosummary/mlx.core.array.sum.rst", "python/_autosummary/mlx.core.array.swapaxes.rst", "python/_autosummary/mlx.core.array.tolist.rst", "python/_autosummary/mlx.core.array.transpose.rst", "python/_autosummary/mlx.core.array.var.rst", "python/_autosummary/mlx.core.array.view.rst", "python/_autosummary/mlx.core.array_equal.rst", "python/_autosummary/mlx.core.as_strided.rst", "python/_autosummary/mlx.core.atleast_1d.rst", "python/_autosummary/mlx.core.atleast_2d.rst", "python/_autosummary/mlx.core.atleast_3d.rst", "python/_autosummary/mlx.core.bitwise_and.rst", "python/_autosummary/mlx.core.bitwise_or.rst", "python/_autosummary/mlx.core.bitwise_xor.rst", "python/_autosummary/mlx.core.block_masked_mm.rst", "python/_autosummary/mlx.core.broadcast_to.rst", "python/_autosummary/mlx.core.ceil.rst", 
"python/_autosummary/mlx.core.clip.rst", "python/_autosummary/mlx.core.compile.rst", "python/_autosummary/mlx.core.concatenate.rst", "python/_autosummary/mlx.core.conj.rst", "python/_autosummary/mlx.core.conjugate.rst", "python/_autosummary/mlx.core.conv1d.rst", "python/_autosummary/mlx.core.conv2d.rst", "python/_autosummary/mlx.core.conv3d.rst", "python/_autosummary/mlx.core.conv_general.rst", "python/_autosummary/mlx.core.conv_transpose1d.rst", "python/_autosummary/mlx.core.conv_transpose2d.rst", "python/_autosummary/mlx.core.conv_transpose3d.rst", "python/_autosummary/mlx.core.convolve.rst", "python/_autosummary/mlx.core.cos.rst", "python/_autosummary/mlx.core.cosh.rst", "python/_autosummary/mlx.core.cummax.rst", "python/_autosummary/mlx.core.cummin.rst", "python/_autosummary/mlx.core.cumprod.rst", "python/_autosummary/mlx.core.cumsum.rst", "python/_autosummary/mlx.core.custom_function.rst", "python/_autosummary/mlx.core.default_device.rst", "python/_autosummary/mlx.core.default_stream.rst", "python/_autosummary/mlx.core.degrees.rst", "python/_autosummary/mlx.core.dequantize.rst", "python/_autosummary/mlx.core.diag.rst", "python/_autosummary/mlx.core.diagonal.rst", "python/_autosummary/mlx.core.disable_compile.rst", "python/_autosummary/mlx.core.distributed.Group.rst", "python/_autosummary/mlx.core.distributed.all_gather.rst", "python/_autosummary/mlx.core.distributed.all_sum.rst", "python/_autosummary/mlx.core.distributed.init.rst", "python/_autosummary/mlx.core.distributed.is_available.rst", "python/_autosummary/mlx.core.distributed.recv.rst", "python/_autosummary/mlx.core.distributed.recv_like.rst", "python/_autosummary/mlx.core.distributed.send.rst", "python/_autosummary/mlx.core.divide.rst", "python/_autosummary/mlx.core.divmod.rst", "python/_autosummary/mlx.core.einsum.rst", "python/_autosummary/mlx.core.einsum_path.rst", "python/_autosummary/mlx.core.enable_compile.rst", "python/_autosummary/mlx.core.equal.rst", "python/_autosummary/mlx.core.erf.rst", 
"python/_autosummary/mlx.core.erfinv.rst", "python/_autosummary/mlx.core.eval.rst", "python/_autosummary/mlx.core.exp.rst", "python/_autosummary/mlx.core.expand_dims.rst", "python/_autosummary/mlx.core.expm1.rst", "python/_autosummary/mlx.core.eye.rst", "python/_autosummary/mlx.core.fast.layer_norm.rst", "python/_autosummary/mlx.core.fast.metal_kernel.rst", "python/_autosummary/mlx.core.fast.rms_norm.rst", "python/_autosummary/mlx.core.fast.rope.rst", "python/_autosummary/mlx.core.fast.scaled_dot_product_attention.rst", "python/_autosummary/mlx.core.fft.fft.rst", "python/_autosummary/mlx.core.fft.fft2.rst", "python/_autosummary/mlx.core.fft.fftn.rst", "python/_autosummary/mlx.core.fft.ifft.rst", "python/_autosummary/mlx.core.fft.ifft2.rst", "python/_autosummary/mlx.core.fft.ifftn.rst", "python/_autosummary/mlx.core.fft.irfft.rst", "python/_autosummary/mlx.core.fft.irfft2.rst", "python/_autosummary/mlx.core.fft.irfftn.rst", "python/_autosummary/mlx.core.fft.rfft.rst", "python/_autosummary/mlx.core.fft.rfft2.rst", "python/_autosummary/mlx.core.fft.rfftn.rst", "python/_autosummary/mlx.core.flatten.rst", "python/_autosummary/mlx.core.floor.rst", "python/_autosummary/mlx.core.floor_divide.rst", "python/_autosummary/mlx.core.full.rst", "python/_autosummary/mlx.core.gather_mm.rst", "python/_autosummary/mlx.core.gather_qmm.rst", "python/_autosummary/mlx.core.grad.rst", "python/_autosummary/mlx.core.greater.rst", "python/_autosummary/mlx.core.greater_equal.rst", "python/_autosummary/mlx.core.hadamard_transform.rst", "python/_autosummary/mlx.core.identity.rst", "python/_autosummary/mlx.core.imag.rst", "python/_autosummary/mlx.core.inner.rst", "python/_autosummary/mlx.core.isclose.rst", "python/_autosummary/mlx.core.isfinite.rst", "python/_autosummary/mlx.core.isinf.rst", "python/_autosummary/mlx.core.isnan.rst", "python/_autosummary/mlx.core.isneginf.rst", "python/_autosummary/mlx.core.isposinf.rst", "python/_autosummary/mlx.core.issubdtype.rst", 
"python/_autosummary/mlx.core.jvp.rst", "python/_autosummary/mlx.core.left_shift.rst", "python/_autosummary/mlx.core.less.rst", "python/_autosummary/mlx.core.less_equal.rst", "python/_autosummary/mlx.core.linalg.cholesky.rst", "python/_autosummary/mlx.core.linalg.cholesky_inv.rst", "python/_autosummary/mlx.core.linalg.cross.rst", "python/_autosummary/mlx.core.linalg.eigh.rst", "python/_autosummary/mlx.core.linalg.eigvalsh.rst", "python/_autosummary/mlx.core.linalg.inv.rst", "python/_autosummary/mlx.core.linalg.norm.rst", "python/_autosummary/mlx.core.linalg.qr.rst", "python/_autosummary/mlx.core.linalg.svd.rst", "python/_autosummary/mlx.core.linalg.tri_inv.rst", "python/_autosummary/mlx.core.linspace.rst", "python/_autosummary/mlx.core.load.rst", "python/_autosummary/mlx.core.log.rst", "python/_autosummary/mlx.core.log10.rst", "python/_autosummary/mlx.core.log1p.rst", "python/_autosummary/mlx.core.log2.rst", "python/_autosummary/mlx.core.logaddexp.rst", "python/_autosummary/mlx.core.logical_and.rst", "python/_autosummary/mlx.core.logical_not.rst", "python/_autosummary/mlx.core.logical_or.rst", "python/_autosummary/mlx.core.logsumexp.rst", "python/_autosummary/mlx.core.matmul.rst", "python/_autosummary/mlx.core.max.rst", "python/_autosummary/mlx.core.maximum.rst", "python/_autosummary/mlx.core.mean.rst", "python/_autosummary/mlx.core.meshgrid.rst", "python/_autosummary/mlx.core.metal.clear_cache.rst", "python/_autosummary/mlx.core.metal.device_info.rst", "python/_autosummary/mlx.core.metal.get_active_memory.rst", "python/_autosummary/mlx.core.metal.get_cache_memory.rst", "python/_autosummary/mlx.core.metal.get_peak_memory.rst", "python/_autosummary/mlx.core.metal.is_available.rst", "python/_autosummary/mlx.core.metal.reset_peak_memory.rst", "python/_autosummary/mlx.core.metal.set_cache_limit.rst", "python/_autosummary/mlx.core.metal.set_memory_limit.rst", "python/_autosummary/mlx.core.metal.set_wired_limit.rst", 
"python/_autosummary/mlx.core.metal.start_capture.rst", "python/_autosummary/mlx.core.metal.stop_capture.rst", "python/_autosummary/mlx.core.min.rst", "python/_autosummary/mlx.core.minimum.rst", "python/_autosummary/mlx.core.moveaxis.rst", "python/_autosummary/mlx.core.multiply.rst", "python/_autosummary/mlx.core.nan_to_num.rst", "python/_autosummary/mlx.core.negative.rst", "python/_autosummary/mlx.core.new_stream.rst", "python/_autosummary/mlx.core.not_equal.rst", "python/_autosummary/mlx.core.ones.rst", "python/_autosummary/mlx.core.ones_like.rst", "python/_autosummary/mlx.core.outer.rst", "python/_autosummary/mlx.core.pad.rst", "python/_autosummary/mlx.core.partition.rst", "python/_autosummary/mlx.core.power.rst", "python/_autosummary/mlx.core.prod.rst", "python/_autosummary/mlx.core.put_along_axis.rst", "python/_autosummary/mlx.core.quantize.rst", "python/_autosummary/mlx.core.quantized_matmul.rst", "python/_autosummary/mlx.core.radians.rst", "python/_autosummary/mlx.core.random.bernoulli.rst", "python/_autosummary/mlx.core.random.categorical.rst", "python/_autosummary/mlx.core.random.gumbel.rst", "python/_autosummary/mlx.core.random.key.rst", "python/_autosummary/mlx.core.random.laplace.rst", "python/_autosummary/mlx.core.random.multivariate_normal.rst", "python/_autosummary/mlx.core.random.normal.rst", "python/_autosummary/mlx.core.random.permutation.rst", "python/_autosummary/mlx.core.random.randint.rst", "python/_autosummary/mlx.core.random.seed.rst", "python/_autosummary/mlx.core.random.split.rst", "python/_autosummary/mlx.core.random.truncated_normal.rst", "python/_autosummary/mlx.core.random.uniform.rst", "python/_autosummary/mlx.core.real.rst", "python/_autosummary/mlx.core.reciprocal.rst", "python/_autosummary/mlx.core.remainder.rst", "python/_autosummary/mlx.core.repeat.rst", "python/_autosummary/mlx.core.reshape.rst", "python/_autosummary/mlx.core.right_shift.rst", "python/_autosummary/mlx.core.roll.rst", "python/_autosummary/mlx.core.round.rst", 
"python/_autosummary/mlx.core.rsqrt.rst", "python/_autosummary/mlx.core.save.rst", "python/_autosummary/mlx.core.save_gguf.rst", "python/_autosummary/mlx.core.save_safetensors.rst", "python/_autosummary/mlx.core.savez.rst", "python/_autosummary/mlx.core.savez_compressed.rst", "python/_autosummary/mlx.core.set_default_device.rst", "python/_autosummary/mlx.core.set_default_stream.rst", "python/_autosummary/mlx.core.sigmoid.rst", "python/_autosummary/mlx.core.sign.rst", "python/_autosummary/mlx.core.sin.rst", "python/_autosummary/mlx.core.sinh.rst", "python/_autosummary/mlx.core.softmax.rst", "python/_autosummary/mlx.core.sort.rst", "python/_autosummary/mlx.core.split.rst", "python/_autosummary/mlx.core.sqrt.rst", "python/_autosummary/mlx.core.square.rst", "python/_autosummary/mlx.core.squeeze.rst", "python/_autosummary/mlx.core.stack.rst", "python/_autosummary/mlx.core.std.rst", "python/_autosummary/mlx.core.stop_gradient.rst", "python/_autosummary/mlx.core.stream.rst", "python/_autosummary/mlx.core.subtract.rst", "python/_autosummary/mlx.core.sum.rst", "python/_autosummary/mlx.core.swapaxes.rst", "python/_autosummary/mlx.core.synchronize.rst", "python/_autosummary/mlx.core.take.rst", "python/_autosummary/mlx.core.take_along_axis.rst", "python/_autosummary/mlx.core.tan.rst", "python/_autosummary/mlx.core.tanh.rst", "python/_autosummary/mlx.core.tensordot.rst", "python/_autosummary/mlx.core.tile.rst", "python/_autosummary/mlx.core.topk.rst", "python/_autosummary/mlx.core.trace.rst", "python/_autosummary/mlx.core.transpose.rst", "python/_autosummary/mlx.core.tri.rst", "python/_autosummary/mlx.core.tril.rst", "python/_autosummary/mlx.core.triu.rst", "python/_autosummary/mlx.core.value_and_grad.rst", "python/_autosummary/mlx.core.var.rst", "python/_autosummary/mlx.core.view.rst", "python/_autosummary/mlx.core.vjp.rst", "python/_autosummary/mlx.core.vmap.rst", "python/_autosummary/mlx.core.where.rst", "python/_autosummary/mlx.core.zeros.rst", 
"python/_autosummary/mlx.core.zeros_like.rst", "python/_autosummary/mlx.nn.quantize.rst", "python/_autosummary/mlx.nn.value_and_grad.rst", "python/_autosummary/mlx.optimizers.clip_grad_norm.rst", "python/_autosummary/mlx.utils.tree_flatten.rst", "python/_autosummary/mlx.utils.tree_map.rst", "python/_autosummary/mlx.utils.tree_map_with_path.rst", "python/_autosummary/mlx.utils.tree_reduce.rst", "python/_autosummary/mlx.utils.tree_unflatten.rst", "python/_autosummary/stream_class.rst", "python/array.rst", "python/data_types.rst", "python/devices_and_streams.rst", "python/distributed.rst", "python/fast.rst", "python/fft.rst", "python/linalg.rst", "python/metal.rst", "python/nn.rst", "python/nn/_autosummary/mlx.nn.ALiBi.rst", "python/nn/_autosummary/mlx.nn.AvgPool1d.rst", "python/nn/_autosummary/mlx.nn.AvgPool2d.rst", "python/nn/_autosummary/mlx.nn.AvgPool3d.rst", "python/nn/_autosummary/mlx.nn.BatchNorm.rst", "python/nn/_autosummary/mlx.nn.CELU.rst", "python/nn/_autosummary/mlx.nn.Conv1d.rst", "python/nn/_autosummary/mlx.nn.Conv2d.rst", "python/nn/_autosummary/mlx.nn.Conv3d.rst", "python/nn/_autosummary/mlx.nn.ConvTranspose1d.rst", "python/nn/_autosummary/mlx.nn.ConvTranspose2d.rst", "python/nn/_autosummary/mlx.nn.ConvTranspose3d.rst", "python/nn/_autosummary/mlx.nn.Dropout.rst", "python/nn/_autosummary/mlx.nn.Dropout2d.rst", "python/nn/_autosummary/mlx.nn.Dropout3d.rst", "python/nn/_autosummary/mlx.nn.ELU.rst", "python/nn/_autosummary/mlx.nn.Embedding.rst", "python/nn/_autosummary/mlx.nn.GELU.rst", "python/nn/_autosummary/mlx.nn.GLU.rst", "python/nn/_autosummary/mlx.nn.GRU.rst", "python/nn/_autosummary/mlx.nn.GroupNorm.rst", "python/nn/_autosummary/mlx.nn.HardShrink.rst", "python/nn/_autosummary/mlx.nn.HardTanh.rst", "python/nn/_autosummary/mlx.nn.Hardswish.rst", "python/nn/_autosummary/mlx.nn.InstanceNorm.rst", "python/nn/_autosummary/mlx.nn.LSTM.rst", "python/nn/_autosummary/mlx.nn.LayerNorm.rst", "python/nn/_autosummary/mlx.nn.LeakyReLU.rst", 
"python/nn/_autosummary/mlx.nn.Linear.rst", "python/nn/_autosummary/mlx.nn.LogSigmoid.rst", "python/nn/_autosummary/mlx.nn.LogSoftmax.rst", "python/nn/_autosummary/mlx.nn.MaxPool1d.rst", "python/nn/_autosummary/mlx.nn.MaxPool2d.rst", "python/nn/_autosummary/mlx.nn.MaxPool3d.rst", "python/nn/_autosummary/mlx.nn.Mish.rst", "python/nn/_autosummary/mlx.nn.Module.apply.rst", "python/nn/_autosummary/mlx.nn.Module.apply_to_modules.rst", "python/nn/_autosummary/mlx.nn.Module.children.rst", "python/nn/_autosummary/mlx.nn.Module.eval.rst", "python/nn/_autosummary/mlx.nn.Module.filter_and_map.rst", "python/nn/_autosummary/mlx.nn.Module.freeze.rst", "python/nn/_autosummary/mlx.nn.Module.leaf_modules.rst", "python/nn/_autosummary/mlx.nn.Module.load_weights.rst", "python/nn/_autosummary/mlx.nn.Module.modules.rst", "python/nn/_autosummary/mlx.nn.Module.named_modules.rst", "python/nn/_autosummary/mlx.nn.Module.parameters.rst", "python/nn/_autosummary/mlx.nn.Module.save_weights.rst", "python/nn/_autosummary/mlx.nn.Module.set_dtype.rst", "python/nn/_autosummary/mlx.nn.Module.state.rst", "python/nn/_autosummary/mlx.nn.Module.train.rst", "python/nn/_autosummary/mlx.nn.Module.trainable_parameters.rst", "python/nn/_autosummary/mlx.nn.Module.training.rst", "python/nn/_autosummary/mlx.nn.Module.unfreeze.rst", "python/nn/_autosummary/mlx.nn.Module.update.rst", "python/nn/_autosummary/mlx.nn.Module.update_modules.rst", "python/nn/_autosummary/mlx.nn.MultiHeadAttention.rst", "python/nn/_autosummary/mlx.nn.PReLU.rst", "python/nn/_autosummary/mlx.nn.QuantizedEmbedding.rst", "python/nn/_autosummary/mlx.nn.QuantizedLinear.rst", "python/nn/_autosummary/mlx.nn.RMSNorm.rst", "python/nn/_autosummary/mlx.nn.RNN.rst", "python/nn/_autosummary/mlx.nn.ReLU.rst", "python/nn/_autosummary/mlx.nn.ReLU6.rst", "python/nn/_autosummary/mlx.nn.RoPE.rst", "python/nn/_autosummary/mlx.nn.SELU.rst", "python/nn/_autosummary/mlx.nn.Sequential.rst", "python/nn/_autosummary/mlx.nn.SiLU.rst", 
"python/nn/_autosummary/mlx.nn.Sigmoid.rst", "python/nn/_autosummary/mlx.nn.SinusoidalPositionalEncoding.rst", "python/nn/_autosummary/mlx.nn.Softmax.rst", "python/nn/_autosummary/mlx.nn.Softmin.rst", "python/nn/_autosummary/mlx.nn.Softplus.rst", "python/nn/_autosummary/mlx.nn.Softshrink.rst", "python/nn/_autosummary/mlx.nn.Softsign.rst", "python/nn/_autosummary/mlx.nn.Step.rst", "python/nn/_autosummary/mlx.nn.Tanh.rst", "python/nn/_autosummary/mlx.nn.Transformer.rst", "python/nn/_autosummary/mlx.nn.Upsample.rst", "python/nn/_autosummary/mlx.nn.init.constant.rst", "python/nn/_autosummary/mlx.nn.init.glorot_normal.rst", "python/nn/_autosummary/mlx.nn.init.glorot_uniform.rst", "python/nn/_autosummary/mlx.nn.init.he_normal.rst", "python/nn/_autosummary/mlx.nn.init.he_uniform.rst", "python/nn/_autosummary/mlx.nn.init.identity.rst", "python/nn/_autosummary/mlx.nn.init.normal.rst", "python/nn/_autosummary/mlx.nn.init.uniform.rst", "python/nn/_autosummary_functions/mlx.nn.celu.rst", "python/nn/_autosummary_functions/mlx.nn.elu.rst", "python/nn/_autosummary_functions/mlx.nn.gelu.rst", "python/nn/_autosummary_functions/mlx.nn.gelu_approx.rst", "python/nn/_autosummary_functions/mlx.nn.gelu_fast_approx.rst", "python/nn/_autosummary_functions/mlx.nn.glu.rst", "python/nn/_autosummary_functions/mlx.nn.hard_shrink.rst", "python/nn/_autosummary_functions/mlx.nn.hard_tanh.rst", "python/nn/_autosummary_functions/mlx.nn.hardswish.rst", "python/nn/_autosummary_functions/mlx.nn.leaky_relu.rst", "python/nn/_autosummary_functions/mlx.nn.log_sigmoid.rst", "python/nn/_autosummary_functions/mlx.nn.log_softmax.rst", "python/nn/_autosummary_functions/mlx.nn.losses.binary_cross_entropy.rst", "python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.cross_entropy.rst", "python/nn/_autosummary_functions/mlx.nn.losses.gaussian_nll_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.hinge_loss.rst", 
"python/nn/_autosummary_functions/mlx.nn.losses.huber_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.kl_div_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.l1_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.log_cosh_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.margin_ranking_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.mse_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.nll_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.smooth_l1_loss.rst", "python/nn/_autosummary_functions/mlx.nn.losses.triplet_loss.rst", "python/nn/_autosummary_functions/mlx.nn.mish.rst", "python/nn/_autosummary_functions/mlx.nn.prelu.rst", "python/nn/_autosummary_functions/mlx.nn.relu.rst", "python/nn/_autosummary_functions/mlx.nn.relu6.rst", "python/nn/_autosummary_functions/mlx.nn.selu.rst", "python/nn/_autosummary_functions/mlx.nn.sigmoid.rst", "python/nn/_autosummary_functions/mlx.nn.silu.rst", "python/nn/_autosummary_functions/mlx.nn.softmax.rst", "python/nn/_autosummary_functions/mlx.nn.softmin.rst", "python/nn/_autosummary_functions/mlx.nn.softplus.rst", "python/nn/_autosummary_functions/mlx.nn.softshrink.rst", "python/nn/_autosummary_functions/mlx.nn.step.rst", "python/nn/_autosummary_functions/mlx.nn.tanh.rst", "python/nn/functions.rst", "python/nn/init.rst", "python/nn/layers.rst", "python/nn/losses.rst", "python/nn/module.rst", "python/ops.rst", "python/optimizers.rst", "python/optimizers/_autosummary/mlx.optimizers.AdaDelta.rst", "python/optimizers/_autosummary/mlx.optimizers.Adafactor.rst", "python/optimizers/_autosummary/mlx.optimizers.Adagrad.rst", "python/optimizers/_autosummary/mlx.optimizers.Adam.rst", "python/optimizers/_autosummary/mlx.optimizers.AdamW.rst", "python/optimizers/_autosummary/mlx.optimizers.Adamax.rst", "python/optimizers/_autosummary/mlx.optimizers.Lion.rst", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.apply_gradients.rst", 
"python/optimizers/_autosummary/mlx.optimizers.Optimizer.init.rst", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.state.rst", "python/optimizers/_autosummary/mlx.optimizers.Optimizer.update.rst", "python/optimizers/_autosummary/mlx.optimizers.RMSprop.rst", "python/optimizers/_autosummary/mlx.optimizers.SGD.rst", "python/optimizers/_autosummary/mlx.optimizers.cosine_decay.rst", "python/optimizers/_autosummary/mlx.optimizers.exponential_decay.rst", "python/optimizers/_autosummary/mlx.optimizers.join_schedules.rst", "python/optimizers/_autosummary/mlx.optimizers.linear_schedule.rst", "python/optimizers/_autosummary/mlx.optimizers.step_decay.rst", "python/optimizers/common_optimizers.rst", "python/optimizers/optimizer.rst", "python/optimizers/schedulers.rst", "python/random.rst", "python/transforms.rst", "python/tree_utils.rst", "usage/compile.rst", "usage/distributed.rst", "usage/function_transforms.rst", "usage/indexing.rst", "usage/lazy_evaluation.rst", "usage/numpy.rst", "usage/quick_start.rst", "usage/saving_and_loading.rst", "usage/unified_memory.rst", "usage/using_streams.rst"], "indexentries": {"__init__() (array method)": [[30, "mlx.core.array.__init__", false]], "__init__() (custom_function method)": [[112, "mlx.core.custom_function.__init__", false]], "__init__() (device method)": [[9, "mlx.core.Device.__init__", false]], "__init__() (dtype method)": [[10, "mlx.core.Dtype.__init__", false]], "__init__() (dtypecategory method)": [[11, "mlx.core.DtypeCategory.__init__", false]], "__init__() (group method)": [[120, "mlx.core.distributed.Group.__init__", false]], "__init__() (stream method)": [[314, "mlx.core.Stream.__init__", false]], "abs (c++ function)": [[0, "_CPPv43absRK5array14StreamOrDevice", false]], "abs() (array method)": [[32, "mlx.core.array.abs", false]], "abs() (in module mlx.core)": [[12, "mlx.core.abs", false]], "adadelta (class in mlx.optimizers)": [[456, "mlx.optimizers.AdaDelta", false]], "adafactor (class in mlx.optimizers)": 
[[457, "mlx.optimizers.Adafactor", false]], "adagrad (class in mlx.optimizers)": [[458, "mlx.optimizers.Adagrad", false]], "adam (class in mlx.optimizers)": [[459, "mlx.optimizers.Adam", false]], "adamax (class in mlx.optimizers)": [[461, "mlx.optimizers.Adamax", false]], "adamw (class in mlx.optimizers)": [[460, "mlx.optimizers.AdamW", false]], "add (c++ function)": [[0, "_CPPv43addRK5arrayRK5array14StreamOrDevice", false]], "add() (in module mlx.core)": [[13, "mlx.core.add", false]], "addmm (c++ function)": [[0, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", false]], "addmm() (in module mlx.core)": [[14, "mlx.core.addmm", false]], "alibi (class in mlx.nn)": [[324, "mlx.nn.ALiBi", false]], "all (c++ function)": [[0, "_CPPv43allRK5array14StreamOrDevice", false], [0, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43allRK5arrayb14StreamOrDevice", false], [0, "_CPPv43allRK5arrayib14StreamOrDevice", false]], "all() (array method)": [[33, "mlx.core.array.all", false]], "all() (in module mlx.core)": [[15, "mlx.core.all", false]], "all_gather() (in module mlx.core.distributed)": [[121, "mlx.core.distributed.all_gather", false]], "all_sum() (in module mlx.core.distributed)": [[122, "mlx.core.distributed.all_sum", false]], "allclose (c++ function)": [[0, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", false]], "allclose() (in module mlx.core)": [[16, "mlx.core.allclose", false]], "any (c++ function)": [[0, "_CPPv43anyRK5array14StreamOrDevice", false], [0, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43anyRK5arrayb14StreamOrDevice", false], [0, "_CPPv43anyRK5arrayib14StreamOrDevice", false]], "any() (array method)": [[34, "mlx.core.array.any", false]], "any() (in module mlx.core)": [[17, "mlx.core.any", false]], "apply() (module method)": [[359, "mlx.nn.Module.apply", false]], "apply_gradients() (optimizer method)": [[463, "mlx.optimizers.Optimizer.apply_gradients", false]], "apply_to_modules() (module 
method)": [[360, "mlx.nn.Module.apply_to_modules", false]], "arange (c++ function)": [[0, "_CPPv46aranged14StreamOrDevice", false], [0, "_CPPv46aranged5Dtype14StreamOrDevice", false], [0, "_CPPv46arangedd14StreamOrDevice", false], [0, "_CPPv46arangedd5Dtype14StreamOrDevice", false], [0, "_CPPv46arangeddd14StreamOrDevice", false], [0, "_CPPv46arangeddd5Dtype14StreamOrDevice", false], [0, "_CPPv46arangei14StreamOrDevice", false], [0, "_CPPv46arangeii14StreamOrDevice", false], [0, "_CPPv46arangeiii14StreamOrDevice", false]], "arange() (in module mlx.core)": [[18, "mlx.core.arange", false]], "arccos (c++ function)": [[0, "_CPPv46arccosRK5array14StreamOrDevice", false]], "arccos() (in module mlx.core)": [[19, "mlx.core.arccos", false]], "arccosh (c++ function)": [[0, "_CPPv47arccoshRK5array14StreamOrDevice", false]], "arccosh() (in module mlx.core)": [[20, "mlx.core.arccosh", false]], "arcsin (c++ function)": [[0, "_CPPv46arcsinRK5array14StreamOrDevice", false]], "arcsin() (in module mlx.core)": [[21, "mlx.core.arcsin", false]], "arcsinh (c++ function)": [[0, "_CPPv47arcsinhRK5array14StreamOrDevice", false]], "arcsinh() (in module mlx.core)": [[22, "mlx.core.arcsinh", false]], "arctan (c++ function)": [[0, "_CPPv46arctanRK5array14StreamOrDevice", false]], "arctan() (in module mlx.core)": [[23, "mlx.core.arctan", false]], "arctan2 (c++ function)": [[0, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", false]], "arctan2() (in module mlx.core)": [[24, "mlx.core.arctan2", false]], "arctanh (c++ function)": [[0, "_CPPv47arctanhRK5array14StreamOrDevice", false]], "arctanh() (in module mlx.core)": [[25, "mlx.core.arctanh", false]], "argmax (c++ function)": [[0, "_CPPv46argmaxRK5array14StreamOrDevice", false], [0, "_CPPv46argmaxRK5arrayb14StreamOrDevice", false], [0, "_CPPv46argmaxRK5arrayib14StreamOrDevice", false]], "argmax() (array method)": [[35, "mlx.core.array.argmax", false]], "argmax() (in module mlx.core)": [[26, "mlx.core.argmax", false]], "argmin (c++ function)": 
[[0, "_CPPv46argminRK5array14StreamOrDevice", false], [0, "_CPPv46argminRK5arrayb14StreamOrDevice", false], [0, "_CPPv46argminRK5arrayib14StreamOrDevice", false]], "argmin() (array method)": [[36, "mlx.core.array.argmin", false]], "argmin() (in module mlx.core)": [[27, "mlx.core.argmin", false]], "argpartition (c++ function)": [[0, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", false], [0, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", false]], "argpartition() (in module mlx.core)": [[28, "mlx.core.argpartition", false]], "argsort (c++ function)": [[0, "_CPPv47argsortRK5array14StreamOrDevice", false], [0, "_CPPv47argsortRK5arrayi14StreamOrDevice", false]], "argsort() (in module mlx.core)": [[29, "mlx.core.argsort", false]], "array (class in mlx.core)": [[30, "mlx.core.array", false]], "array_equal (c++ function)": [[0, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", false], [0, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", false]], "array_equal() (in module mlx.core)": [[82, "mlx.core.array_equal", false]], "as_strided (c++ function)": [[0, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", false]], "as_strided() (in module mlx.core)": [[83, "mlx.core.as_strided", false]], "astype (c++ function)": [[0, "_CPPv46astype5array5Dtype14StreamOrDevice", false]], "astype() (array method)": [[37, "mlx.core.array.astype", false]], "at (array property)": [[38, "mlx.core.array.at", false]], "atleast_1d (c++ function)": [[0, "_CPPv410atleast_1dRK5array14StreamOrDevice", false], [0, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "atleast_1d() (in module mlx.core)": [[84, "mlx.core.atleast_1d", false]], "atleast_2d (c++ function)": [[0, "_CPPv410atleast_2dRK5array14StreamOrDevice", false], [0, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "atleast_2d() (in module mlx.core)": [[85, "mlx.core.atleast_2d", false]], "atleast_3d (c++ function)": [[0, 
"_CPPv410atleast_3dRK5array14StreamOrDevice", false], [0, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "atleast_3d() (in module mlx.core)": [[86, "mlx.core.atleast_3d", false]], "avgpool1d (class in mlx.nn)": [[325, "mlx.nn.AvgPool1d", false]], "avgpool2d (class in mlx.nn)": [[326, "mlx.nn.AvgPool2d", false]], "avgpool3d (class in mlx.nn)": [[327, "mlx.nn.AvgPool3d", false]], "batchnorm (class in mlx.nn)": [[328, "mlx.nn.BatchNorm", false]], "bernoulli() (in module mlx.core.random)": [[239, "mlx.core.random.bernoulli", false]], "binary_cross_entropy (class in mlx.nn.losses)": [[422, "mlx.nn.losses.binary_cross_entropy", false]], "bitwise_and (c++ function)": [[0, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", false]], "bitwise_and() (in module mlx.core)": [[87, "mlx.core.bitwise_and", false]], "bitwise_or (c++ function)": [[0, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", false]], "bitwise_or() (in module mlx.core)": [[88, "mlx.core.bitwise_or", false]], "bitwise_xor (c++ function)": [[0, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", false]], "bitwise_xor() (in module mlx.core)": [[89, "mlx.core.bitwise_xor", false]], "block_masked_mm (c++ function)": [[0, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", false]], "block_masked_mm() (in module mlx.core)": [[90, "mlx.core.block_masked_mm", false]], "broadcast_arrays (c++ function)": [[0, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", false]], "broadcast_to (c++ function)": [[0, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "broadcast_to() (in module mlx.core)": [[91, "mlx.core.broadcast_to", false]], "categorical() (in module mlx.core.random)": [[240, "mlx.core.random.categorical", false]], "ceil (c++ function)": [[0, "_CPPv44ceilRK5array14StreamOrDevice", false]], "ceil() (in module mlx.core)": [[92, "mlx.core.ceil", false]], "celu (class in mlx.nn)": 
[[329, "mlx.nn.CELU", false], [410, "mlx.nn.celu", false]], "children() (module method)": [[361, "mlx.nn.Module.children", false]], "cholesky() (in module mlx.core.linalg)": [[182, "mlx.core.linalg.cholesky", false]], "cholesky_inv() (in module mlx.core.linalg)": [[183, "mlx.core.linalg.cholesky_inv", false]], "clear_cache() (in module mlx.core.metal)": [[208, "mlx.core.metal.clear_cache", false]], "clip (c++ function)": [[0, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", false]], "clip() (in module mlx.core)": [[93, "mlx.core.clip", false]], "clip_grad_norm() (in module mlx.optimizers)": [[308, "mlx.optimizers.clip_grad_norm", false]], "compile() (in module mlx.core)": [[94, "mlx.core.compile", false]], "concatenate (c++ function)": [[0, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", false], [0, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", false]], "concatenate() (in module mlx.core)": [[95, "mlx.core.concatenate", false]], "conj() (array method)": [[39, "mlx.core.array.conj", false]], "conj() (in module mlx.core)": [[96, "mlx.core.conj", false]], "conjugate (c++ function)": [[0, "_CPPv49conjugateRK5array14StreamOrDevice", false]], "conjugate() (in module mlx.core)": [[97, "mlx.core.conjugate", false]], "constant() (in module mlx.nn.init)": [[402, "mlx.nn.init.constant", false]], "contiguous (c++ function)": [[0, "_CPPv410contiguousRK5arrayb14StreamOrDevice", false]], "conv1d (c++ function)": [[0, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", false]], "conv1d (class in mlx.nn)": [[330, "mlx.nn.Conv1d", false]], "conv1d() (in module mlx.core)": [[98, "mlx.core.conv1d", false]], "conv2d (c++ function)": [[0, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", false]], "conv2d (class in mlx.nn)": [[331, "mlx.nn.Conv2d", false]], "conv2d() (in module mlx.core)": [[99, "mlx.core.conv2d", false]], "conv3d (c++ function)": [[0, 
"_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", false]], "conv3d (class in mlx.nn)": [[332, "mlx.nn.Conv3d", false]], "conv3d() (in module mlx.core)": [[100, "mlx.core.conv3d", false]], "conv_general (c++ function)": [[0, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", false], [0, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", false]], "conv_general() (in module mlx.core)": [[101, "mlx.core.conv_general", false]], "conv_transpose1d (c++ function)": [[0, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", false]], "conv_transpose1d() (in module mlx.core)": [[102, "mlx.core.conv_transpose1d", false]], "conv_transpose2d (c++ function)": [[0, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", false]], "conv_transpose2d() (in module mlx.core)": [[103, "mlx.core.conv_transpose2d", false]], "conv_transpose3d (c++ function)": [[0, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", false]], "conv_transpose3d() (in module mlx.core)": [[104, "mlx.core.conv_transpose3d", false]], "convolve() (in module mlx.core)": [[105, "mlx.core.convolve", false]], "convtranspose1d (class in mlx.nn)": [[333, "mlx.nn.ConvTranspose1d", false]], "convtranspose2d (class in mlx.nn)": [[334, "mlx.nn.ConvTranspose2d", false]], "convtranspose3d (class in mlx.nn)": [[335, "mlx.nn.ConvTranspose3d", false]], "copy (c++ function)": [[0, "_CPPv44copy5array14StreamOrDevice", false]], "cos (c++ function)": [[0, "_CPPv43cosRK5array14StreamOrDevice", false]], "cos() (array method)": [[40, "mlx.core.array.cos", false]], "cos() (in module mlx.core)": [[106, "mlx.core.cos", false]], "cosh (c++ function)": [[0, "_CPPv44coshRK5array14StreamOrDevice", false]], "cosh() (in module mlx.core)": 
[[107, "mlx.core.cosh", false]], "cosine_decay() (in module mlx.optimizers)": [[469, "mlx.optimizers.cosine_decay", false]], "cosine_similarity_loss (class in mlx.nn.losses)": [[423, "mlx.nn.losses.cosine_similarity_loss", false]], "cross() (in module mlx.core.linalg)": [[184, "mlx.core.linalg.cross", false]], "cross_entropy (class in mlx.nn.losses)": [[424, "mlx.nn.losses.cross_entropy", false]], "cummax (c++ function)": [[0, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", false]], "cummax() (array method)": [[41, "mlx.core.array.cummax", false]], "cummax() (in module mlx.core)": [[108, "mlx.core.cummax", false]], "cummin (c++ function)": [[0, "_CPPv46cumminRK5arrayibb14StreamOrDevice", false]], "cummin() (array method)": [[42, "mlx.core.array.cummin", false]], "cummin() (in module mlx.core)": [[109, "mlx.core.cummin", false]], "cumprod (c++ function)": [[0, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", false]], "cumprod() (array method)": [[43, "mlx.core.array.cumprod", false]], "cumprod() (in module mlx.core)": [[110, "mlx.core.cumprod", false]], "cumsum (c++ function)": [[0, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", false]], "cumsum() (array method)": [[44, "mlx.core.array.cumsum", false]], "cumsum() (in module mlx.core)": [[111, "mlx.core.cumsum", false]], "custom_function (class in mlx.core)": [[112, "mlx.core.custom_function", false]], "default_device() (in module mlx.core)": [[113, "mlx.core.default_device", false]], "default_stream() (in module mlx.core)": [[114, "mlx.core.default_stream", false]], "degrees (c++ function)": [[0, "_CPPv47degreesRK5array14StreamOrDevice", false]], "degrees() (in module mlx.core)": [[115, "mlx.core.degrees", false]], "depends (c++ function)": [[0, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", false]], "dequantize (c++ function)": [[0, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", false]], "dequantize() (in module mlx.core)": [[116, "mlx.core.dequantize", false]], "device (class in mlx.core)": [[9, 
"mlx.core.Device", false]], "device_info() (in module mlx.core.metal)": [[209, "mlx.core.metal.device_info", false]], "diag (c++ function)": [[0, "_CPPv44diagRK5arrayi14StreamOrDevice", false]], "diag() (array method)": [[45, "mlx.core.array.diag", false]], "diag() (in module mlx.core)": [[117, "mlx.core.diag", false]], "diagonal (c++ function)": [[0, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", false]], "diagonal() (array method)": [[46, "mlx.core.array.diagonal", false]], "diagonal() (in module mlx.core)": [[118, "mlx.core.diagonal", false]], "disable_compile() (in module mlx.core)": [[119, "mlx.core.disable_compile", false]], "divide (c++ function)": [[0, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", false]], "divide() (in module mlx.core)": [[128, "mlx.core.divide", false]], "divmod (c++ function)": [[0, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", false]], "divmod() (in module mlx.core)": [[129, "mlx.core.divmod", false]], "dropout (class in mlx.nn)": [[336, "mlx.nn.Dropout", false]], "dropout2d (class in mlx.nn)": [[337, "mlx.nn.Dropout2d", false]], "dropout3d (class in mlx.nn)": [[338, "mlx.nn.Dropout3d", false]], "dtype (array property)": [[47, "mlx.core.array.dtype", false]], "dtype (class in mlx.core)": [[10, "mlx.core.Dtype", false]], "dtypecategory (class in mlx.core)": [[11, "mlx.core.DtypeCategory", false]], "eigh() (in module mlx.core.linalg)": [[185, "mlx.core.linalg.eigh", false]], "eigvalsh() (in module mlx.core.linalg)": [[186, "mlx.core.linalg.eigvalsh", false]], "einsum() (in module mlx.core)": [[130, "mlx.core.einsum", false]], "einsum_path() (in module mlx.core)": [[131, "mlx.core.einsum_path", false]], "elu (class in mlx.nn)": [[339, "mlx.nn.ELU", false], [411, "mlx.nn.elu", false]], "embedding (class in mlx.nn)": [[340, "mlx.nn.Embedding", false]], "enable_compile() (in module mlx.core)": [[132, "mlx.core.enable_compile", false]], "equal (c++ function)": [[0, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", false]], "equal() (in 
module mlx.core)": [[133, "mlx.core.equal", false]], "erf (c++ function)": [[0, "_CPPv43erfRK5array14StreamOrDevice", false]], "erf() (in module mlx.core)": [[134, "mlx.core.erf", false]], "erfinv (c++ function)": [[0, "_CPPv46erfinvRK5array14StreamOrDevice", false]], "erfinv() (in module mlx.core)": [[135, "mlx.core.erfinv", false]], "eval() (in module mlx.core)": [[136, "mlx.core.eval", false]], "eval() (module method)": [[362, "mlx.nn.Module.eval", false]], "exp (c++ function)": [[0, "_CPPv43expRK5array14StreamOrDevice", false]], "exp() (array method)": [[48, "mlx.core.array.exp", false]], "exp() (in module mlx.core)": [[137, "mlx.core.exp", false]], "expand_dims (c++ function)": [[0, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", false]], "expand_dims() (in module mlx.core)": [[138, "mlx.core.expand_dims", false]], "expm1 (c++ function)": [[0, "_CPPv45expm1RK5array14StreamOrDevice", false]], "expm1() (in module mlx.core)": [[139, "mlx.core.expm1", false]], "exponential_decay() (in module mlx.optimizers)": [[470, "mlx.optimizers.exponential_decay", false]], "eye (c++ function)": [[0, "_CPPv43eyei14StreamOrDevice", false], [0, "_CPPv43eyei5Dtype14StreamOrDevice", false], [0, "_CPPv43eyeii14StreamOrDevice", false], [0, "_CPPv43eyeiii14StreamOrDevice", false], [0, "_CPPv43eyeiii5Dtype14StreamOrDevice", false]], "eye() (in module mlx.core)": [[140, "mlx.core.eye", false]], "fft() (in module mlx.core.fft)": [[146, "mlx.core.fft.fft", false]], "fft2() (in module mlx.core.fft)": [[147, "mlx.core.fft.fft2", false]], "fftn() (in module mlx.core.fft)": [[148, "mlx.core.fft.fftn", false]], "filter_and_map() (module method)": [[363, "mlx.nn.Module.filter_and_map", false]], "flatten (c++ function)": [[0, "_CPPv47flattenRK5array14StreamOrDevice", false], [0, "_CPPv47flattenRK5arrayii14StreamOrDevice", false]], "flatten() (array method)": [[49, "mlx.core.array.flatten", false]], "flatten() (in module 
mlx.core)": [[158, "mlx.core.flatten", false]], "floor (c++ function)": [[0, "_CPPv45floorRK5array14StreamOrDevice", false]], "floor() (in module mlx.core)": [[159, "mlx.core.floor", false]], "floor_divide (c++ function)": [[0, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", false]], "floor_divide() (in module mlx.core)": [[160, "mlx.core.floor_divide", false]], "freeze() (module method)": [[364, "mlx.nn.Module.freeze", false]], "full (c++ function)": [[0, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", false], [0, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", false], [0, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", false], [0, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", false]], "full() (in module mlx.core)": [[161, "mlx.core.full", false]], "gather (c++ function)": [[0, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", false]], "gather_mm (c++ function)": [[0, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", false]], "gather_mm() (in module mlx.core)": [[162, "mlx.core.gather_mm", false]], "gather_qmm (c++ function)": [[0, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", false]], "gather_qmm() (in module mlx.core)": [[163, "mlx.core.gather_qmm", false]], "gaussian_nll_loss (class in mlx.nn.losses)": [[425, "mlx.nn.losses.gaussian_nll_loss", false]], "gelu (class in mlx.nn)": [[341, "mlx.nn.GELU", false], [412, "mlx.nn.gelu", false]], "gelu_approx (class in mlx.nn)": [[413, "mlx.nn.gelu_approx", false]], "gelu_fast_approx (class in mlx.nn)": [[414, "mlx.nn.gelu_fast_approx", false]], "get_active_memory() (in module mlx.core.metal)": [[210, "mlx.core.metal.get_active_memory", false]], "get_cache_memory() (in module mlx.core.metal)": [[211, "mlx.core.metal.get_cache_memory", false]], 
"get_peak_memory() (in module mlx.core.metal)": [[212, "mlx.core.metal.get_peak_memory", false]], "glorot_normal() (in module mlx.nn.init)": [[403, "mlx.nn.init.glorot_normal", false]], "glorot_uniform() (in module mlx.nn.init)": [[404, "mlx.nn.init.glorot_uniform", false]], "glu (class in mlx.nn)": [[342, "mlx.nn.GLU", false], [415, "mlx.nn.glu", false]], "grad() (in module mlx.core)": [[164, "mlx.core.grad", false]], "greater (c++ function)": [[0, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", false]], "greater() (in module mlx.core)": [[165, "mlx.core.greater", false]], "greater_equal (c++ function)": [[0, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", false]], "greater_equal() (in module mlx.core)": [[166, "mlx.core.greater_equal", false]], "group (class in mlx.core.distributed)": [[120, "mlx.core.distributed.Group", false]], "groupnorm (class in mlx.nn)": [[344, "mlx.nn.GroupNorm", false]], "gru (class in mlx.nn)": [[343, "mlx.nn.GRU", false]], "gumbel() (in module mlx.core.random)": [[241, "mlx.core.random.gumbel", false]], "hadamard_transform (c++ function)": [[0, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", false]], "hadamard_transform() (in module mlx.core)": [[167, "mlx.core.hadamard_transform", false]], "hard_shrink (class in mlx.nn)": [[416, "mlx.nn.hard_shrink", false]], "hard_tanh (class in mlx.nn)": [[417, "mlx.nn.hard_tanh", false]], "hardshrink (class in mlx.nn)": [[345, "mlx.nn.HardShrink", false]], "hardswish (class in mlx.nn)": [[347, "mlx.nn.Hardswish", false], [418, "mlx.nn.hardswish", false]], "hardtanh (class in mlx.nn)": [[346, "mlx.nn.HardTanh", false]], "he_normal() (in module mlx.nn.init)": [[405, "mlx.nn.init.he_normal", false]], "he_uniform() (in module mlx.nn.init)": [[406, "mlx.nn.init.he_uniform", false]], "hinge_loss (class in mlx.nn.losses)": [[426, "mlx.nn.losses.hinge_loss", false]], "huber_loss (class in mlx.nn.losses)": [[427, "mlx.nn.losses.huber_loss", false]], "identity (c++ function)": 
[[0, "_CPPv48identityi14StreamOrDevice", false], [0, "_CPPv48identityi5Dtype14StreamOrDevice", false]], "identity() (in module mlx.core)": [[168, "mlx.core.identity", false]], "identity() (in module mlx.nn.init)": [[407, "mlx.nn.init.identity", false]], "ifft() (in module mlx.core.fft)": [[149, "mlx.core.fft.ifft", false]], "ifft2() (in module mlx.core.fft)": [[150, "mlx.core.fft.ifft2", false]], "ifftn() (in module mlx.core.fft)": [[151, "mlx.core.fft.ifftn", false]], "imag (c++ function)": [[0, "_CPPv44imagRK5array14StreamOrDevice", false]], "imag() (in module mlx.core)": [[169, "mlx.core.imag", false]], "init() (in module mlx.core.distributed)": [[123, "mlx.core.distributed.init", false]], "init() (optimizer method)": [[464, "mlx.optimizers.Optimizer.init", false]], "inner (c++ function)": [[0, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", false]], "inner() (in module mlx.core)": [[170, "mlx.core.inner", false]], "instancenorm (class in mlx.nn)": [[348, "mlx.nn.InstanceNorm", false]], "inv() (in module mlx.core.linalg)": [[187, "mlx.core.linalg.inv", false]], "irfft() (in module mlx.core.fft)": [[152, "mlx.core.fft.irfft", false]], "irfft2() (in module mlx.core.fft)": [[153, "mlx.core.fft.irfft2", false]], "irfftn() (in module mlx.core.fft)": [[154, "mlx.core.fft.irfftn", false]], "is_available() (in module mlx.core.distributed)": [[124, "mlx.core.distributed.is_available", false]], "is_available() (in module mlx.core.metal)": [[213, "mlx.core.metal.is_available", false]], "isclose (c++ function)": [[0, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", false]], "isclose() (in module mlx.core)": [[171, "mlx.core.isclose", false]], "isfinite (c++ function)": [[0, "_CPPv48isfiniteRK5array14StreamOrDevice", false]], "isfinite() (in module mlx.core)": [[172, "mlx.core.isfinite", false]], "isinf (c++ function)": [[0, "_CPPv45isinfRK5array14StreamOrDevice", false]], "isinf() (in module mlx.core)": [[173, "mlx.core.isinf", false]], "isnan (c++ function)": [[0, 
"_CPPv45isnanRK5array14StreamOrDevice", false]], "isnan() (in module mlx.core)": [[174, "mlx.core.isnan", false]], "isneginf (c++ function)": [[0, "_CPPv48isneginfRK5array14StreamOrDevice", false]], "isneginf() (in module mlx.core)": [[175, "mlx.core.isneginf", false]], "isposinf (c++ function)": [[0, "_CPPv48isposinfRK5array14StreamOrDevice", false]], "isposinf() (in module mlx.core)": [[176, "mlx.core.isposinf", false]], "issubdtype() (in module mlx.core)": [[177, "mlx.core.issubdtype", false]], "item() (array method)": [[50, "mlx.core.array.item", false]], "itemsize (array property)": [[51, "mlx.core.array.itemsize", false]], "join_schedules() (in module mlx.optimizers)": [[471, "mlx.optimizers.join_schedules", false]], "jvp() (in module mlx.core)": [[178, "mlx.core.jvp", false]], "key() (in module mlx.core.random)": [[242, "mlx.core.random.key", false]], "kl_div_loss (class in mlx.nn.losses)": [[428, "mlx.nn.losses.kl_div_loss", false]], "l1_loss (class in mlx.nn.losses)": [[429, "mlx.nn.losses.l1_loss", false]], "laplace() (in module mlx.core.random)": [[243, "mlx.core.random.laplace", false]], "layer_norm() (in module mlx.core.fast)": [[141, "mlx.core.fast.layer_norm", false]], "layernorm (class in mlx.nn)": [[350, "mlx.nn.LayerNorm", false]], "leaf_modules() (module method)": [[365, "mlx.nn.Module.leaf_modules", false]], "leaky_relu (class in mlx.nn)": [[419, "mlx.nn.leaky_relu", false]], "leakyrelu (class in mlx.nn)": [[351, "mlx.nn.LeakyReLU", false]], "left_shift (c++ function)": [[0, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", false]], "left_shift() (in module mlx.core)": [[179, "mlx.core.left_shift", false]], "less (c++ function)": [[0, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", false]], "less() (in module mlx.core)": [[180, "mlx.core.less", false]], "less_equal (c++ function)": [[0, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", false]], "less_equal() (in module mlx.core)": [[181, "mlx.core.less_equal", false]], "linear (class in 
mlx.nn)": [[352, "mlx.nn.Linear", false]], "linear_schedule() (in module mlx.optimizers)": [[472, "mlx.optimizers.linear_schedule", false]], "linspace (c++ function)": [[0, "_CPPv48linspaceddi5Dtype14StreamOrDevice", false]], "linspace() (in module mlx.core)": [[192, "mlx.core.linspace", false]], "lion (class in mlx.optimizers)": [[462, "mlx.optimizers.Lion", false]], "load() (in module mlx.core)": [[193, "mlx.core.load", false]], "load_weights() (module method)": [[366, "mlx.nn.Module.load_weights", false]], "log (c++ function)": [[0, "_CPPv43logRK5array14StreamOrDevice", false]], "log() (array method)": [[52, "mlx.core.array.log", false]], "log() (in module mlx.core)": [[194, "mlx.core.log", false]], "log10 (c++ function)": [[0, "_CPPv45log10RK5array14StreamOrDevice", false]], "log10() (array method)": [[53, "mlx.core.array.log10", false]], "log10() (in module mlx.core)": [[195, "mlx.core.log10", false]], "log1p (c++ function)": [[0, "_CPPv45log1pRK5array14StreamOrDevice", false]], "log1p() (array method)": [[54, "mlx.core.array.log1p", false]], "log1p() (in module mlx.core)": [[196, "mlx.core.log1p", false]], "log2 (c++ function)": [[0, "_CPPv44log2RK5array14StreamOrDevice", false]], "log2() (array method)": [[55, "mlx.core.array.log2", false]], "log2() (in module mlx.core)": [[197, "mlx.core.log2", false]], "log_cosh_loss (class in mlx.nn.losses)": [[430, "mlx.nn.losses.log_cosh_loss", false]], "log_sigmoid (class in mlx.nn)": [[420, "mlx.nn.log_sigmoid", false]], "log_softmax (class in mlx.nn)": [[421, "mlx.nn.log_softmax", false]], "logaddexp (c++ function)": [[0, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", false]], "logaddexp() (in module mlx.core)": [[198, "mlx.core.logaddexp", false]], "logical_and (c++ function)": [[0, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", false]], "logical_and() (in module mlx.core)": [[199, "mlx.core.logical_and", false]], "logical_not (c++ function)": [[0, "_CPPv411logical_notRK5array14StreamOrDevice", false]], 
"logical_not() (in module mlx.core)": [[200, "mlx.core.logical_not", false]], "logical_or (c++ function)": [[0, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", false]], "logical_or() (in module mlx.core)": [[201, "mlx.core.logical_or", false]], "logsigmoid (class in mlx.nn)": [[353, "mlx.nn.LogSigmoid", false]], "logsoftmax (class in mlx.nn)": [[354, "mlx.nn.LogSoftmax", false]], "logsumexp (c++ function)": [[0, "_CPPv49logsumexpRK5array14StreamOrDevice", false], [0, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", false], [0, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", false]], "logsumexp() (array method)": [[56, "mlx.core.array.logsumexp", false]], "logsumexp() (in module mlx.core)": [[202, "mlx.core.logsumexp", false]], "lstm (class in mlx.nn)": [[349, "mlx.nn.LSTM", false]], "margin_ranking_loss (class in mlx.nn.losses)": [[431, "mlx.nn.losses.margin_ranking_loss", false]], "matmul (c++ function)": [[0, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", false]], "matmul() (in module mlx.core)": [[203, "mlx.core.matmul", false]], "max (c++ function)": [[0, "_CPPv43maxRK5array14StreamOrDevice", false], [0, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43maxRK5arrayb14StreamOrDevice", false], [0, "_CPPv43maxRK5arrayib14StreamOrDevice", false]], "max() (array method)": [[57, "mlx.core.array.max", false]], "max() (in module mlx.core)": [[204, "mlx.core.max", false]], "maximum (c++ function)": [[0, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", false]], "maximum() (in module mlx.core)": [[205, "mlx.core.maximum", false]], "maxpool1d (class in mlx.nn)": [[355, "mlx.nn.MaxPool1d", false]], "maxpool2d (class in mlx.nn)": [[356, "mlx.nn.MaxPool2d", false]], "maxpool3d (class in mlx.nn)": [[357, "mlx.nn.MaxPool3d", false]], "mean (c++ function)": [[0, "_CPPv44meanRK5array14StreamOrDevice", false], [0, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, 
"_CPPv44meanRK5arrayb14StreamOrDevice", false], [0, "_CPPv44meanRK5arrayib14StreamOrDevice", false]], "mean() (array method)": [[58, "mlx.core.array.mean", false]], "mean() (in module mlx.core)": [[206, "mlx.core.mean", false]], "meshgrid (c++ function)": [[0, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", false]], "meshgrid() (in module mlx.core)": [[207, "mlx.core.meshgrid", false]], "metal_kernel() (in module mlx.core.fast)": [[142, "mlx.core.fast.metal_kernel", false]], "min (c++ function)": [[0, "_CPPv43minRK5array14StreamOrDevice", false], [0, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43minRK5arrayb14StreamOrDevice", false], [0, "_CPPv43minRK5arrayib14StreamOrDevice", false]], "min() (array method)": [[59, "mlx.core.array.min", false]], "min() (in module mlx.core)": [[220, "mlx.core.min", false]], "minimum (c++ function)": [[0, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", false]], "minimum() (in module mlx.core)": [[221, "mlx.core.minimum", false]], "mish (class in mlx.nn)": [[358, "mlx.nn.Mish", false], [436, "mlx.nn.mish", false]], "module (class in mlx.nn)": [[453, "mlx.nn.Module", false]], "modules() (module method)": [[367, "mlx.nn.Module.modules", false]], "moveaxis (c++ function)": [[0, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", false]], "moveaxis() (array method)": [[60, "mlx.core.array.moveaxis", false]], "moveaxis() (in module mlx.core)": [[222, "mlx.core.moveaxis", false]], "mse_loss (class in mlx.nn.losses)": [[432, "mlx.nn.losses.mse_loss", false]], "multiheadattention (class in mlx.nn)": [[379, "mlx.nn.MultiHeadAttention", false]], "multiply (c++ function)": [[0, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", false]], "multiply() (in module mlx.core)": [[223, "mlx.core.multiply", false]], "multivariate_normal() (in module mlx.core.random)": [[244, "mlx.core.random.multivariate_normal", false]], "named_modules() (module method)": [[368, "mlx.nn.Module.named_modules", false]], 
"nan_to_num (c++ function)": [[0, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", false]], "nan_to_num() (in module mlx.core)": [[224, "mlx.core.nan_to_num", false]], "nbytes (array property)": [[61, "mlx.core.array.nbytes", false]], "ndim (array property)": [[62, "mlx.core.array.ndim", false]], "negative (c++ function)": [[0, "_CPPv48negativeRK5array14StreamOrDevice", false]], "negative() (in module mlx.core)": [[225, "mlx.core.negative", false]], "new_stream() (in module mlx.core)": [[226, "mlx.core.new_stream", false]], "nll_loss (class in mlx.nn.losses)": [[433, "mlx.nn.losses.nll_loss", false]], "norm() (in module mlx.core.linalg)": [[188, "mlx.core.linalg.norm", false]], "normal() (in module mlx.core.random)": [[245, "mlx.core.random.normal", false]], "normal() (in module mlx.nn.init)": [[408, "mlx.nn.init.normal", false]], "not_equal (c++ function)": [[0, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", false]], "not_equal() (in module mlx.core)": [[227, "mlx.core.not_equal", false]], "number_of_elements (c++ function)": [[0, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", false]], "ones (c++ function)": [[0, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", false]], "ones() (in module mlx.core)": [[228, "mlx.core.ones", false]], "ones_like (c++ function)": [[0, "_CPPv49ones_likeRK5array14StreamOrDevice", false]], "ones_like() (in module mlx.core)": [[229, "mlx.core.ones_like", false]], "operator!= (c++ function)": [[0, "_CPPv4I0Ene5array1TRK5array", false], [0, "_CPPv4I0Ene5arrayRK5array1T", false], [0, "_CPPv4neRK5arrayRK5array", false]], "operator% (c++ function)": [[0, "_CPPv4I0Erm5array1TRK5array", false], [0, "_CPPv4I0Erm5arrayRK5array1T", false], [0, "_CPPv4rmRK5arrayRK5array", false]], "operator& (c++ function)": [[0, "_CPPv4anRK5arrayRK5array", false]], "operator&& (c++ function)": [[0, "_CPPv4aaRK5arrayRK5array", false]], 
"operator* (c++ function)": [[0, "_CPPv4I0Eml5array1TRK5array", false], [0, "_CPPv4I0Eml5arrayRK5array1T", false], [0, "_CPPv4mlRK5arrayRK5array", false]], "operator+ (c++ function)": [[0, "_CPPv4I0Epl5array1TRK5array", false], [0, "_CPPv4I0Epl5arrayRK5array1T", false], [0, "_CPPv4plRK5arrayRK5array", false]], "operator- (c++ function)": [[0, "_CPPv4I0Emi5array1TRK5array", false], [0, "_CPPv4I0Emi5arrayRK5array1T", false], [0, "_CPPv4miRK5array", false], [0, "_CPPv4miRK5arrayRK5array", false]], "operator/ (c++ function)": [[0, "_CPPv4dvRK5arrayRK5array", false], [0, "_CPPv4dvRK5arrayd", false], [0, "_CPPv4dvdRK5array", false]], "operator< (c++ function)": [[0, "_CPPv4I0Elt5array1TRK5array", false], [0, "_CPPv4I0Elt5arrayRK5array1T", false], [0, "_CPPv4ltRK5arrayRK5array", false]], "operator<< (c++ function)": [[0, "_CPPv4lsRK5arrayRK5array", false]], "operator<= (c++ function)": [[0, "_CPPv4I0Ele5array1TRK5array", false], [0, "_CPPv4I0Ele5arrayRK5array1T", false], [0, "_CPPv4leRK5arrayRK5array", false]], "operator== (c++ function)": [[0, "_CPPv4I0Eeq5array1TRK5array", false], [0, "_CPPv4I0Eeq5arrayRK5array1T", false], [0, "_CPPv4eqRK5arrayRK5array", false]], "operator> (c++ function)": [[0, "_CPPv4I0Egt5array1TRK5array", false], [0, "_CPPv4I0Egt5arrayRK5array1T", false], [0, "_CPPv4gtRK5arrayRK5array", false]], "operator>= (c++ function)": [[0, "_CPPv4I0Ege5array1TRK5array", false], [0, "_CPPv4I0Ege5arrayRK5array1T", false], [0, "_CPPv4geRK5arrayRK5array", false]], "operator>> (c++ function)": [[0, "_CPPv4rsRK5arrayRK5array", false]], "operator^ (c++ function)": [[0, "_CPPv4eoRK5arrayRK5array", false]], "operator| (c++ function)": [[0, "_CPPv4orRK5arrayRK5array", false]], "operator|| (c++ function)": [[0, "_CPPv4ooRK5arrayRK5array", false]], "optimizer (class in mlx.optimizers)": [[475, "mlx.optimizers.Optimizer", false]], "outer (c++ function)": [[0, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", false]], "outer() (in module mlx.core)": [[230, "mlx.core.outer", 
false]], "pad (c++ function)": [[0, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", false], [0, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", false], [0, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", false], [0, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", false]], "pad() (in module mlx.core)": [[231, "mlx.core.pad", false]], "parameters() (module method)": [[369, "mlx.nn.Module.parameters", false]], "partition (c++ function)": [[0, "_CPPv49partitionRK5arrayi14StreamOrDevice", false], [0, "_CPPv49partitionRK5arrayii14StreamOrDevice", false]], "partition() (in module mlx.core)": [[232, "mlx.core.partition", false]], "permutation() (in module mlx.core.random)": [[246, "mlx.core.random.permutation", false]], "power (c++ function)": [[0, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", false]], "power() (in module mlx.core)": [[233, "mlx.core.power", false]], "prelu (class in mlx.nn)": [[380, "mlx.nn.PReLU", false], [437, "mlx.nn.prelu", false]], "prod (c++ function)": [[0, "_CPPv44prodRK5array14StreamOrDevice", false], [0, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv44prodRK5arrayb14StreamOrDevice", false], [0, "_CPPv44prodRK5arrayib14StreamOrDevice", false]], "prod() (array method)": [[63, "mlx.core.array.prod", false]], "prod() (in module mlx.core)": [[234, "mlx.core.prod", false]], "put_along_axis (c++ function)": [[0, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false]], "put_along_axis() (in module mlx.core)": [[235, "mlx.core.put_along_axis", false]], "qr() (in module mlx.core.linalg)": [[189, "mlx.core.linalg.qr", false]], "quantize (c++ function)": [[0, "_CPPv48quantizeRK5arrayii14StreamOrDevice", false]], "quantize() (in module mlx.core)": [[236, "mlx.core.quantize", false]], "quantize() (in module mlx.nn)": [[306, "mlx.nn.quantize", false]], "quantized_matmul (c++ 
function)": [[0, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", false]], "quantized_matmul() (in module mlx.core)": [[237, "mlx.core.quantized_matmul", false]], "quantizedembedding (class in mlx.nn)": [[381, "mlx.nn.QuantizedEmbedding", false]], "quantizedlinear (class in mlx.nn)": [[382, "mlx.nn.QuantizedLinear", false]], "radians (c++ function)": [[0, "_CPPv47radiansRK5array14StreamOrDevice", false]], "radians() (in module mlx.core)": [[238, "mlx.core.radians", false]], "randint() (in module mlx.core.random)": [[247, "mlx.core.random.randint", false]], "real (c++ function)": [[0, "_CPPv44realRK5array14StreamOrDevice", false]], "real() (in module mlx.core)": [[252, "mlx.core.real", false]], "reciprocal (c++ function)": [[0, "_CPPv410reciprocalRK5array14StreamOrDevice", false]], "reciprocal() (array method)": [[64, "mlx.core.array.reciprocal", false]], "reciprocal() (in module mlx.core)": [[253, "mlx.core.reciprocal", false]], "recv() (in module mlx.core.distributed)": [[125, "mlx.core.distributed.recv", false]], "recv_like() (in module mlx.core.distributed)": [[126, "mlx.core.distributed.recv_like", false]], "relu (class in mlx.nn)": [[385, "mlx.nn.ReLU", false], [438, "mlx.nn.relu", false]], "relu6 (class in mlx.nn)": [[386, "mlx.nn.ReLU6", false], [439, "mlx.nn.relu6", false]], "remainder (c++ function)": [[0, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", false]], "remainder() (in module mlx.core)": [[254, "mlx.core.remainder", false]], "repeat (c++ function)": [[0, "_CPPv46repeatRK5arrayi14StreamOrDevice", false], [0, "_CPPv46repeatRK5arrayii14StreamOrDevice", false]], "repeat() (in module mlx.core)": [[255, "mlx.core.repeat", false]], "reset_peak_memory() (in module mlx.core.metal)": [[214, "mlx.core.metal.reset_peak_memory", false]], "reshape (c++ function)": [[0, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", false]], "reshape() (array method)": [[65, "mlx.core.array.reshape", false]], "reshape() (in module mlx.core)": 
[[256, "mlx.core.reshape", false]], "rfft() (in module mlx.core.fft)": [[155, "mlx.core.fft.rfft", false]], "rfft2() (in module mlx.core.fft)": [[156, "mlx.core.fft.rfft2", false]], "rfftn() (in module mlx.core.fft)": [[157, "mlx.core.fft.rfftn", false]], "right_shift (c++ function)": [[0, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", false]], "right_shift() (in module mlx.core)": [[257, "mlx.core.right_shift", false]], "rms_norm() (in module mlx.core.fast)": [[143, "mlx.core.fast.rms_norm", false]], "rmsnorm (class in mlx.nn)": [[383, "mlx.nn.RMSNorm", false]], "rmsprop (class in mlx.optimizers)": [[467, "mlx.optimizers.RMSprop", false]], "rnn (class in mlx.nn)": [[384, "mlx.nn.RNN", false]], "roll (c++ function)": [[0, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayi14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv44rollRK5arrayii14StreamOrDevice", false]], "roll() (in module mlx.core)": [[258, "mlx.core.roll", false]], "rope (class in mlx.nn)": [[387, "mlx.nn.RoPE", false]], "rope() (in module mlx.core.fast)": [[144, "mlx.core.fast.rope", false]], "round (c++ function)": [[0, "_CPPv45roundRK5array14StreamOrDevice", false], [0, "_CPPv45roundRK5arrayi14StreamOrDevice", false]], "round() (array method)": [[66, "mlx.core.array.round", false]], "round() (in module mlx.core)": [[259, "mlx.core.round", false]], "rsqrt (c++ function)": [[0, "_CPPv45rsqrtRK5array14StreamOrDevice", false]], "rsqrt() (array method)": [[67, "mlx.core.array.rsqrt", false]], "rsqrt() (in module mlx.core)": [[260, "mlx.core.rsqrt", false]], "save() (in module mlx.core)": [[261, "mlx.core.save", false]], "save_gguf() (in module mlx.core)": [[262, "mlx.core.save_gguf", false]], "save_safetensors() (in module mlx.core)": [[263, 
"mlx.core.save_safetensors", false]], "save_weights() (module method)": [[370, "mlx.nn.Module.save_weights", false]], "savez() (in module mlx.core)": [[264, "mlx.core.savez", false]], "savez_compressed() (in module mlx.core)": [[265, "mlx.core.savez_compressed", false]], "scaled_dot_product_attention() (in module mlx.core.fast)": [[145, "mlx.core.fast.scaled_dot_product_attention", false]], "scatter (c++ function)": [[0, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_add (c++ function)": [[0, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_max (c++ function)": [[0, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_min (c++ function)": [[0, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "scatter_prod (c++ function)": [[0, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", false]], "seed() (in module mlx.core.random)": [[248, "mlx.core.random.seed", false]], "selu (class in mlx.nn)": [[388, "mlx.nn.SELU", false], [440, "mlx.nn.selu", false]], "send() (in module mlx.core.distributed)": [[127, "mlx.core.distributed.send", false]], "sequential (class in mlx.nn)": [[389, "mlx.nn.Sequential", false]], "set_cache_limit() (in module mlx.core.metal)": [[215, "mlx.core.metal.set_cache_limit", false]], "set_default_device() (in module mlx.core)": [[266, "mlx.core.set_default_device", false]], "set_default_stream() (in module mlx.core)": 
[[267, "mlx.core.set_default_stream", false]], "set_dtype() (module method)": [[371, "mlx.nn.Module.set_dtype", false]], "set_memory_limit() (in module mlx.core.metal)": [[216, "mlx.core.metal.set_memory_limit", false]], "set_wired_limit() (in module mlx.core.metal)": [[217, "mlx.core.metal.set_wired_limit", false]], "sgd (class in mlx.optimizers)": [[468, "mlx.optimizers.SGD", false]], "shape (array property)": [[68, "mlx.core.array.shape", false]], "sigmoid (c++ function)": [[0, "_CPPv47sigmoidRK5array14StreamOrDevice", false]], "sigmoid (class in mlx.nn)": [[391, "mlx.nn.Sigmoid", false], [441, "mlx.nn.sigmoid", false]], "sigmoid() (in module mlx.core)": [[268, "mlx.core.sigmoid", false]], "sign (c++ function)": [[0, "_CPPv44signRK5array14StreamOrDevice", false]], "sign() (in module mlx.core)": [[269, "mlx.core.sign", false]], "silu (class in mlx.nn)": [[390, "mlx.nn.SiLU", false], [442, "mlx.nn.silu", false]], "sin (c++ function)": [[0, "_CPPv43sinRK5array14StreamOrDevice", false]], "sin() (array method)": [[69, "mlx.core.array.sin", false]], "sin() (in module mlx.core)": [[270, "mlx.core.sin", false]], "sinh (c++ function)": [[0, "_CPPv44sinhRK5array14StreamOrDevice", false]], "sinh() (in module mlx.core)": [[271, "mlx.core.sinh", false]], "sinusoidalpositionalencoding (class in mlx.nn)": [[392, "mlx.nn.SinusoidalPositionalEncoding", false]], "size (array property)": [[70, "mlx.core.array.size", false]], "slice (c++ function)": [[0, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false]], "slice_update (c++ function)": [[0, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", false]], "smooth_l1_loss (class in mlx.nn.losses)": [[434, "mlx.nn.losses.smooth_l1_loss", false]], "softmax (c++ function)": [[0, 
"_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv47softmaxRK5arrayb14StreamOrDevice", false], [0, "_CPPv47softmaxRK5arrayib14StreamOrDevice", false]], "softmax (class in mlx.nn)": [[393, "mlx.nn.Softmax", false], [443, "mlx.nn.softmax", false]], "softmax() (in module mlx.core)": [[272, "mlx.core.softmax", false]], "softmin (class in mlx.nn)": [[394, "mlx.nn.Softmin", false], [444, "mlx.nn.softmin", false]], "softplus (class in mlx.nn)": [[395, "mlx.nn.Softplus", false], [445, "mlx.nn.softplus", false]], "softshrink (class in mlx.nn)": [[396, "mlx.nn.Softshrink", false], [446, "mlx.nn.softshrink", false]], "softsign (class in mlx.nn)": [[397, "mlx.nn.Softsign", false]], "sort (c++ function)": [[0, "_CPPv44sortRK5array14StreamOrDevice", false], [0, "_CPPv44sortRK5arrayi14StreamOrDevice", false]], "sort() (in module mlx.core)": [[273, "mlx.core.sort", false]], "split (c++ function)": [[0, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", false], [0, "_CPPv45splitRK5arrayi14StreamOrDevice", false], [0, "_CPPv45splitRK5arrayii14StreamOrDevice", false]], "split() (array method)": [[71, "mlx.core.array.split", false]], "split() (in module mlx.core)": [[274, "mlx.core.split", false]], "split() (in module mlx.core.random)": [[249, "mlx.core.random.split", false]], "sqrt (c++ function)": [[0, "_CPPv44sqrtRK5array14StreamOrDevice", false]], "sqrt() (array method)": [[72, "mlx.core.array.sqrt", false]], "sqrt() (in module mlx.core)": [[275, "mlx.core.sqrt", false]], "square (c++ function)": [[0, "_CPPv46squareRK5array14StreamOrDevice", false]], "square() (array method)": [[73, "mlx.core.array.square", false]], "square() (in module mlx.core)": [[276, "mlx.core.square", false]], "squeeze (c++ function)": [[0, "_CPPv47squeezeRK5array14StreamOrDevice", false], [0, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv47squeezeRK5arrayi14StreamOrDevice", 
false]], "squeeze() (array method)": [[74, "mlx.core.array.squeeze", false]], "squeeze() (in module mlx.core)": [[277, "mlx.core.squeeze", false]], "stack (c++ function)": [[0, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", false], [0, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", false]], "stack() (in module mlx.core)": [[278, "mlx.core.stack", false]], "start_capture() (in module mlx.core.metal)": [[218, "mlx.core.metal.start_capture", false]], "state (module property)": [[372, "mlx.nn.Module.state", false]], "state (optimizer property)": [[465, "mlx.optimizers.Optimizer.state", false]], "std (c++ function)": [[0, "_CPPv4StRK5array14StreamOrDevice", false], [0, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", false], [0, "_CPPv4StRK5arraybi14StreamOrDevice", false], [0, "_CPPv4StRK5arrayibi14StreamOrDevice", false]], "std() (array method)": [[75, "mlx.core.array.std", false]], "std() (in module mlx.core)": [[279, "mlx.core.std", false]], "step (class in mlx.nn)": [[398, "mlx.nn.Step", false], [447, "mlx.nn.step", false]], "step_decay() (in module mlx.optimizers)": [[473, "mlx.optimizers.step_decay", false]], "stop_capture() (in module mlx.core.metal)": [[219, "mlx.core.metal.stop_capture", false]], "stop_gradient (c++ function)": [[0, "_CPPv413stop_gradientRK5array14StreamOrDevice", false]], "stop_gradient() (in module mlx.core)": [[280, "mlx.core.stop_gradient", false]], "stream (class in mlx.core)": [[314, "mlx.core.Stream", false]], "stream() (in module mlx.core)": [[281, "mlx.core.stream", false]], "subtract (c++ function)": [[0, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", false]], "subtract() (in module mlx.core)": [[282, "mlx.core.subtract", false]], "sum (c++ function)": [[0, "_CPPv43sumRK5array14StreamOrDevice", false], [0, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", false], [0, "_CPPv43sumRK5arrayb14StreamOrDevice", false], [0, "_CPPv43sumRK5arrayib14StreamOrDevice", false]], "sum() (array method)": [[76, 
"mlx.core.array.sum", false]], "sum() (in module mlx.core)": [[283, "mlx.core.sum", false]], "svd() (in module mlx.core.linalg)": [[190, "mlx.core.linalg.svd", false]], "swapaxes (c++ function)": [[0, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", false]], "swapaxes() (array method)": [[77, "mlx.core.array.swapaxes", false]], "swapaxes() (in module mlx.core)": [[284, "mlx.core.swapaxes", false]], "synchronize() (in module mlx.core)": [[285, "mlx.core.synchronize", false]], "t (array property)": [[31, "mlx.core.array.T", false]], "take (c++ function)": [[0, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", false], [0, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", false], [0, "_CPPv44takeRK5arrayi14StreamOrDevice", false], [0, "_CPPv44takeRK5arrayii14StreamOrDevice", false]], "take() (in module mlx.core)": [[286, "mlx.core.take", false]], "take_along_axis (c++ function)": [[0, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", false]], "take_along_axis() (in module mlx.core)": [[287, "mlx.core.take_along_axis", false]], "tan (c++ function)": [[0, "_CPPv43tanRK5array14StreamOrDevice", false]], "tan() (in module mlx.core)": [[288, "mlx.core.tan", false]], "tanh (c++ function)": [[0, "_CPPv44tanhRK5array14StreamOrDevice", false]], "tanh (class in mlx.nn)": [[399, "mlx.nn.Tanh", false], [448, "mlx.nn.tanh", false]], "tanh() (in module mlx.core)": [[289, "mlx.core.tanh", false]], "tensordot (c++ function)": [[0, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", false], [0, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", false]], "tensordot() (in module mlx.core)": [[290, "mlx.core.tensordot", false]], "tile (c++ function)": [[0, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", false]], "tile() (in module mlx.core)": [[291, "mlx.core.tile", false]], "tolist() (array method)": [[78, "mlx.core.array.tolist", false]], "topk (c++ function)": [[0, "_CPPv44topkRK5arrayi14StreamOrDevice", false], [0, 
"_CPPv44topkRK5arrayii14StreamOrDevice", false]], "topk() (in module mlx.core)": [[292, "mlx.core.topk", false]], "trace (c++ function)": [[0, "_CPPv45traceRK5array14StreamOrDevice", false], [0, "_CPPv45traceRK5arrayiii14StreamOrDevice", false], [0, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", false]], "trace() (in module mlx.core)": [[293, "mlx.core.trace", false]], "train() (module method)": [[373, "mlx.nn.Module.train", false]], "trainable_parameters() (module method)": [[374, "mlx.nn.Module.trainable_parameters", false]], "training (module property)": [[375, "mlx.nn.Module.training", false]], "transformer (class in mlx.nn)": [[400, "mlx.nn.Transformer", false]], "transpose (c++ function)": [[0, "_CPPv49transposeRK5array14StreamOrDevice", false], [0, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", false], [0, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", false]], "transpose() (array method)": [[79, "mlx.core.array.transpose", false]], "transpose() (in module mlx.core)": [[294, "mlx.core.transpose", false]], "tree_flatten() (in module mlx.utils)": [[309, "mlx.utils.tree_flatten", false]], "tree_map() (in module mlx.utils)": [[310, "mlx.utils.tree_map", false]], "tree_map_with_path() (in module mlx.utils)": [[311, "mlx.utils.tree_map_with_path", false]], "tree_reduce() (in module mlx.utils)": [[312, "mlx.utils.tree_reduce", false]], "tree_unflatten() (in module mlx.utils)": [[313, "mlx.utils.tree_unflatten", false]], "tri (c++ function)": [[0, "_CPPv43trii5Dtype14StreamOrDevice", false], [0, "_CPPv43triiii5Dtype14StreamOrDevice", false]], "tri() (in module mlx.core)": [[295, "mlx.core.tri", false]], "tri_inv() (in module mlx.core.linalg)": [[191, "mlx.core.linalg.tri_inv", false]], "tril (c++ function)": [[0, "_CPPv44tril5arrayi14StreamOrDevice", false]], "tril() (in module mlx.core)": [[296, "mlx.core.tril", false]], "triplet_loss (class in mlx.nn.losses)": [[435, "mlx.nn.losses.triplet_loss", false]], "triu (c++ function)": 
[[0, "_CPPv44triu5arrayi14StreamOrDevice", false]], "triu() (in module mlx.core)": [[297, "mlx.core.triu", false]], "truncated_normal() (in module mlx.core.random)": [[250, "mlx.core.random.truncated_normal", false]], "unfreeze() (module method)": [[376, "mlx.nn.Module.unfreeze", false]], "uniform() (in module mlx.core.random)": [[251, "mlx.core.random.uniform", false]], "uniform() (in module mlx.nn.init)": [[409, "mlx.nn.init.uniform", false]], "update() (module method)": [[377, "mlx.nn.Module.update", false]], "update() (optimizer method)": [[466, "mlx.optimizers.Optimizer.update", false]], "update_modules() (module method)": [[378, "mlx.nn.Module.update_modules", false]], "upsample (class in mlx.nn)": [[401, "mlx.nn.Upsample", false]], "value_and_grad() (in module mlx.core)": [[298, "mlx.core.value_and_grad", false]], "value_and_grad() (in module mlx.nn)": [[307, "mlx.nn.value_and_grad", false]], "var (c++ function)": [[0, "_CPPv43varRK5array14StreamOrDevice", false], [0, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", false], [0, "_CPPv43varRK5arraybi14StreamOrDevice", false], [0, "_CPPv43varRK5arrayibi14StreamOrDevice", false]], "var() (array method)": [[80, "mlx.core.array.var", false]], "var() (in module mlx.core)": [[299, "mlx.core.var", false]], "view (c++ function)": [[0, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", false]], "view() (array method)": [[81, "mlx.core.array.view", false]], "view() (in module mlx.core)": [[300, "mlx.core.view", false]], "vjp() (in module mlx.core)": [[301, "mlx.core.vjp", false]], "vmap() (in module mlx.core)": [[302, "mlx.core.vmap", false]], "where (c++ function)": [[0, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", false]], "where() (in module mlx.core)": [[303, "mlx.core.where", false]], "zeros (c++ function)": [[0, "_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", false], [0, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", false]], "zeros() (in module mlx.core)": [[304, "mlx.core.zeros", false]], 
"zeros_like (c++ function)": [[0, "_CPPv410zeros_likeRK5array14StreamOrDevice", false]], "zeros_like() (in module mlx.core)": [[305, "mlx.core.zeros_like", false]]}, "objects": {"": [[0, 0, 1, "_CPPv43absRK5array14StreamOrDevice", "abs"], [0, 1, 1, "_CPPv43absRK5array14StreamOrDevice", "abs::a"], [0, 1, 1, "_CPPv43absRK5array14StreamOrDevice", "abs::s"], [0, 0, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add"], [0, 1, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add::a"], [0, 1, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add::b"], [0, 1, 1, "_CPPv43addRK5arrayRK5array14StreamOrDevice", "add::s"], [0, 0, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::a"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::alpha"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::b"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::beta"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::c"], [0, 1, 1, "_CPPv45addmm5array5array5arrayRKfRKf14StreamOrDevice", "addmm::s"], [0, 0, 1, "_CPPv43allRK5array14StreamOrDevice", "all"], [0, 0, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all"], [0, 0, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all"], [0, 0, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all"], [0, 1, 1, "_CPPv43allRK5array14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all::a"], [0, 1, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::axes"], [0, 1, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all::axis"], [0, 1, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::keepdims"], [0, 1, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all::keepdims"], [0, 1, 1, 
"_CPPv43allRK5arrayib14StreamOrDevice", "all::keepdims"], [0, 1, 1, "_CPPv43allRK5array14StreamOrDevice", "all::s"], [0, 1, 1, "_CPPv43allRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "all::s"], [0, 1, 1, "_CPPv43allRK5arrayb14StreamOrDevice", "all::s"], [0, 1, 1, "_CPPv43allRK5arrayib14StreamOrDevice", "all::s"], [0, 0, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::a"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::atol"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::b"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::equal_nan"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::rtol"], [0, 1, 1, "_CPPv48allcloseRK5arrayRK5arrayddb14StreamOrDevice", "allclose::s"], [0, 0, 1, "_CPPv43anyRK5array14StreamOrDevice", "any"], [0, 0, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any"], [0, 0, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any"], [0, 0, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any"], [0, 1, 1, "_CPPv43anyRK5array14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any::a"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::axes"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any::axis"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::keepdims"], [0, 1, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any::keepdims"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", "any::keepdims"], [0, 1, 1, "_CPPv43anyRK5array14StreamOrDevice", "any::s"], [0, 1, 1, "_CPPv43anyRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "any::s"], [0, 1, 1, "_CPPv43anyRK5arrayb14StreamOrDevice", "any::s"], [0, 1, 1, "_CPPv43anyRK5arrayib14StreamOrDevice", 
"any::s"], [0, 0, 1, "_CPPv46aranged14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46aranged5Dtype14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangedd14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeddd14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangei14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeii14StreamOrDevice", "arange"], [0, 0, 1, "_CPPv46arangeiii14StreamOrDevice", "arange"], [0, 1, 1, "_CPPv46aranged5Dtype14StreamOrDevice", "arange::dtype"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange::dtype"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::dtype"], [0, 1, 1, "_CPPv46aranged14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46aranged5Dtype14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangedd14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangei14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeii14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::s"], [0, 1, 1, "_CPPv46arangedd14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeii14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::start"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::step"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::step"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::step"], [0, 1, 1, "_CPPv46aranged14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46aranged5Dtype14StreamOrDevice", 
"arange::stop"], [0, 1, 1, "_CPPv46arangedd14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangedd5Dtype14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangeddd14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangeddd5Dtype14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangei14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangeii14StreamOrDevice", "arange::stop"], [0, 1, 1, "_CPPv46arangeiii14StreamOrDevice", "arange::stop"], [0, 0, 1, "_CPPv46arccosRK5array14StreamOrDevice", "arccos"], [0, 1, 1, "_CPPv46arccosRK5array14StreamOrDevice", "arccos::a"], [0, 1, 1, "_CPPv46arccosRK5array14StreamOrDevice", "arccos::s"], [0, 0, 1, "_CPPv47arccoshRK5array14StreamOrDevice", "arccosh"], [0, 1, 1, "_CPPv47arccoshRK5array14StreamOrDevice", "arccosh::a"], [0, 1, 1, "_CPPv47arccoshRK5array14StreamOrDevice", "arccosh::s"], [0, 0, 1, "_CPPv46arcsinRK5array14StreamOrDevice", "arcsin"], [0, 1, 1, "_CPPv46arcsinRK5array14StreamOrDevice", "arcsin::a"], [0, 1, 1, "_CPPv46arcsinRK5array14StreamOrDevice", "arcsin::s"], [0, 0, 1, "_CPPv47arcsinhRK5array14StreamOrDevice", "arcsinh"], [0, 1, 1, "_CPPv47arcsinhRK5array14StreamOrDevice", "arcsinh::a"], [0, 1, 1, "_CPPv47arcsinhRK5array14StreamOrDevice", "arcsinh::s"], [0, 0, 1, "_CPPv46arctanRK5array14StreamOrDevice", "arctan"], [0, 0, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2"], [0, 1, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2::a"], [0, 1, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2::b"], [0, 1, 1, "_CPPv47arctan2RK5arrayRK5array14StreamOrDevice", "arctan2::s"], [0, 1, 1, "_CPPv46arctanRK5array14StreamOrDevice", "arctan::a"], [0, 1, 1, "_CPPv46arctanRK5array14StreamOrDevice", "arctan::s"], [0, 0, 1, "_CPPv47arctanhRK5array14StreamOrDevice", "arctanh"], [0, 1, 1, "_CPPv47arctanhRK5array14StreamOrDevice", "arctanh::a"], [0, 1, 1, "_CPPv47arctanhRK5array14StreamOrDevice", "arctanh::s"], [0, 0, 1, "_CPPv46argmaxRK5array14StreamOrDevice", "argmax"], [0, 
0, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax"], [0, 0, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax"], [0, 1, 1, "_CPPv46argmaxRK5array14StreamOrDevice", "argmax::a"], [0, 1, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax::a"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::a"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::axis"], [0, 1, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax::keepdims"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::keepdims"], [0, 1, 1, "_CPPv46argmaxRK5array14StreamOrDevice", "argmax::s"], [0, 1, 1, "_CPPv46argmaxRK5arrayb14StreamOrDevice", "argmax::s"], [0, 1, 1, "_CPPv46argmaxRK5arrayib14StreamOrDevice", "argmax::s"], [0, 0, 1, "_CPPv46argminRK5array14StreamOrDevice", "argmin"], [0, 0, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin"], [0, 0, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin"], [0, 1, 1, "_CPPv46argminRK5array14StreamOrDevice", "argmin::a"], [0, 1, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin::a"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::a"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::axis"], [0, 1, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin::keepdims"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::keepdims"], [0, 1, 1, "_CPPv46argminRK5array14StreamOrDevice", "argmin::s"], [0, 1, 1, "_CPPv46argminRK5arrayb14StreamOrDevice", "argmin::s"], [0, 1, 1, "_CPPv46argminRK5arrayib14StreamOrDevice", "argmin::s"], [0, 0, 1, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", "argpartition"], [0, 0, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition"], [0, 1, 1, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", "argpartition::a"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::a"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::axis"], [0, 1, 1, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", 
"argpartition::kth"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::kth"], [0, 1, 1, "_CPPv412argpartitionRK5arrayi14StreamOrDevice", "argpartition::s"], [0, 1, 1, "_CPPv412argpartitionRK5arrayii14StreamOrDevice", "argpartition::s"], [0, 0, 1, "_CPPv47argsortRK5array14StreamOrDevice", "argsort"], [0, 0, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort"], [0, 1, 1, "_CPPv47argsortRK5array14StreamOrDevice", "argsort::a"], [0, 1, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort::a"], [0, 1, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort::axis"], [0, 1, 1, "_CPPv47argsortRK5array14StreamOrDevice", "argsort::s"], [0, 1, 1, "_CPPv47argsortRK5arrayi14StreamOrDevice", "argsort::s"], [0, 0, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal"], [0, 0, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal::a"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::a"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal::b"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::b"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::equal_nan"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5array14StreamOrDevice", "array_equal::s"], [0, 1, 1, "_CPPv411array_equalRK5arrayRK5arrayb14StreamOrDevice", "array_equal::s"], [0, 0, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::a"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::offset"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::s"], [0, 1, 1, 
"_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::shape"], [0, 1, 1, "_CPPv410as_strided5arrayNSt6vectorIiEENSt6vectorI6size_tEE6size_t14StreamOrDevice", "as_strided::strides"], [0, 0, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype"], [0, 1, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype::a"], [0, 1, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype::dtype"], [0, 1, 1, "_CPPv46astype5array5Dtype14StreamOrDevice", "astype::s"], [0, 0, 1, "_CPPv410atleast_1dRK5array14StreamOrDevice", "atleast_1d"], [0, 0, 1, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_1d"], [0, 1, 1, "_CPPv410atleast_1dRK5array14StreamOrDevice", "atleast_1d::a"], [0, 1, 1, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_1d::a"], [0, 1, 1, "_CPPv410atleast_1dRK5array14StreamOrDevice", "atleast_1d::s"], [0, 1, 1, "_CPPv410atleast_1dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_1d::s"], [0, 0, 1, "_CPPv410atleast_2dRK5array14StreamOrDevice", "atleast_2d"], [0, 0, 1, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_2d"], [0, 1, 1, "_CPPv410atleast_2dRK5array14StreamOrDevice", "atleast_2d::a"], [0, 1, 1, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_2d::a"], [0, 1, 1, "_CPPv410atleast_2dRK5array14StreamOrDevice", "atleast_2d::s"], [0, 1, 1, "_CPPv410atleast_2dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_2d::s"], [0, 0, 1, "_CPPv410atleast_3dRK5array14StreamOrDevice", "atleast_3d"], [0, 0, 1, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_3d"], [0, 1, 1, "_CPPv410atleast_3dRK5array14StreamOrDevice", "atleast_3d::a"], [0, 1, 1, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_3d::a"], [0, 1, 1, "_CPPv410atleast_3dRK5array14StreamOrDevice", "atleast_3d::s"], [0, 1, 1, "_CPPv410atleast_3dRKNSt6vectorI5arrayEE14StreamOrDevice", "atleast_3d::s"], [0, 0, 1, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", 
"bitwise_and"], [0, 1, 1, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", "bitwise_and::a"], [0, 1, 1, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", "bitwise_and::b"], [0, 1, 1, "_CPPv411bitwise_andRK5arrayRK5array14StreamOrDevice", "bitwise_and::s"], [0, 0, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or"], [0, 1, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or::a"], [0, 1, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or::b"], [0, 1, 1, "_CPPv410bitwise_orRK5arrayRK5array14StreamOrDevice", "bitwise_or::s"], [0, 0, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor"], [0, 1, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor::a"], [0, 1, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor::b"], [0, 1, 1, "_CPPv411bitwise_xorRK5arrayRK5array14StreamOrDevice", "bitwise_xor::s"], [0, 0, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::a"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::b"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::block_size"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::mask_lhs"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::mask_out"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", 
"block_masked_mm::mask_rhs"], [0, 1, 1, "_CPPv415block_masked_mm5array5arrayiNSt8optionalI5arrayEENSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "block_masked_mm::s"], [0, 0, 1, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", "broadcast_arrays"], [0, 1, 1, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", "broadcast_arrays::inputs"], [0, 1, 1, "_CPPv416broadcast_arraysRKNSt6vectorI5arrayEE14StreamOrDevice", "broadcast_arrays::s"], [0, 0, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to"], [0, 1, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to::a"], [0, 1, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to::s"], [0, 1, 1, "_CPPv412broadcast_toRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "broadcast_to::shape"], [0, 0, 1, "_CPPv44ceilRK5array14StreamOrDevice", "ceil"], [0, 1, 1, "_CPPv44ceilRK5array14StreamOrDevice", "ceil::a"], [0, 1, 1, "_CPPv44ceilRK5array14StreamOrDevice", "ceil::s"], [0, 0, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::a"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::a_max"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::a_min"], [0, 1, 1, "_CPPv44clipRK5arrayRKNSt8optionalI5arrayEERKNSt8optionalI5arrayEE14StreamOrDevice", "clip::s"], [0, 0, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", "concatenate"], [0, 0, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", "concatenate::arrays"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate::arrays"], [0, 1, 1, 
"_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate::axis"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEE14StreamOrDevice", "concatenate::s"], [0, 1, 1, "_CPPv411concatenateRKNSt6vectorI5arrayEEi14StreamOrDevice", "concatenate::s"], [0, 0, 1, "_CPPv49conjugateRK5array14StreamOrDevice", "conjugate"], [0, 1, 1, "_CPPv49conjugateRK5array14StreamOrDevice", "conjugate::a"], [0, 1, 1, "_CPPv49conjugateRK5array14StreamOrDevice", "conjugate::s"], [0, 0, 1, "_CPPv410contiguousRK5arrayb14StreamOrDevice", "contiguous"], [0, 1, 1, "_CPPv410contiguousRK5arrayb14StreamOrDevice", "contiguous::a"], [0, 1, 1, "_CPPv410contiguousRK5arrayb14StreamOrDevice", "contiguous::allow_col_major"], [0, 1, 1, "_CPPv410contiguousRK5arrayb14StreamOrDevice", "contiguous::s"], [0, 0, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::dilation"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::groups"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::input"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::padding"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::s"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::stride"], [0, 1, 1, "_CPPv46conv1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv1d::weight"], [0, 0, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::dilation"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::groups"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::input"], [0, 1, 1, 
"_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::padding"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::s"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::stride"], [0, 1, 1, "_CPPv46conv2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv2d::weight"], [0, 0, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::dilation"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::groups"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::input"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::padding"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::s"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::stride"], [0, 1, 1, "_CPPv46conv3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv3d::weight"], [0, 0, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general"], [0, 0, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::flip"], [0, 1, 1, 
"_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::flip"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::groups"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::groups"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input_dilation"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::input_dilation"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::kernel_dilation"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::kernel_dilation"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::padding"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::padding_hi"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::padding_lo"], [0, 1, 1, 
"_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::s"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::s"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::stride"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::stride"], [0, 1, 1, "_CPPv412conv_general5array5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::weight"], [0, 1, 1, "_CPPv412conv_generalRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEENSt6vectorIiEEib14StreamOrDevice", "conv_general::weight"], [0, 0, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::dilation"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::groups"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::input"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::padding"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::s"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::stride"], [0, 1, 1, "_CPPv416conv_transpose1dRK5arrayRK5arrayiiii14StreamOrDevice", "conv_transpose1d::weight"], [0, 0, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", 
"conv_transpose2d::dilation"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::groups"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::input"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::padding"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::s"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::stride"], [0, 1, 1, "_CPPv416conv_transpose2dRK5arrayRK5arrayRKNSt4pairIiiEERKNSt4pairIiiEERKNSt4pairIiiEEi14StreamOrDevice", "conv_transpose2d::weight"], [0, 0, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::dilation"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::groups"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::input"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::padding"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::s"], [0, 1, 1, "_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::stride"], [0, 1, 1, 
"_CPPv416conv_transpose3dRK5arrayRK5arrayRKNSt5tupleIiiiEERKNSt5tupleIiiiEERKNSt5tupleIiiiEEi14StreamOrDevice", "conv_transpose3d::weight"], [0, 0, 1, "_CPPv44copy5array14StreamOrDevice", "copy"], [0, 1, 1, "_CPPv44copy5array14StreamOrDevice", "copy::a"], [0, 1, 1, "_CPPv44copy5array14StreamOrDevice", "copy::s"], [0, 0, 1, "_CPPv43cosRK5array14StreamOrDevice", "cos"], [0, 1, 1, "_CPPv43cosRK5array14StreamOrDevice", "cos::a"], [0, 1, 1, "_CPPv43cosRK5array14StreamOrDevice", "cos::s"], [0, 0, 1, "_CPPv44coshRK5array14StreamOrDevice", "cosh"], [0, 1, 1, "_CPPv44coshRK5array14StreamOrDevice", "cosh::a"], [0, 1, 1, "_CPPv44coshRK5array14StreamOrDevice", "cosh::s"], [0, 0, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::a"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::axis"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::inclusive"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::reverse"], [0, 1, 1, "_CPPv46cummaxRK5arrayibb14StreamOrDevice", "cummax::s"], [0, 0, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::a"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::axis"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::inclusive"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::reverse"], [0, 1, 1, "_CPPv46cumminRK5arrayibb14StreamOrDevice", "cummin::s"], [0, 0, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::a"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::axis"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::inclusive"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::reverse"], [0, 1, 1, "_CPPv47cumprodRK5arrayibb14StreamOrDevice", "cumprod::s"], [0, 0, 1, 
"_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::a"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::axis"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::inclusive"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::reverse"], [0, 1, 1, "_CPPv46cumsumRK5arrayibb14StreamOrDevice", "cumsum::s"], [0, 0, 1, "_CPPv47degreesRK5array14StreamOrDevice", "degrees"], [0, 1, 1, "_CPPv47degreesRK5array14StreamOrDevice", "degrees::a"], [0, 1, 1, "_CPPv47degreesRK5array14StreamOrDevice", "degrees::s"], [0, 0, 1, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", "depends"], [0, 1, 1, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", "depends::dependencies"], [0, 1, 1, "_CPPv47dependsRKNSt6vectorI5arrayEERKNSt6vectorI5arrayEE", "depends::inputs"], [0, 0, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::biases"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::bits"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::group_size"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::s"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::scales"], [0, 1, 1, "_CPPv410dequantizeRK5arrayRK5arrayRK5arrayii14StreamOrDevice", "dequantize::w"], [0, 0, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag"], [0, 1, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag::a"], [0, 1, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag::k"], [0, 1, 1, "_CPPv44diagRK5arrayi14StreamOrDevice", "diag::s"], [0, 0, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::a"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", 
"diagonal::axis1"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::axis2"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::offset"], [0, 1, 1, "_CPPv48diagonalRK5arrayiii14StreamOrDevice", "diagonal::s"], [0, 0, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide"], [0, 1, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide::a"], [0, 1, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide::b"], [0, 1, 1, "_CPPv46divideRK5arrayRK5array14StreamOrDevice", "divide::s"], [0, 0, 1, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod"], [0, 1, 1, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod::a"], [0, 1, 1, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod::b"], [0, 1, 1, "_CPPv46divmodRK5arrayRK5array14StreamOrDevice", "divmod::s"], [0, 0, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal"], [0, 1, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal::a"], [0, 1, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal::b"], [0, 1, 1, "_CPPv45equalRK5arrayRK5array14StreamOrDevice", "equal::s"], [0, 0, 1, "_CPPv43erfRK5array14StreamOrDevice", "erf"], [0, 1, 1, "_CPPv43erfRK5array14StreamOrDevice", "erf::a"], [0, 1, 1, "_CPPv43erfRK5array14StreamOrDevice", "erf::s"], [0, 0, 1, "_CPPv46erfinvRK5array14StreamOrDevice", "erfinv"], [0, 1, 1, "_CPPv46erfinvRK5array14StreamOrDevice", "erfinv::a"], [0, 1, 1, "_CPPv46erfinvRK5array14StreamOrDevice", "erfinv::s"], [0, 0, 1, "_CPPv43expRK5array14StreamOrDevice", "exp"], [0, 1, 1, "_CPPv43expRK5array14StreamOrDevice", "exp::a"], [0, 1, 1, "_CPPv43expRK5array14StreamOrDevice", "exp::s"], [0, 0, 1, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims"], [0, 0, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims::a"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims::a"], [0, 1, 1, 
"_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims::axes"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims::axis"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "expand_dims::s"], [0, 1, 1, "_CPPv411expand_dimsRK5arrayi14StreamOrDevice", "expand_dims::s"], [0, 0, 1, "_CPPv45expm1RK5array14StreamOrDevice", "expm1"], [0, 1, 1, "_CPPv45expm1RK5array14StreamOrDevice", "expm1::a"], [0, 1, 1, "_CPPv45expm1RK5array14StreamOrDevice", "expm1::s"], [0, 0, 1, "_CPPv43eyei14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyeii14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyeiii14StreamOrDevice", "eye"], [0, 0, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye"], [0, 1, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye::dtype"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::dtype"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::k"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::k"], [0, 1, 1, "_CPPv43eyeii14StreamOrDevice", "eye::m"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::m"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::m"], [0, 1, 1, "_CPPv43eyei14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyeii14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::n"], [0, 1, 1, "_CPPv43eyei14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyei5Dtype14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyeii14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyeiii14StreamOrDevice", "eye::s"], [0, 1, 1, "_CPPv43eyeiii5Dtype14StreamOrDevice", "eye::s"], [0, 0, 1, "_CPPv47flattenRK5array14StreamOrDevice", "flatten"], [0, 0, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten"], [0, 1, 1, "_CPPv47flattenRK5array14StreamOrDevice", "flatten::a"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", 
"flatten::a"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten::end_axis"], [0, 1, 1, "_CPPv47flattenRK5array14StreamOrDevice", "flatten::s"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten::s"], [0, 1, 1, "_CPPv47flattenRK5arrayii14StreamOrDevice", "flatten::start_axis"], [0, 0, 1, "_CPPv45floorRK5array14StreamOrDevice", "floor"], [0, 1, 1, "_CPPv45floorRK5array14StreamOrDevice", "floor::a"], [0, 1, 1, "_CPPv45floorRK5array14StreamOrDevice", "floor::s"], [0, 0, 1, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide"], [0, 1, 1, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide::a"], [0, 1, 1, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide::b"], [0, 1, 1, "_CPPv412floor_divideRK5arrayRK5array14StreamOrDevice", "floor_divide::s"], [0, 0, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full"], [0, 0, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full"], [0, 0, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full"], [0, 0, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full"], [0, 2, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::T"], [0, 2, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::T"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::dtype"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::dtype"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::s"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full::shape"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::shape"], [0, 1, 1, 
"_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::shape"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::shape"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T14StreamOrDevice", "full::val"], [0, 1, 1, "_CPPv4I0E4full5arrayNSt6vectorIiEE1T5Dtype14StreamOrDevice", "full::val"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array14StreamOrDevice", "full::vals"], [0, 1, 1, "_CPPv44fullNSt6vectorIiEE5array5Dtype14StreamOrDevice", "full::vals"], [0, 0, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather"], [0, 0, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::a"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::a"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::axes"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::axis"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::indices"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::indices"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::s"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::s"], [0, 1, 1, "_CPPv46gatherRK5arrayRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "gather::slice_sizes"], [0, 1, 1, "_CPPv46gatherRK5arrayRKNSt6vectorI5arrayEERKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "gather::slice_sizes"], [0, 0, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::a"], [0, 
1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::b"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::lhs_indices"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::rhs_indices"], [0, 1, 1, "_CPPv49gather_mm5array5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEE14StreamOrDevice", "gather_mm::s"], [0, 0, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::biases"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::bits"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::group_size"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::lhs_indices"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::rhs_indices"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::s"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::scales"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::transpose"], [0, 1, 1, "_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::w"], [0, 1, 1, 
"_CPPv410gather_qmmRK5arrayRK5arrayRK5arrayRK5arrayNSt8optionalI5arrayEENSt8optionalI5arrayEEbii14StreamOrDevice", "gather_qmm::x"], [0, 0, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater"], [0, 1, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater::a"], [0, 1, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater::b"], [0, 1, 1, "_CPPv47greaterRK5arrayRK5array14StreamOrDevice", "greater::s"], [0, 0, 1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal"], [0, 1, 1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal::a"], [0, 1, 1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal::b"], [0, 1, 1, "_CPPv413greater_equalRK5arrayRK5array14StreamOrDevice", "greater_equal::s"], [0, 0, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform"], [0, 1, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform::a"], [0, 1, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform::s"], [0, 1, 1, "_CPPv418hadamard_transformRK5arrayNSt8optionalIfEE14StreamOrDevice", "hadamard_transform::scale"], [0, 0, 1, "_CPPv48identityi14StreamOrDevice", "identity"], [0, 0, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity"], [0, 1, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity::dtype"], [0, 1, 1, "_CPPv48identityi14StreamOrDevice", "identity::n"], [0, 1, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity::n"], [0, 1, 1, "_CPPv48identityi14StreamOrDevice", "identity::s"], [0, 1, 1, "_CPPv48identityi5Dtype14StreamOrDevice", "identity::s"], [0, 0, 1, "_CPPv44imagRK5array14StreamOrDevice", "imag"], [0, 1, 1, "_CPPv44imagRK5array14StreamOrDevice", "imag::a"], [0, 1, 1, "_CPPv44imagRK5array14StreamOrDevice", "imag::s"], [0, 0, 1, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner"], [0, 1, 1, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner::a"], [0, 1, 1, 
"_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner::b"], [0, 1, 1, "_CPPv45innerRK5arrayRK5array14StreamOrDevice", "inner::s"], [0, 0, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::a"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::atol"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::b"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::equal_nan"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::rtol"], [0, 1, 1, "_CPPv47iscloseRK5arrayRK5arrayddb14StreamOrDevice", "isclose::s"], [0, 0, 1, "_CPPv48isfiniteRK5array14StreamOrDevice", "isfinite"], [0, 1, 1, "_CPPv48isfiniteRK5array14StreamOrDevice", "isfinite::a"], [0, 1, 1, "_CPPv48isfiniteRK5array14StreamOrDevice", "isfinite::s"], [0, 0, 1, "_CPPv45isinfRK5array14StreamOrDevice", "isinf"], [0, 1, 1, "_CPPv45isinfRK5array14StreamOrDevice", "isinf::a"], [0, 1, 1, "_CPPv45isinfRK5array14StreamOrDevice", "isinf::s"], [0, 0, 1, "_CPPv45isnanRK5array14StreamOrDevice", "isnan"], [0, 1, 1, "_CPPv45isnanRK5array14StreamOrDevice", "isnan::a"], [0, 1, 1, "_CPPv45isnanRK5array14StreamOrDevice", "isnan::s"], [0, 0, 1, "_CPPv48isneginfRK5array14StreamOrDevice", "isneginf"], [0, 1, 1, "_CPPv48isneginfRK5array14StreamOrDevice", "isneginf::a"], [0, 1, 1, "_CPPv48isneginfRK5array14StreamOrDevice", "isneginf::s"], [0, 0, 1, "_CPPv48isposinfRK5array14StreamOrDevice", "isposinf"], [0, 1, 1, "_CPPv48isposinfRK5array14StreamOrDevice", "isposinf::a"], [0, 1, 1, "_CPPv48isposinfRK5array14StreamOrDevice", "isposinf::s"], [0, 0, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift"], [0, 1, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift::a"], [0, 1, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift::b"], [0, 1, 1, "_CPPv410left_shiftRK5arrayRK5array14StreamOrDevice", "left_shift::s"], 
[0, 0, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less"], [0, 1, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less::a"], [0, 1, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less::b"], [0, 1, 1, "_CPPv44lessRK5arrayRK5array14StreamOrDevice", "less::s"], [0, 0, 1, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal"], [0, 1, 1, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal::a"], [0, 1, 1, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal::b"], [0, 1, 1, "_CPPv410less_equalRK5arrayRK5array14StreamOrDevice", "less_equal::s"], [0, 0, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::dtype"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::num"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::s"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::start"], [0, 1, 1, "_CPPv48linspaceddi5Dtype14StreamOrDevice", "linspace::stop"], [0, 0, 1, "_CPPv43logRK5array14StreamOrDevice", "log"], [0, 0, 1, "_CPPv45log10RK5array14StreamOrDevice", "log10"], [0, 1, 1, "_CPPv45log10RK5array14StreamOrDevice", "log10::a"], [0, 1, 1, "_CPPv45log10RK5array14StreamOrDevice", "log10::s"], [0, 0, 1, "_CPPv45log1pRK5array14StreamOrDevice", "log1p"], [0, 1, 1, "_CPPv45log1pRK5array14StreamOrDevice", "log1p::a"], [0, 1, 1, "_CPPv45log1pRK5array14StreamOrDevice", "log1p::s"], [0, 0, 1, "_CPPv44log2RK5array14StreamOrDevice", "log2"], [0, 1, 1, "_CPPv44log2RK5array14StreamOrDevice", "log2::a"], [0, 1, 1, "_CPPv44log2RK5array14StreamOrDevice", "log2::s"], [0, 1, 1, "_CPPv43logRK5array14StreamOrDevice", "log::a"], [0, 1, 1, "_CPPv43logRK5array14StreamOrDevice", "log::s"], [0, 0, 1, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp"], [0, 1, 1, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp::a"], [0, 1, 1, "_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp::b"], [0, 1, 1, 
"_CPPv49logaddexpRK5arrayRK5array14StreamOrDevice", "logaddexp::s"], [0, 0, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and"], [0, 1, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and::a"], [0, 1, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and::b"], [0, 1, 1, "_CPPv411logical_andRK5arrayRK5array14StreamOrDevice", "logical_and::s"], [0, 0, 1, "_CPPv411logical_notRK5array14StreamOrDevice", "logical_not"], [0, 1, 1, "_CPPv411logical_notRK5array14StreamOrDevice", "logical_not::a"], [0, 1, 1, "_CPPv411logical_notRK5array14StreamOrDevice", "logical_not::s"], [0, 0, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or"], [0, 1, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or::a"], [0, 1, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or::b"], [0, 1, 1, "_CPPv410logical_orRK5arrayRK5array14StreamOrDevice", "logical_or::s"], [0, 0, 1, "_CPPv49logsumexpRK5array14StreamOrDevice", "logsumexp"], [0, 0, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp"], [0, 0, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp"], [0, 0, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp"], [0, 1, 1, "_CPPv49logsumexpRK5array14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::a"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::axes"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::axis"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::keepdims"], [0, 1, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp::keepdims"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::keepdims"], [0, 1, 1, 
"_CPPv49logsumexpRK5array14StreamOrDevice", "logsumexp::s"], [0, 1, 1, "_CPPv49logsumexpRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "logsumexp::s"], [0, 1, 1, "_CPPv49logsumexpRK5arrayb14StreamOrDevice", "logsumexp::s"], [0, 1, 1, "_CPPv49logsumexpRK5arrayib14StreamOrDevice", "logsumexp::s"], [0, 0, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", "matmul"], [0, 1, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", "matmul::a"], [0, 1, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", "matmul::b"], [0, 1, 1, "_CPPv46matmulRK5arrayRK5array14StreamOrDevice", "matmul::s"], [0, 0, 1, "_CPPv43maxRK5array14StreamOrDevice", "max"], [0, 0, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max"], [0, 0, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max"], [0, 0, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max"], [0, 1, 1, "_CPPv43maxRK5array14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::a"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::axes"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::axis"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::keepdims"], [0, 1, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max::keepdims"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::keepdims"], [0, 1, 1, "_CPPv43maxRK5array14StreamOrDevice", "max::s"], [0, 1, 1, "_CPPv43maxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "max::s"], [0, 1, 1, "_CPPv43maxRK5arrayb14StreamOrDevice", "max::s"], [0, 1, 1, "_CPPv43maxRK5arrayib14StreamOrDevice", "max::s"], [0, 0, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum"], [0, 1, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum::a"], [0, 1, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum::b"], [0, 1, 1, "_CPPv47maximumRK5arrayRK5array14StreamOrDevice", "maximum::s"], 
[0, 0, 1, "_CPPv44meanRK5array14StreamOrDevice", "mean"], [0, 0, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean"], [0, 0, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean"], [0, 0, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean"], [0, 1, 1, "_CPPv44meanRK5array14StreamOrDevice", "mean::a"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::a"], [0, 1, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean::a"], [0, 1, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean::a"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::axes"], [0, 1, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean::axis"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::keepdims"], [0, 1, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean::keepdims"], [0, 1, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean::keepdims"], [0, 1, 1, "_CPPv44meanRK5array14StreamOrDevice", "mean::s"], [0, 1, 1, "_CPPv44meanRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "mean::s"], [0, 1, 1, "_CPPv44meanRK5arrayb14StreamOrDevice", "mean::s"], [0, 1, 1, "_CPPv44meanRK5arrayib14StreamOrDevice", "mean::s"], [0, 0, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::arrays"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::indexing"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::s"], [0, 1, 1, "_CPPv48meshgridRKNSt6vectorI5arrayEEbNSt6stringE14StreamOrDevice", "meshgrid::sparse"], [0, 0, 1, "_CPPv43minRK5array14StreamOrDevice", "min"], [0, 0, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min"], [0, 0, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min"], [0, 0, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min"], [0, 1, 1, "_CPPv43minRK5array14StreamOrDevice", "min::a"], [0, 1, 1, 
"_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::a"], [0, 1, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min::a"], [0, 1, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min::a"], [0, 1, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::axes"], [0, 1, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min::axis"], [0, 1, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::keepdims"], [0, 1, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min::keepdims"], [0, 1, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min::keepdims"], [0, 1, 1, "_CPPv43minRK5array14StreamOrDevice", "min::s"], [0, 1, 1, "_CPPv43minRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "min::s"], [0, 1, 1, "_CPPv43minRK5arrayb14StreamOrDevice", "min::s"], [0, 1, 1, "_CPPv43minRK5arrayib14StreamOrDevice", "min::s"], [0, 0, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum"], [0, 1, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum::a"], [0, 1, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum::b"], [0, 1, 1, "_CPPv47minimumRK5arrayRK5array14StreamOrDevice", "minimum::s"], [0, 0, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::a"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::destination"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::s"], [0, 1, 1, "_CPPv48moveaxisRK5arrayii14StreamOrDevice", "moveaxis::source"], [0, 0, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply"], [0, 1, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply::a"], [0, 1, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply::b"], [0, 1, 1, "_CPPv48multiplyRK5arrayRK5array14StreamOrDevice", "multiply::s"], [0, 0, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::a"], [0, 1, 1, 
"_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::nan"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::neginf"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::posinf"], [0, 1, 1, "_CPPv410nan_to_numRK5arrayfKNSt8optionalIfEEKNSt8optionalIfEE14StreamOrDevice", "nan_to_num::s"], [0, 0, 1, "_CPPv48negativeRK5array14StreamOrDevice", "negative"], [0, 1, 1, "_CPPv48negativeRK5array14StreamOrDevice", "negative::a"], [0, 1, 1, "_CPPv48negativeRK5array14StreamOrDevice", "negative::s"], [0, 0, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal"], [0, 1, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal::a"], [0, 1, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal::b"], [0, 1, 1, "_CPPv49not_equalRK5arrayRK5array14StreamOrDevice", "not_equal::s"], [0, 0, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::a"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::axes"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::dtype"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::inverted"], [0, 1, 1, "_CPPv418number_of_elementsRK5arrayNSt6vectorIiEEb5Dtype14StreamOrDevice", "number_of_elements::s"], [0, 0, 1, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", "ones"], [0, 0, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones::dtype"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", "ones::s"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones::s"], [0, 1, 
1, "_CPPv44onesRKNSt6vectorIiEE14StreamOrDevice", "ones::shape"], [0, 1, 1, "_CPPv44onesRKNSt6vectorIiEE5Dtype14StreamOrDevice", "ones::shape"], [0, 0, 1, "_CPPv49ones_likeRK5array14StreamOrDevice", "ones_like"], [0, 1, 1, "_CPPv49ones_likeRK5array14StreamOrDevice", "ones_like::a"], [0, 1, 1, "_CPPv49ones_likeRK5array14StreamOrDevice", "ones_like::s"], [0, 0, 1, "_CPPv4I0Ene5array1TRK5array", "operator!="], [0, 0, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!="], [0, 0, 1, "_CPPv4neRK5arrayRK5array", "operator!="], [0, 2, 1, "_CPPv4I0Ene5array1TRK5array", "operator!=::T"], [0, 2, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!=::T"], [0, 1, 1, "_CPPv4I0Ene5array1TRK5array", "operator!=::a"], [0, 1, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!=::a"], [0, 1, 1, "_CPPv4neRK5arrayRK5array", "operator!=::a"], [0, 1, 1, "_CPPv4I0Ene5array1TRK5array", "operator!=::b"], [0, 1, 1, "_CPPv4I0Ene5arrayRK5array1T", "operator!=::b"], [0, 1, 1, "_CPPv4neRK5arrayRK5array", "operator!=::b"], [0, 0, 1, "_CPPv4I0Erm5array1TRK5array", "operator%"], [0, 0, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%"], [0, 0, 1, "_CPPv4rmRK5arrayRK5array", "operator%"], [0, 2, 1, "_CPPv4I0Erm5array1TRK5array", "operator%::T"], [0, 2, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%::T"], [0, 1, 1, "_CPPv4I0Erm5array1TRK5array", "operator%::a"], [0, 1, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%::a"], [0, 1, 1, "_CPPv4rmRK5arrayRK5array", "operator%::a"], [0, 1, 1, "_CPPv4I0Erm5array1TRK5array", "operator%::b"], [0, 1, 1, "_CPPv4I0Erm5arrayRK5array1T", "operator%::b"], [0, 1, 1, "_CPPv4rmRK5arrayRK5array", "operator%::b"], [0, 0, 1, "_CPPv4anRK5arrayRK5array", "operator&amp;"], [0, 0, 1, "_CPPv4aaRK5arrayRK5array", "operator&amp;&amp;"], [0, 1, 1, "_CPPv4aaRK5arrayRK5array", "operator&amp;&amp;::a"], [0, 1, 1, "_CPPv4aaRK5arrayRK5array", "operator&amp;&amp;::b"], [0, 1, 1, "_CPPv4anRK5arrayRK5array", "operator&amp;::a"], [0, 1, 1, "_CPPv4anRK5arrayRK5array", "operator&amp;::b"], [0, 0, 1, 
"_CPPv4I0Eml5array1TRK5array", "operator*"], [0, 0, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*"], [0, 0, 1, "_CPPv4mlRK5arrayRK5array", "operator*"], [0, 2, 1, "_CPPv4I0Eml5array1TRK5array", "operator*::T"], [0, 2, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*::T"], [0, 1, 1, "_CPPv4I0Eml5array1TRK5array", "operator*::a"], [0, 1, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*::a"], [0, 1, 1, "_CPPv4mlRK5arrayRK5array", "operator*::a"], [0, 1, 1, "_CPPv4I0Eml5array1TRK5array", "operator*::b"], [0, 1, 1, "_CPPv4I0Eml5arrayRK5array1T", "operator*::b"], [0, 1, 1, "_CPPv4mlRK5arrayRK5array", "operator*::b"], [0, 0, 1, "_CPPv4I0Epl5array1TRK5array", "operator+"], [0, 0, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+"], [0, 0, 1, "_CPPv4plRK5arrayRK5array", "operator+"], [0, 2, 1, "_CPPv4I0Epl5array1TRK5array", "operator+::T"], [0, 2, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+::T"], [0, 1, 1, "_CPPv4I0Epl5array1TRK5array", "operator+::a"], [0, 1, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+::a"], [0, 1, 1, "_CPPv4plRK5arrayRK5array", "operator+::a"], [0, 1, 1, "_CPPv4I0Epl5array1TRK5array", "operator+::b"], [0, 1, 1, "_CPPv4I0Epl5arrayRK5array1T", "operator+::b"], [0, 1, 1, "_CPPv4plRK5arrayRK5array", "operator+::b"], [0, 0, 1, "_CPPv4I0Emi5array1TRK5array", "operator-"], [0, 0, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-"], [0, 0, 1, "_CPPv4miRK5array", "operator-"], [0, 0, 1, "_CPPv4miRK5arrayRK5array", "operator-"], [0, 2, 1, "_CPPv4I0Emi5array1TRK5array", "operator-::T"], [0, 2, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-::T"], [0, 1, 1, "_CPPv4I0Emi5array1TRK5array", "operator-::a"], [0, 1, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-::a"], [0, 1, 1, "_CPPv4miRK5array", "operator-::a"], [0, 1, 1, "_CPPv4miRK5arrayRK5array", "operator-::a"], [0, 1, 1, "_CPPv4I0Emi5array1TRK5array", "operator-::b"], [0, 1, 1, "_CPPv4I0Emi5arrayRK5array1T", "operator-::b"], [0, 1, 1, "_CPPv4miRK5arrayRK5array", "operator-::b"], [0, 0, 1, "_CPPv4dvRK5arrayRK5array", "operator/"], [0, 0, 
1, "_CPPv4dvRK5arrayd", "operator/"], [0, 0, 1, "_CPPv4dvdRK5array", "operator/"], [0, 1, 1, "_CPPv4dvRK5arrayRK5array", "operator/::a"], [0, 1, 1, "_CPPv4dvRK5arrayd", "operator/::a"], [0, 1, 1, "_CPPv4dvdRK5array", "operator/::a"], [0, 1, 1, "_CPPv4dvRK5arrayRK5array", "operator/::b"], [0, 1, 1, "_CPPv4dvRK5arrayd", "operator/::b"], [0, 1, 1, "_CPPv4dvdRK5array", "operator/::b"], [0, 0, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;"], [0, 0, 1, "_CPPv4I0Elt5arrayRK5array1T", "operator&lt;"], [0, 0, 1, "_CPPv4ltRK5arrayRK5array", "operator&lt;"], [0, 2, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;::T"], [0, 2, 1, "_CPPv4I0Elt5arrayRK5array1T", "operator&lt;::T"], [0, 1, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;::a"], [0, 1, 1, "_CPPv4I0Elt5arrayRK5array1T", "operator&lt;::a"], [0, 1, 1, "_CPPv4ltRK5arrayRK5array", "operator&lt;::a"], [0, 1, 1, "_CPPv4I0Elt5array1TRK5array", "operator&lt;::b"], [0, 1, 1, "_CPPv4I0Elt5arrayRK5array1T", "operator&lt;::b"], [0, 1, 1, "_CPPv4ltRK5arrayRK5array", "operator&lt;::b"], [0, 0, 1, "_CPPv4lsRK5arrayRK5array", "operator&lt;&lt;"], [0, 1, 1, "_CPPv4lsRK5arrayRK5array", "operator&lt;&lt;::a"], [0, 1, 1, "_CPPv4lsRK5arrayRK5array", "operator&lt;&lt;::b"], [0, 0, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;="], [0, 0, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;="], [0, 0, 1, "_CPPv4leRK5arrayRK5array", "operator&lt;="], [0, 2, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;=::T"], [0, 2, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;=::T"], [0, 1, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;=::a"], [0, 1, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;=::a"], [0, 1, 1, "_CPPv4leRK5arrayRK5array", "operator&lt;=::a"], [0, 1, 1, "_CPPv4I0Ele5array1TRK5array", "operator&lt;=::b"], [0, 1, 1, "_CPPv4I0Ele5arrayRK5array1T", "operator&lt;=::b"], [0, 1, 1, "_CPPv4leRK5arrayRK5array", "operator&lt;=::b"], [0, 0, 1, "_CPPv4I0Eeq5array1TRK5array", "operator=="], [0, 0, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator=="], 
[0, 0, 1, "_CPPv4eqRK5arrayRK5array", "operator=="], [0, 2, 1, "_CPPv4I0Eeq5array1TRK5array", "operator==::T"], [0, 2, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator==::T"], [0, 1, 1, "_CPPv4I0Eeq5array1TRK5array", "operator==::a"], [0, 1, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator==::a"], [0, 1, 1, "_CPPv4eqRK5arrayRK5array", "operator==::a"], [0, 1, 1, "_CPPv4I0Eeq5array1TRK5array", "operator==::b"], [0, 1, 1, "_CPPv4I0Eeq5arrayRK5array1T", "operator==::b"], [0, 1, 1, "_CPPv4eqRK5arrayRK5array", "operator==::b"], [0, 0, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;"], [0, 0, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;"], [0, 0, 1, "_CPPv4gtRK5arrayRK5array", "operator&gt;"], [0, 2, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;::T"], [0, 2, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;::T"], [0, 1, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;::a"], [0, 1, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;::a"], [0, 1, 1, "_CPPv4gtRK5arrayRK5array", "operator&gt;::a"], [0, 1, 1, "_CPPv4I0Egt5array1TRK5array", "operator&gt;::b"], [0, 1, 1, "_CPPv4I0Egt5arrayRK5array1T", "operator&gt;::b"], [0, 1, 1, "_CPPv4gtRK5arrayRK5array", "operator&gt;::b"], [0, 0, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;="], [0, 0, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;="], [0, 0, 1, "_CPPv4geRK5arrayRK5array", "operator&gt;="], [0, 2, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;=::T"], [0, 2, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;=::T"], [0, 1, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;=::a"], [0, 1, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;=::a"], [0, 1, 1, "_CPPv4geRK5arrayRK5array", "operator&gt;=::a"], [0, 1, 1, "_CPPv4I0Ege5array1TRK5array", "operator&gt;=::b"], [0, 1, 1, "_CPPv4I0Ege5arrayRK5array1T", "operator&gt;=::b"], [0, 1, 1, "_CPPv4geRK5arrayRK5array", "operator&gt;=::b"], [0, 0, 1, "_CPPv4rsRK5arrayRK5array", "operator&gt;&gt;"], [0, 1, 1, "_CPPv4rsRK5arrayRK5array", "operator&gt;&gt;::a"], [0, 1, 1, "_CPPv4rsRK5arrayRK5array", 
"operator&gt;&gt;::b"], [0, 0, 1, "_CPPv4eoRK5arrayRK5array", "operator^"], [0, 1, 1, "_CPPv4eoRK5arrayRK5array", "operator^::a"], [0, 1, 1, "_CPPv4eoRK5arrayRK5array", "operator^::b"], [0, 0, 1, "_CPPv4orRK5arrayRK5array", "operator|"], [0, 1, 1, "_CPPv4orRK5arrayRK5array", "operator|::a"], [0, 1, 1, "_CPPv4orRK5arrayRK5array", "operator|::b"], [0, 0, 1, "_CPPv4ooRK5arrayRK5array", "operator||"], [0, 1, 1, "_CPPv4ooRK5arrayRK5array", "operator||::a"], [0, 1, 1, "_CPPv4ooRK5arrayRK5array", "operator||::b"], [0, 0, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer"], [0, 1, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer::a"], [0, 1, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer::b"], [0, 1, 1, "_CPPv45outerRK5arrayRK5array14StreamOrDevice", "outer::s"], [0, 0, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 0, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 0, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 0, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::a"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::axes"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::high_pad_size"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", 
"pad::low_pad_size"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::mode"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_value"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_width"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_width"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::pad_width"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt4pairIiiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorINSt4pairIiiEEEERK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 1, 1, "_CPPv43padRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEERKNSt6vectorIiEERK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 1, 1, "_CPPv43padRK5arrayiRK5arrayKNSt6stringE14StreamOrDevice", "pad::s"], [0, 0, 1, "_CPPv49partitionRK5arrayi14StreamOrDevice", "partition"], [0, 0, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition"], [0, 1, 1, "_CPPv49partitionRK5arrayi14StreamOrDevice", "partition::a"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::a"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::axis"], [0, 1, 1, 
"_CPPv49partitionRK5arrayi14StreamOrDevice", "partition::kth"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::kth"], [0, 1, 1, "_CPPv49partitionRK5arrayi14StreamOrDevice", "partition::s"], [0, 1, 1, "_CPPv49partitionRK5arrayii14StreamOrDevice", "partition::s"], [0, 0, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", "power"], [0, 1, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", "power::a"], [0, 1, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", "power::b"], [0, 1, 1, "_CPPv45powerRK5arrayRK5array14StreamOrDevice", "power::s"], [0, 0, 1, "_CPPv44prodRK5array14StreamOrDevice", "prod"], [0, 0, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod"], [0, 0, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod"], [0, 0, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod"], [0, 1, 1, "_CPPv44prodRK5array14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::a"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::axes"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::axis"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::keepdims"], [0, 1, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod::keepdims"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::keepdims"], [0, 1, 1, "_CPPv44prodRK5array14StreamOrDevice", "prod::s"], [0, 1, 1, "_CPPv44prodRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "prod::s"], [0, 1, 1, "_CPPv44prodRK5arrayb14StreamOrDevice", "prod::s"], [0, 1, 1, "_CPPv44prodRK5arrayib14StreamOrDevice", "prod::s"], [0, 0, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::a"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", 
"put_along_axis::axis"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::indices"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::s"], [0, 1, 1, "_CPPv414put_along_axisRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "put_along_axis::values"], [0, 0, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize"], [0, 1, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::bits"], [0, 1, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::group_size"], [0, 1, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::s"], [0, 1, 1, "_CPPv48quantizeRK5arrayii14StreamOrDevice", "quantize::w"], [0, 0, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::biases"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::bits"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::group_size"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::s"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::scales"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::transpose"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::w"], [0, 1, 1, "_CPPv416quantized_matmul5array5array5array5arraybii14StreamOrDevice", "quantized_matmul::x"], [0, 0, 1, "_CPPv47radiansRK5array14StreamOrDevice", "radians"], [0, 1, 1, "_CPPv47radiansRK5array14StreamOrDevice", "radians::a"], [0, 1, 1, "_CPPv47radiansRK5array14StreamOrDevice", "radians::s"], [0, 0, 1, "_CPPv44realRK5array14StreamOrDevice", "real"], [0, 1, 1, "_CPPv44realRK5array14StreamOrDevice", "real::a"], [0, 1, 1, 
"_CPPv44realRK5array14StreamOrDevice", "real::s"], [0, 0, 1, "_CPPv410reciprocalRK5array14StreamOrDevice", "reciprocal"], [0, 1, 1, "_CPPv410reciprocalRK5array14StreamOrDevice", "reciprocal::a"], [0, 1, 1, "_CPPv410reciprocalRK5array14StreamOrDevice", "reciprocal::s"], [0, 0, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", "remainder"], [0, 1, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", "remainder::a"], [0, 1, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", "remainder::b"], [0, 1, 1, "_CPPv49remainderRK5arrayRK5array14StreamOrDevice", "remainder::s"], [0, 0, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat"], [0, 0, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat"], [0, 1, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat::arr"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::arr"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::axis"], [0, 1, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat::repeats"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::repeats"], [0, 1, 1, "_CPPv46repeatRK5arrayi14StreamOrDevice", "repeat::s"], [0, 1, 1, "_CPPv46repeatRK5arrayii14StreamOrDevice", "repeat::s"], [0, 0, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape"], [0, 1, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape::a"], [0, 1, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape::s"], [0, 1, 1, "_CPPv47reshapeRK5arrayNSt6vectorIiEE14StreamOrDevice", "reshape::shape"], [0, 0, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift"], [0, 1, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift::a"], [0, 1, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift::b"], [0, 1, 1, "_CPPv411right_shiftRK5arrayRK5array14StreamOrDevice", "right_shift::s"], [0, 0, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll"], [0, 0, 1, 
"_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll"], [0, 0, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll::a"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::axes"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::axes"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::axis"], [0, 1, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll::axis"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayii14StreamOrDevice", "roll::s"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayi14StreamOrDevice", "roll::shift"], [0, 1, 1, "_CPPv44rollRK5arrayiRKNSt6vectorIiEE14StreamOrDevice", "roll::shift"], [0, 1, 1, 
"_CPPv44rollRK5arrayii14StreamOrDevice", "roll::shift"], [0, 0, 1, "_CPPv45roundRK5array14StreamOrDevice", "round"], [0, 0, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round"], [0, 1, 1, "_CPPv45roundRK5array14StreamOrDevice", "round::a"], [0, 1, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round::a"], [0, 1, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round::decimals"], [0, 1, 1, "_CPPv45roundRK5array14StreamOrDevice", "round::s"], [0, 1, 1, "_CPPv45roundRK5arrayi14StreamOrDevice", "round::s"], [0, 0, 1, "_CPPv45rsqrtRK5array14StreamOrDevice", "rsqrt"], [0, 1, 1, "_CPPv45rsqrtRK5array14StreamOrDevice", "rsqrt::a"], [0, 1, 1, "_CPPv45rsqrtRK5array14StreamOrDevice", "rsqrt::s"], [0, 0, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter"], [0, 0, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::a"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::a"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::axes"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::axis"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::indices"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::indices"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::s"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::s"], [0, 1, 1, "_CPPv47scatterRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter::updates"], [0, 1, 1, "_CPPv47scatterRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter::updates"], [0, 0, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add"], [0, 0, 1, 
"_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::a"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::a"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::axes"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::axis"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::indices"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::indices"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::s"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::s"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_add::updates"], [0, 1, 1, "_CPPv411scatter_addRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_add::updates"], [0, 0, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max"], [0, 0, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::a"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::a"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::axes"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::axis"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::indices"], [0, 1, 1, 
"_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::indices"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::s"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::s"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_max::updates"], [0, 1, 1, "_CPPv411scatter_maxRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_max::updates"], [0, 0, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min"], [0, 0, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::a"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::a"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::axes"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::axis"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::indices"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::indices"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::s"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::s"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_min::updates"], [0, 1, 1, "_CPPv411scatter_minRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_min::updates"], [0, 0, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod"], [0, 0, 1, 
"_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::a"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::a"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::axes"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::axis"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::indices"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::indices"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::s"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::s"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRK5arrayRK5arrayi14StreamOrDevice", "scatter_prod::updates"], [0, 1, 1, "_CPPv412scatter_prodRK5arrayRKNSt6vectorI5arrayEERK5arrayRKNSt6vectorIiEE14StreamOrDevice", "scatter_prod::updates"], [0, 0, 1, "_CPPv47sigmoidRK5array14StreamOrDevice", "sigmoid"], [0, 1, 1, "_CPPv47sigmoidRK5array14StreamOrDevice", "sigmoid::a"], [0, 1, 1, "_CPPv47sigmoidRK5array14StreamOrDevice", "sigmoid::s"], [0, 0, 1, "_CPPv44signRK5array14StreamOrDevice", "sign"], [0, 1, 1, "_CPPv44signRK5array14StreamOrDevice", "sign::a"], [0, 1, 1, "_CPPv44signRK5array14StreamOrDevice", "sign::s"], [0, 0, 1, "_CPPv43sinRK5array14StreamOrDevice", "sin"], [0, 1, 1, "_CPPv43sinRK5array14StreamOrDevice", "sin::a"], [0, 1, 1, "_CPPv43sinRK5array14StreamOrDevice", "sin::s"], [0, 0, 1, "_CPPv44sinhRK5array14StreamOrDevice", "sinh"], [0, 1, 1, "_CPPv44sinhRK5array14StreamOrDevice", "sinh::a"], [0, 1, 1, "_CPPv44sinhRK5array14StreamOrDevice", "sinh::s"], [0, 0, 1, 
"_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice"], [0, 0, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::a"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::a"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::s"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::s"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::start"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::start"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::stop"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::stop"], [0, 1, 1, "_CPPv45sliceRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice::strides"], [0, 0, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update"], [0, 0, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::s"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::s"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::src"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::src"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::start"], [0, 1, 1, 
"_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::start"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::stop"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::stop"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::strides"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::update"], [0, 1, 1, "_CPPv412slice_updateRK5arrayRK5arrayNSt6vectorIiEENSt6vectorIiEENSt6vectorIiEE14StreamOrDevice", "slice_update::update"], [0, 0, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax"], [0, 0, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax"], [0, 0, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::a"], [0, 1, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax::a"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::a"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::axes"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::axis"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::precise"], [0, 1, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax::precise"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::precise"], [0, 1, 1, "_CPPv47softmaxRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "softmax::s"], [0, 1, 1, "_CPPv47softmaxRK5arrayb14StreamOrDevice", "softmax::s"], [0, 1, 1, "_CPPv47softmaxRK5arrayib14StreamOrDevice", "softmax::s"], [0, 0, 1, "_CPPv44sortRK5array14StreamOrDevice", "sort"], [0, 0, 1, "_CPPv44sortRK5arrayi14StreamOrDevice", "sort"], [0, 1, 1, "_CPPv44sortRK5array14StreamOrDevice", "sort::a"], [0, 1, 1, 
"_CPPv44sortRK5arrayi14StreamOrDevice", "sort::a"], [0, 1, 1, "_CPPv44sortRK5arrayi14StreamOrDevice", "sort::axis"], [0, 1, 1, "_CPPv44sortRK5array14StreamOrDevice", "sort::s"], [0, 1, 1, "_CPPv44sortRK5arrayi14StreamOrDevice", "sort::s"], [0, 0, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split"], [0, 0, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split"], [0, 0, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split"], [0, 0, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split::a"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::a"], [0, 1, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split::a"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::a"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::axis"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::axis"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split::indices"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::indices"], [0, 1, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split::num_splits"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::num_splits"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "split::s"], [0, 1, 1, "_CPPv45splitRK5arrayRKNSt6vectorIiEEi14StreamOrDevice", "split::s"], [0, 1, 1, "_CPPv45splitRK5arrayi14StreamOrDevice", "split::s"], [0, 1, 1, "_CPPv45splitRK5arrayii14StreamOrDevice", "split::s"], [0, 0, 1, "_CPPv44sqrtRK5array14StreamOrDevice", "sqrt"], [0, 1, 1, "_CPPv44sqrtRK5array14StreamOrDevice", "sqrt::a"], [0, 1, 1, "_CPPv44sqrtRK5array14StreamOrDevice", "sqrt::s"], [0, 0, 1, "_CPPv46squareRK5array14StreamOrDevice", "square"], [0, 1, 1, "_CPPv46squareRK5array14StreamOrDevice", "square::a"], [0, 1, 1, "_CPPv46squareRK5array14StreamOrDevice", "square::s"], [0, 0, 1, "_CPPv47squeezeRK5array14StreamOrDevice", "squeeze"], 
[0, 0, 1, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze"], [0, 0, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze"], [0, 1, 1, "_CPPv47squeezeRK5array14StreamOrDevice", "squeeze::a"], [0, 1, 1, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze::a"], [0, 1, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze::a"], [0, 1, 1, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze::axes"], [0, 1, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze::axis"], [0, 1, 1, "_CPPv47squeezeRK5array14StreamOrDevice", "squeeze::s"], [0, 1, 1, "_CPPv47squeezeRK5arrayRKNSt6vectorIiEE14StreamOrDevice", "squeeze::s"], [0, 1, 1, "_CPPv47squeezeRK5arrayi14StreamOrDevice", "squeeze::s"], [0, 0, 1, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", "stack"], [0, 0, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", "stack::arrays"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack::arrays"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack::axis"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEE14StreamOrDevice", "stack::s"], [0, 1, 1, "_CPPv45stackRKNSt6vectorI5arrayEEi14StreamOrDevice", "stack::s"], [0, 0, 1, "_CPPv4StRK5array14StreamOrDevice", "std"], [0, 0, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std"], [0, 0, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std"], [0, 0, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std"], [0, 1, 1, "_CPPv4StRK5array14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::a"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::axes"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::axis"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::ddof"], [0, 1, 1, 
"_CPPv4StRK5arraybi14StreamOrDevice", "std::ddof"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::ddof"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::keepdims"], [0, 1, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std::keepdims"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::keepdims"], [0, 1, 1, "_CPPv4StRK5array14StreamOrDevice", "std::s"], [0, 1, 1, "_CPPv4StRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "std::s"], [0, 1, 1, "_CPPv4StRK5arraybi14StreamOrDevice", "std::s"], [0, 1, 1, "_CPPv4StRK5arrayibi14StreamOrDevice", "std::s"], [0, 0, 1, "_CPPv413stop_gradientRK5array14StreamOrDevice", "stop_gradient"], [0, 1, 1, "_CPPv413stop_gradientRK5array14StreamOrDevice", "stop_gradient::a"], [0, 1, 1, "_CPPv413stop_gradientRK5array14StreamOrDevice", "stop_gradient::s"], [0, 0, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract"], [0, 1, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract::a"], [0, 1, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract::b"], [0, 1, 1, "_CPPv48subtractRK5arrayRK5array14StreamOrDevice", "subtract::s"], [0, 0, 1, "_CPPv43sumRK5array14StreamOrDevice", "sum"], [0, 0, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum"], [0, 0, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum"], [0, 0, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum"], [0, 1, 1, "_CPPv43sumRK5array14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::a"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::axes"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::axis"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::keepdims"], [0, 1, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum::keepdims"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::keepdims"], [0, 1, 1, 
"_CPPv43sumRK5array14StreamOrDevice", "sum::s"], [0, 1, 1, "_CPPv43sumRK5arrayRKNSt6vectorIiEEb14StreamOrDevice", "sum::s"], [0, 1, 1, "_CPPv43sumRK5arrayb14StreamOrDevice", "sum::s"], [0, 1, 1, "_CPPv43sumRK5arrayib14StreamOrDevice", "sum::s"], [0, 0, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes::a"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes::axis1"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes::axis2"], [0, 1, 1, "_CPPv48swapaxesRK5arrayii14StreamOrDevice", "swapaxes::s"], [0, 0, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take"], [0, 0, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take"], [0, 0, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take"], [0, 0, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take"], [0, 1, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::a"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::axis"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::axis"], [0, 1, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take::index"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::index"], [0, 1, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take::indices"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::indices"], [0, 1, 1, "_CPPv44takeRK5arrayRK5array14StreamOrDevice", "take::s"], [0, 1, 1, "_CPPv44takeRK5arrayRK5arrayi14StreamOrDevice", "take::s"], [0, 1, 1, "_CPPv44takeRK5arrayi14StreamOrDevice", "take::s"], [0, 1, 1, "_CPPv44takeRK5arrayii14StreamOrDevice", "take::s"], [0, 0, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis"], [0, 1, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::a"], [0, 1, 
1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::axis"], [0, 1, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::indices"], [0, 1, 1, "_CPPv415take_along_axisRK5arrayRK5arrayi14StreamOrDevice", "take_along_axis::s"], [0, 0, 1, "_CPPv43tanRK5array14StreamOrDevice", "tan"], [0, 1, 1, "_CPPv43tanRK5array14StreamOrDevice", "tan::a"], [0, 1, 1, "_CPPv43tanRK5array14StreamOrDevice", "tan::s"], [0, 0, 1, "_CPPv44tanhRK5array14StreamOrDevice", "tanh"], [0, 1, 1, "_CPPv44tanhRK5array14StreamOrDevice", "tanh::a"], [0, 1, 1, "_CPPv44tanhRK5array14StreamOrDevice", "tanh::s"], [0, 0, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot"], [0, 0, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::a"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::a"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::axes_a"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::axes_b"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::axis"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::b"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::b"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayKi14StreamOrDevice", "tensordot::s"], [0, 1, 1, "_CPPv49tensordotRK5arrayRK5arrayRKNSt6vectorIiEERKNSt6vectorIiEE14StreamOrDevice", "tensordot::s"], [0, 0, 1, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile"], [0, 1, 1, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile::arr"], [0, 1, 1, "_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile::reps"], [0, 1, 1, 
"_CPPv44tileRK5arrayNSt6vectorIiEE14StreamOrDevice", "tile::s"], [0, 0, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk"], [0, 0, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk"], [0, 1, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk::a"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::a"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::axis"], [0, 1, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk::k"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::k"], [0, 1, 1, "_CPPv44topkRK5arrayi14StreamOrDevice", "topk::s"], [0, 1, 1, "_CPPv44topkRK5arrayii14StreamOrDevice", "topk::s"], [0, 0, 1, "_CPPv45traceRK5array14StreamOrDevice", "trace"], [0, 0, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace"], [0, 0, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace"], [0, 1, 1, "_CPPv45traceRK5array14StreamOrDevice", "trace::a"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::a"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::a"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::axis1"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::axis1"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::axis2"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::axis2"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::dtype"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::offset"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::offset"], [0, 1, 1, "_CPPv45traceRK5array14StreamOrDevice", "trace::s"], [0, 1, 1, "_CPPv45traceRK5arrayiii14StreamOrDevice", "trace::s"], [0, 1, 1, "_CPPv45traceRK5arrayiii5Dtype14StreamOrDevice", "trace::s"], [0, 0, 1, "_CPPv49transposeRK5array14StreamOrDevice", "transpose"], [0, 0, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose"], [0, 0, 1, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose"], [0, 1, 1, 
"_CPPv49transposeRK5array14StreamOrDevice", "transpose::a"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose::a"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose::a"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose::axes"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose::axes"], [0, 1, 1, "_CPPv49transposeRK5array14StreamOrDevice", "transpose::s"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt16initializer_listIiEE14StreamOrDevice", "transpose::s"], [0, 1, 1, "_CPPv49transposeRK5arrayNSt6vectorIiEE14StreamOrDevice", "transpose::s"], [0, 0, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri"], [0, 0, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::k"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::m"], [0, 1, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri::n"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::n"], [0, 1, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri::s"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::s"], [0, 1, 1, "_CPPv43trii5Dtype14StreamOrDevice", "tri::type"], [0, 1, 1, "_CPPv43triiii5Dtype14StreamOrDevice", "tri::type"], [0, 0, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril"], [0, 1, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril::k"], [0, 1, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril::s"], [0, 1, 1, "_CPPv44tril5arrayi14StreamOrDevice", "tril::x"], [0, 0, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu"], [0, 1, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu::k"], [0, 1, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu::s"], [0, 1, 1, "_CPPv44triu5arrayi14StreamOrDevice", "triu::x"], [0, 0, 1, "_CPPv43varRK5array14StreamOrDevice", "var"], [0, 0, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var"], [0, 0, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var"], [0, 0, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var"], [0, 
1, 1, "_CPPv43varRK5array14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::a"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::axes"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::axis"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::ddof"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::ddof"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::ddof"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::keepdims"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::keepdims"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::keepdims"], [0, 1, 1, "_CPPv43varRK5array14StreamOrDevice", "var::s"], [0, 1, 1, "_CPPv43varRK5arrayRKNSt6vectorIiEEbi14StreamOrDevice", "var::s"], [0, 1, 1, "_CPPv43varRK5arraybi14StreamOrDevice", "var::s"], [0, 1, 1, "_CPPv43varRK5arrayibi14StreamOrDevice", "var::s"], [0, 0, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view"], [0, 1, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view::a"], [0, 1, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view::dtype"], [0, 1, 1, "_CPPv44viewRK5arrayRK5Dtype14StreamOrDevice", "view::s"], [0, 0, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::condition"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::s"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::x"], [0, 1, 1, "_CPPv45whereRK5arrayRK5arrayRK5array14StreamOrDevice", "where::y"], [0, 0, 1, "_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", "zeros"], [0, 0, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros::dtype"], [0, 1, 1, 
"_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", "zeros::s"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros::s"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE14StreamOrDevice", "zeros::shape"], [0, 1, 1, "_CPPv45zerosRKNSt6vectorIiEE5Dtype14StreamOrDevice", "zeros::shape"], [0, 0, 1, "_CPPv410zeros_likeRK5array14StreamOrDevice", "zeros_like"], [0, 1, 1, "_CPPv410zeros_likeRK5array14StreamOrDevice", "zeros_like::a"], [0, 1, 1, "_CPPv410zeros_likeRK5array14StreamOrDevice", "zeros_like::s"]], "mlx.core": [[9, 3, 1, "", "Device"], [10, 3, 1, "", "Dtype"], [11, 3, 1, "", "DtypeCategory"], [314, 3, 1, "", "Stream"], [12, 5, 1, "", "abs"], [13, 5, 1, "", "add"], [14, 5, 1, "", "addmm"], [15, 5, 1, "", "all"], [16, 5, 1, "", "allclose"], [17, 5, 1, "", "any"], [18, 5, 1, "", "arange"], [19, 5, 1, "", "arccos"], [20, 5, 1, "", "arccosh"], [21, 5, 1, "", "arcsin"], [22, 5, 1, "", "arcsinh"], [23, 5, 1, "", "arctan"], [24, 5, 1, "", "arctan2"], [25, 5, 1, "", "arctanh"], [26, 5, 1, "", "argmax"], [27, 5, 1, "", "argmin"], [28, 5, 1, "", "argpartition"], [29, 5, 1, "", "argsort"], [30, 3, 1, "", "array"], [82, 5, 1, "", "array_equal"], [83, 5, 1, "", "as_strided"], [84, 5, 1, "", "atleast_1d"], [85, 5, 1, "", "atleast_2d"], [86, 5, 1, "", "atleast_3d"], [87, 5, 1, "", "bitwise_and"], [88, 5, 1, "", "bitwise_or"], [89, 5, 1, "", "bitwise_xor"], [90, 5, 1, "", "block_masked_mm"], [91, 5, 1, "", "broadcast_to"], [92, 5, 1, "", "ceil"], [93, 5, 1, "", "clip"], [94, 5, 1, "", "compile"], [95, 5, 1, "", "concatenate"], [96, 5, 1, "", "conj"], [97, 5, 1, "", "conjugate"], [98, 5, 1, "", "conv1d"], [99, 5, 1, "", "conv2d"], [100, 5, 1, "", "conv3d"], [101, 5, 1, "", "conv_general"], [102, 5, 1, "", "conv_transpose1d"], [103, 5, 1, "", "conv_transpose2d"], [104, 5, 1, "", "conv_transpose3d"], [105, 5, 1, "", "convolve"], [106, 5, 1, "", "cos"], [107, 5, 1, "", "cosh"], [108, 5, 1, "", "cummax"], [109, 5, 1, "", "cummin"], [110, 5, 1, "", "cumprod"], [111, 5, 1, "", 
"cumsum"], [112, 3, 1, "", "custom_function"], [113, 5, 1, "", "default_device"], [114, 5, 1, "", "default_stream"], [115, 5, 1, "", "degrees"], [116, 5, 1, "", "dequantize"], [117, 5, 1, "", "diag"], [118, 5, 1, "", "diagonal"], [119, 5, 1, "", "disable_compile"], [128, 5, 1, "", "divide"], [129, 5, 1, "", "divmod"], [130, 5, 1, "", "einsum"], [131, 5, 1, "", "einsum_path"], [132, 5, 1, "", "enable_compile"], [133, 5, 1, "", "equal"], [134, 5, 1, "", "erf"], [135, 5, 1, "", "erfinv"], [136, 5, 1, "", "eval"], [137, 5, 1, "", "exp"], [138, 5, 1, "", "expand_dims"], [139, 5, 1, "", "expm1"], [140, 5, 1, "", "eye"], [158, 5, 1, "", "flatten"], [159, 5, 1, "", "floor"], [160, 5, 1, "", "floor_divide"], [161, 5, 1, "", "full"], [162, 5, 1, "", "gather_mm"], [163, 5, 1, "", "gather_qmm"], [164, 5, 1, "", "grad"], [165, 5, 1, "", "greater"], [166, 5, 1, "", "greater_equal"], [167, 5, 1, "", "hadamard_transform"], [168, 5, 1, "", "identity"], [169, 5, 1, "", "imag"], [170, 5, 1, "", "inner"], [171, 5, 1, "", "isclose"], [172, 5, 1, "", "isfinite"], [173, 5, 1, "", "isinf"], [174, 5, 1, "", "isnan"], [175, 5, 1, "", "isneginf"], [176, 5, 1, "", "isposinf"], [177, 5, 1, "", "issubdtype"], [178, 5, 1, "", "jvp"], [179, 5, 1, "", "left_shift"], [180, 5, 1, "", "less"], [181, 5, 1, "", "less_equal"], [192, 5, 1, "", "linspace"], [193, 5, 1, "", "load"], [194, 5, 1, "", "log"], [195, 5, 1, "", "log10"], [196, 5, 1, "", "log1p"], [197, 5, 1, "", "log2"], [198, 5, 1, "", "logaddexp"], [199, 5, 1, "", "logical_and"], [200, 5, 1, "", "logical_not"], [201, 5, 1, "", "logical_or"], [202, 5, 1, "", "logsumexp"], [203, 5, 1, "", "matmul"], [204, 5, 1, "", "max"], [205, 5, 1, "", "maximum"], [206, 5, 1, "", "mean"], [207, 5, 1, "", "meshgrid"], [220, 5, 1, "", "min"], [221, 5, 1, "", "minimum"], [222, 5, 1, "", "moveaxis"], [223, 5, 1, "", "multiply"], [224, 5, 1, "", "nan_to_num"], [225, 5, 1, "", "negative"], [226, 5, 1, "", "new_stream"], [227, 5, 1, "", "not_equal"], [228, 5, 1, "", 
"ones"], [229, 5, 1, "", "ones_like"], [230, 5, 1, "", "outer"], [231, 5, 1, "", "pad"], [232, 5, 1, "", "partition"], [233, 5, 1, "", "power"], [234, 5, 1, "", "prod"], [235, 5, 1, "", "put_along_axis"], [236, 5, 1, "", "quantize"], [237, 5, 1, "", "quantized_matmul"], [238, 5, 1, "", "radians"], [252, 5, 1, "", "real"], [253, 5, 1, "", "reciprocal"], [254, 5, 1, "", "remainder"], [255, 5, 1, "", "repeat"], [256, 5, 1, "", "reshape"], [257, 5, 1, "", "right_shift"], [258, 5, 1, "", "roll"], [259, 5, 1, "", "round"], [260, 5, 1, "", "rsqrt"], [261, 5, 1, "", "save"], [262, 5, 1, "", "save_gguf"], [263, 5, 1, "", "save_safetensors"], [264, 5, 1, "", "savez"], [265, 5, 1, "", "savez_compressed"], [266, 5, 1, "", "set_default_device"], [267, 5, 1, "", "set_default_stream"], [268, 5, 1, "", "sigmoid"], [269, 5, 1, "", "sign"], [270, 5, 1, "", "sin"], [271, 5, 1, "", "sinh"], [272, 5, 1, "", "softmax"], [273, 5, 1, "", "sort"], [274, 5, 1, "", "split"], [275, 5, 1, "", "sqrt"], [276, 5, 1, "", "square"], [277, 5, 1, "", "squeeze"], [278, 5, 1, "", "stack"], [279, 5, 1, "", "std"], [280, 5, 1, "", "stop_gradient"], [281, 5, 1, "", "stream"], [282, 5, 1, "", "subtract"], [283, 5, 1, "", "sum"], [284, 5, 1, "", "swapaxes"], [285, 5, 1, "", "synchronize"], [286, 5, 1, "", "take"], [287, 5, 1, "", "take_along_axis"], [288, 5, 1, "", "tan"], [289, 5, 1, "", "tanh"], [290, 5, 1, "", "tensordot"], [291, 5, 1, "", "tile"], [292, 5, 1, "", "topk"], [293, 5, 1, "", "trace"], [294, 5, 1, "", "transpose"], [295, 5, 1, "", "tri"], [296, 5, 1, "", "tril"], [297, 5, 1, "", "triu"], [298, 5, 1, "", "value_and_grad"], [299, 5, 1, "", "var"], [300, 5, 1, "", "view"], [301, 5, 1, "", "vjp"], [302, 5, 1, "", "vmap"], [303, 5, 1, "", "where"], [304, 5, 1, "", "zeros"], [305, 5, 1, "", "zeros_like"]], "mlx.core.Device": [[9, 4, 1, "", "__init__"]], "mlx.core.Dtype": [[10, 4, 1, "", "__init__"]], "mlx.core.DtypeCategory": [[11, 4, 1, "", "__init__"]], "mlx.core.Stream": [[314, 4, 1, "", 
"__init__"]], "mlx.core.array": [[31, 6, 1, "", "T"], [30, 4, 1, "", "__init__"], [32, 4, 1, "", "abs"], [33, 4, 1, "", "all"], [34, 4, 1, "", "any"], [35, 4, 1, "", "argmax"], [36, 4, 1, "", "argmin"], [37, 4, 1, "", "astype"], [38, 6, 1, "", "at"], [39, 4, 1, "", "conj"], [40, 4, 1, "", "cos"], [41, 4, 1, "", "cummax"], [42, 4, 1, "", "cummin"], [43, 4, 1, "", "cumprod"], [44, 4, 1, "", "cumsum"], [45, 4, 1, "", "diag"], [46, 4, 1, "", "diagonal"], [47, 6, 1, "", "dtype"], [48, 4, 1, "", "exp"], [49, 4, 1, "", "flatten"], [50, 4, 1, "", "item"], [51, 6, 1, "", "itemsize"], [52, 4, 1, "", "log"], [53, 4, 1, "", "log10"], [54, 4, 1, "", "log1p"], [55, 4, 1, "", "log2"], [56, 4, 1, "", "logsumexp"], [57, 4, 1, "", "max"], [58, 4, 1, "", "mean"], [59, 4, 1, "", "min"], [60, 4, 1, "", "moveaxis"], [61, 6, 1, "", "nbytes"], [62, 6, 1, "", "ndim"], [63, 4, 1, "", "prod"], [64, 4, 1, "", "reciprocal"], [65, 4, 1, "", "reshape"], [66, 4, 1, "", "round"], [67, 4, 1, "", "rsqrt"], [68, 6, 1, "", "shape"], [69, 4, 1, "", "sin"], [70, 6, 1, "", "size"], [71, 4, 1, "", "split"], [72, 4, 1, "", "sqrt"], [73, 4, 1, "", "square"], [74, 4, 1, "", "squeeze"], [75, 4, 1, "", "std"], [76, 4, 1, "", "sum"], [77, 4, 1, "", "swapaxes"], [78, 4, 1, "", "tolist"], [79, 4, 1, "", "transpose"], [80, 4, 1, "", "var"], [81, 4, 1, "", "view"]], "mlx.core.custom_function": [[112, 4, 1, "", "__init__"]], "mlx.core.distributed": [[120, 3, 1, "", "Group"], [121, 5, 1, "", "all_gather"], [122, 5, 1, "", "all_sum"], [123, 5, 1, "", "init"], [124, 5, 1, "", "is_available"], [125, 5, 1, "", "recv"], [126, 5, 1, "", "recv_like"], [127, 5, 1, "", "send"]], "mlx.core.distributed.Group": [[120, 4, 1, "", "__init__"]], "mlx.core.fast": [[141, 5, 1, "", "layer_norm"], [142, 5, 1, "", "metal_kernel"], [143, 5, 1, "", "rms_norm"], [144, 5, 1, "", "rope"], [145, 5, 1, "", "scaled_dot_product_attention"]], "mlx.core.fft": [[146, 5, 1, "", "fft"], [147, 5, 1, "", "fft2"], [148, 5, 1, "", "fftn"], [149, 5, 1, "", 
"ifft"], [150, 5, 1, "", "ifft2"], [151, 5, 1, "", "ifftn"], [152, 5, 1, "", "irfft"], [153, 5, 1, "", "irfft2"], [154, 5, 1, "", "irfftn"], [155, 5, 1, "", "rfft"], [156, 5, 1, "", "rfft2"], [157, 5, 1, "", "rfftn"]], "mlx.core.linalg": [[182, 5, 1, "", "cholesky"], [183, 5, 1, "", "cholesky_inv"], [184, 5, 1, "", "cross"], [185, 5, 1, "", "eigh"], [186, 5, 1, "", "eigvalsh"], [187, 5, 1, "", "inv"], [188, 5, 1, "", "norm"], [189, 5, 1, "", "qr"], [190, 5, 1, "", "svd"], [191, 5, 1, "", "tri_inv"]], "mlx.core.metal": [[208, 5, 1, "", "clear_cache"], [209, 5, 1, "", "device_info"], [210, 5, 1, "", "get_active_memory"], [211, 5, 1, "", "get_cache_memory"], [212, 5, 1, "", "get_peak_memory"], [213, 5, 1, "", "is_available"], [214, 5, 1, "", "reset_peak_memory"], [215, 5, 1, "", "set_cache_limit"], [216, 5, 1, "", "set_memory_limit"], [217, 5, 1, "", "set_wired_limit"], [218, 5, 1, "", "start_capture"], [219, 5, 1, "", "stop_capture"]], "mlx.core.random": [[239, 5, 1, "", "bernoulli"], [240, 5, 1, "", "categorical"], [241, 5, 1, "", "gumbel"], [242, 5, 1, "", "key"], [243, 5, 1, "", "laplace"], [244, 5, 1, "", "multivariate_normal"], [245, 5, 1, "", "normal"], [246, 5, 1, "", "permutation"], [247, 5, 1, "", "randint"], [248, 5, 1, "", "seed"], [249, 5, 1, "", "split"], [250, 5, 1, "", "truncated_normal"], [251, 5, 1, "", "uniform"]], "mlx.nn": [[324, 3, 1, "", "ALiBi"], [325, 3, 1, "", "AvgPool1d"], [326, 3, 1, "", "AvgPool2d"], [327, 3, 1, "", "AvgPool3d"], [328, 3, 1, "", "BatchNorm"], [329, 3, 1, "", "CELU"], [330, 3, 1, "", "Conv1d"], [331, 3, 1, "", "Conv2d"], [332, 3, 1, "", "Conv3d"], [333, 3, 1, "", "ConvTranspose1d"], [334, 3, 1, "", "ConvTranspose2d"], [335, 3, 1, "", "ConvTranspose3d"], [336, 3, 1, "", "Dropout"], [337, 3, 1, "", "Dropout2d"], [338, 3, 1, "", "Dropout3d"], [339, 3, 1, "", "ELU"], [340, 3, 1, "", "Embedding"], [341, 3, 1, "", "GELU"], [342, 3, 1, "", "GLU"], [343, 3, 1, "", "GRU"], [344, 3, 1, "", "GroupNorm"], [345, 3, 1, "", "HardShrink"], 
[346, 3, 1, "", "HardTanh"], [347, 3, 1, "", "Hardswish"], [348, 3, 1, "", "InstanceNorm"], [349, 3, 1, "", "LSTM"], [350, 3, 1, "", "LayerNorm"], [351, 3, 1, "", "LeakyReLU"], [352, 3, 1, "", "Linear"], [353, 3, 1, "", "LogSigmoid"], [354, 3, 1, "", "LogSoftmax"], [355, 3, 1, "", "MaxPool1d"], [356, 3, 1, "", "MaxPool2d"], [357, 3, 1, "", "MaxPool3d"], [358, 3, 1, "", "Mish"], [453, 3, 1, "", "Module"], [379, 3, 1, "", "MultiHeadAttention"], [380, 3, 1, "", "PReLU"], [381, 3, 1, "", "QuantizedEmbedding"], [382, 3, 1, "", "QuantizedLinear"], [383, 3, 1, "", "RMSNorm"], [384, 3, 1, "", "RNN"], [385, 3, 1, "", "ReLU"], [386, 3, 1, "", "ReLU6"], [387, 3, 1, "", "RoPE"], [388, 3, 1, "", "SELU"], [389, 3, 1, "", "Sequential"], [390, 3, 1, "", "SiLU"], [391, 3, 1, "", "Sigmoid"], [392, 3, 1, "", "SinusoidalPositionalEncoding"], [393, 3, 1, "", "Softmax"], [394, 3, 1, "", "Softmin"], [395, 3, 1, "", "Softplus"], [396, 3, 1, "", "Softshrink"], [397, 3, 1, "", "Softsign"], [398, 3, 1, "", "Step"], [399, 3, 1, "", "Tanh"], [400, 3, 1, "", "Transformer"], [401, 3, 1, "", "Upsample"], [410, 3, 1, "", "celu"], [411, 3, 1, "", "elu"], [412, 3, 1, "", "gelu"], [413, 3, 1, "", "gelu_approx"], [414, 3, 1, "", "gelu_fast_approx"], [415, 3, 1, "", "glu"], [416, 3, 1, "", "hard_shrink"], [417, 3, 1, "", "hard_tanh"], [418, 3, 1, "", "hardswish"], [419, 3, 1, "", "leaky_relu"], [420, 3, 1, "", "log_sigmoid"], [421, 3, 1, "", "log_softmax"], [436, 3, 1, "", "mish"], [437, 3, 1, "", "prelu"], [306, 5, 1, "", "quantize"], [438, 3, 1, "", "relu"], [439, 3, 1, "", "relu6"], [440, 3, 1, "", "selu"], [441, 3, 1, "", "sigmoid"], [442, 3, 1, "", "silu"], [443, 3, 1, "", "softmax"], [444, 3, 1, "", "softmin"], [445, 3, 1, "", "softplus"], [446, 3, 1, "", "softshrink"], [447, 3, 1, "", "step"], [448, 3, 1, "", "tanh"], [307, 5, 1, "", "value_and_grad"]], "mlx.nn.Module": [[359, 4, 1, "", "apply"], [360, 4, 1, "", "apply_to_modules"], [361, 4, 1, "", "children"], [362, 4, 1, "", "eval"], [363, 4, 
1, "", "filter_and_map"], [364, 4, 1, "", "freeze"], [365, 4, 1, "", "leaf_modules"], [366, 4, 1, "", "load_weights"], [367, 4, 1, "", "modules"], [368, 4, 1, "", "named_modules"], [369, 4, 1, "", "parameters"], [370, 4, 1, "", "save_weights"], [371, 4, 1, "", "set_dtype"], [372, 6, 1, "", "state"], [373, 4, 1, "", "train"], [374, 4, 1, "", "trainable_parameters"], [375, 6, 1, "", "training"], [376, 4, 1, "", "unfreeze"], [377, 4, 1, "", "update"], [378, 4, 1, "", "update_modules"]], "mlx.nn.init": [[402, 5, 1, "", "constant"], [403, 5, 1, "", "glorot_normal"], [404, 5, 1, "", "glorot_uniform"], [405, 5, 1, "", "he_normal"], [406, 5, 1, "", "he_uniform"], [407, 5, 1, "", "identity"], [408, 5, 1, "", "normal"], [409, 5, 1, "", "uniform"]], "mlx.nn.losses": [[422, 3, 1, "", "binary_cross_entropy"], [423, 3, 1, "", "cosine_similarity_loss"], [424, 3, 1, "", "cross_entropy"], [425, 3, 1, "", "gaussian_nll_loss"], [426, 3, 1, "", "hinge_loss"], [427, 3, 1, "", "huber_loss"], [428, 3, 1, "", "kl_div_loss"], [429, 3, 1, "", "l1_loss"], [430, 3, 1, "", "log_cosh_loss"], [431, 3, 1, "", "margin_ranking_loss"], [432, 3, 1, "", "mse_loss"], [433, 3, 1, "", "nll_loss"], [434, 3, 1, "", "smooth_l1_loss"], [435, 3, 1, "", "triplet_loss"]], "mlx.optimizers": [[456, 3, 1, "", "AdaDelta"], [457, 3, 1, "", "Adafactor"], [458, 3, 1, "", "Adagrad"], [459, 3, 1, "", "Adam"], [460, 3, 1, "", "AdamW"], [461, 3, 1, "", "Adamax"], [462, 3, 1, "", "Lion"], [475, 3, 1, "", "Optimizer"], [467, 3, 1, "", "RMSprop"], [468, 3, 1, "", "SGD"], [308, 5, 1, "", "clip_grad_norm"], [469, 5, 1, "", "cosine_decay"], [470, 5, 1, "", "exponential_decay"], [471, 5, 1, "", "join_schedules"], [472, 5, 1, "", "linear_schedule"], [473, 5, 1, "", "step_decay"]], "mlx.optimizers.Optimizer": [[463, 4, 1, "", "apply_gradients"], [464, 4, 1, "", "init"], [465, 6, 1, "", "state"], [466, 4, 1, "", "update"]], "mlx.utils": [[309, 5, 1, "", "tree_flatten"], [310, 5, 1, "", "tree_map"], [311, 5, 1, "", 
"tree_map_with_path"], [312, 5, 1, "", "tree_reduce"], [313, 5, 1, "", "tree_unflatten"]]}, "objnames": {"0": ["cpp", "function", "C++ function"], "1": ["cpp", "functionParam", "C++ function parameter"], "2": ["cpp", "templateParam", "C++ template parameter"], "3": ["py", "class", "Python class"], "4": ["py", "method", "Python method"], "5": ["py", "function", "Python function"], "6": ["py", "property", "Python property"]}, "objtypes": {"0": "cpp:function", "1": "cpp:functionParam", "2": "cpp:templateParam", "3": "py:class", "4": "py:method", "5": "py:function", "6": "py:property"}, "terms": {"": [0, 1, 2, 4, 5, 6, 47, 51, 62, 94, 114, 116, 147, 148, 150, 151, 153, 154, 156, 157, 164, 183, 188, 190, 193, 206, 230, 236, 240, 259, 262, 263, 279, 281, 298, 299, 300, 302, 307, 323, 326, 327, 343, 349, 356, 357, 363, 364, 366, 370, 371, 372, 376, 384, 455, 464, 465, 477, 480, 482, 485, 486, 487, 488], "0": [0, 1, 2, 4, 5, 6, 8, 9, 14, 18, 38, 45, 46, 49, 66, 71, 75, 80, 83, 95, 98, 99, 100, 101, 102, 103, 104, 117, 118, 140, 142, 145, 158, 162, 164, 185, 187, 188, 189, 191, 208, 215, 217, 224, 231, 239, 243, 245, 246, 251, 255, 259, 274, 278, 279, 293, 295, 296, 297, 298, 299, 302, 308, 309, 311, 312, 323, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 341, 344, 345, 348, 350, 351, 355, 356, 357, 380, 385, 387, 392, 396, 398, 400, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 413, 414, 416, 417, 418, 419, 422, 424, 426, 427, 431, 434, 435, 437, 438, 439, 440, 446, 447, 450, 453, 456, 457, 459, 460, 461, 462, 464, 467, 468, 469, 470, 471, 472, 473, 477, 480, 481, 482, 483, 484, 485, 486, 487], "00005": 4, "0001": 392, "0005": 413, "001": 457, "00364": 4, "01": [4, 351, 419, 460], "0137595": 405, "015": 414, "0184009": 406, "02264": 404, "024": 482, "02765": 405, "0300242": 406, "044715": [341, 413], "0485873": 424, "05": [16, 171, 328, 344, 348, 350, 383], "0507": 440, "05202": 5, "06": [425, 435, 456], "0638": 431, "06450": 350, 
"0645099": 408, "06561": 470, "06675": 462, "07467": 383, "08": [16, 171, 423, 458, 459, 460, 461, 467], "08022": 348, "081": 473, "08415": 414, "08494": 344, "08619": 406, "08681": [358, 436], "09864": 5, "0999938": 471, "0999961": 469, "0f": 0, "1": [0, 1, 2, 3, 5, 6, 14, 18, 28, 29, 38, 46, 49, 98, 99, 100, 101, 102, 103, 104, 117, 118, 139, 142, 145, 146, 147, 149, 150, 152, 153, 154, 155, 156, 157, 158, 167, 170, 177, 183, 184, 185, 186, 188, 189, 203, 207, 216, 230, 232, 236, 240, 243, 244, 245, 251, 268, 273, 286, 292, 293, 298, 308, 311, 312, 316, 323, 325, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 341, 342, 343, 344, 348, 349, 350, 352, 355, 380, 383, 384, 387, 391, 392, 398, 401, 403, 404, 405, 406, 407, 408, 409, 410, 411, 413, 414, 415, 417, 420, 421, 422, 423, 424, 425, 426, 427, 428, 430, 431, 433, 434, 435, 440, 441, 443, 444, 445, 447, 450, 453, 455, 456, 457, 458, 459, 460, 461, 462, 464, 467, 468, 469, 470, 471, 472, 473, 480, 481, 482, 483, 485, 486, 487, 488], "10": [0, 3, 5, 6, 195, 259, 264, 310, 323, 366, 450, 471, 473, 480, 481, 483], "100": [2, 4, 5, 422, 472, 480, 482, 484, 488], "1000": [469, 480], "10000": 387, "101": 472, "1024": [1, 5], "105361": 422, "109": 2, "10_000": 4, "10x": 462, "11": 188, "114": 2, "12": [5, 167, 471], "1212": 456, "12451": 404, "128": [264, 323], "13": 8, "14": 8, "15": [1, 8, 188, 217, 312, 480], "150594": 403, "15268": 405, "16": [1, 142, 316, 325, 327, 348, 355, 357, 359, 453], "1606": 414, "1607": [348, 350], "16384": 167, "16506": 406, "17": 8, "177208": 405, "1803": 344, "1908": [358, 436], "1910": 383, "191107": 403, "1985": 188, "1_000": 4, "1d": [0, 98, 102, 105, 262, 287], "1e": [0, 4, 6, 16, 171, 328, 344, 348, 350, 351, 383, 423, 425, 435, 455, 456, 457, 458, 459, 460, 461, 464, 467, 469, 470, 471, 472, 473], "1e3": 480, "1st": 236, "2": [0, 1, 2, 4, 5, 6, 38, 99, 103, 117, 118, 134, 147, 150, 152, 153, 154, 155, 156, 157, 158, 167, 177, 182, 183, 184, 185, 186, 187, 188, 189, 
190, 191, 197, 203, 236, 244, 249, 290, 293, 295, 296, 297, 308, 312, 316, 323, 325, 326, 327, 331, 334, 341, 351, 355, 356, 357, 383, 392, 401, 402, 403, 404, 405, 406, 407, 408, 409, 413, 424, 425, 427, 434, 435, 450, 453, 455, 456, 458, 459, 460, 464, 467, 480, 481, 482, 483, 484, 485, 486, 487, 488], "20": [167, 188], "200": [5, 471, 482], "2002": 5, "2011": 458, "2012": [456, 467], "2015": [337, 459, 461], "2019": [5, 460], "2020": 5, "2021": 5, "20397": 422, "20_000": 5, "21": [5, 473], "2104": 5, "223144": 422, "223404": 404, "225": 188, "225763": 431, "2302": 462, "23607": [188, 189], "24": 8, "24264": 188, "247": 5, "25": [380, 401], "25211": 405, "256": [1, 2, 6, 142], "256995": 431, "28": 167, "2d": [0, 99, 103, 118, 236, 328, 337], "2nd": 236, "2x": 485, "3": [0, 1, 2, 5, 8, 100, 104, 158, 177, 184, 185, 186, 188, 189, 308, 312, 327, 332, 335, 341, 357, 401, 404, 406, 413, 418, 457, 462, 477, 480, 483, 485, 486], "30": 457, "3118": 485, "32": [1, 5, 6, 90, 236, 237, 316, 326, 327, 356, 357, 383, 480], "330": 5, "33333": 401, "348587": 424, "363207": 403, "36788": 480, "379159": 404, "380709": 408, "39": 5, "3d": [0, 2, 100, 104, 328, 338, 401], "3f": [2, 6, 480], "3x": 2, "4": [0, 1, 2, 5, 116, 142, 145, 158, 163, 188, 236, 237, 264, 306, 312, 316, 325, 326, 327, 328, 348, 355, 356, 357, 381, 382, 400, 401, 403, 404, 405, 422, 480, 481, 483, 486, 488], "4096": [480, 482, 488], "40x": 1, "41421": 188, "417497": 409, "42": 313, "437": 5, "44": 5, "447214": 189, "458835": 405, "475": 5, "48095": 403, "4d": [1, 401], "4m": 1, "5": [0, 1, 2, 4, 5, 8, 188, 216, 239, 312, 325, 328, 336, 337, 338, 341, 345, 348, 355, 396, 401, 402, 405, 406, 413, 416, 434, 446, 450, 455, 467, 469, 470, 480, 482, 483], "50": [0, 192], "500": [5, 488], "5000": 2, "510826": 422, "512": [2, 3, 5, 400, 488], "534422": 408, "539245": 422, "53947": 403, "55": 1, "5701": 456, "573409": 431, "57771": 189, "579": 5, "5f": 4, "6": [1, 2, 5, 188, 264, 386, 400, 404, 413, 414, 418, 425, 
435, 439, 467, 480, 483, 486], "61278": 403, "617261": 409, "628": 5, "633": 5, "639": 482, "64": [0, 1, 90, 116, 163, 236, 237, 306, 316, 381, 382], "64331": 406, "666329": 406, "66667": 401, "67326": 440, "676": 1, "690": 5, "6967": 405, "7": [2, 5, 188, 236, 483], "702": [341, 414], "707107": 185, "71828": 480, "74166": 188, "74597": 188, "75": 401, "75596": 431, "75787": 405, "765166": 431, "773433": 431, "776856": 404, "793615": 406, "79854": 406, "7b": 5, "7m": 1, "8": [0, 1, 2, 5, 8, 188, 236, 316, 326, 327, 348, 356, 357, 400, 423, 456, 457, 458, 459, 460, 461, 467, 480, 483, 486, 488], "8192": [5, 167], "84804": 188, "863726": 409, "883935": 409, "890597": 404, "894427": 189, "89613": 403, "8gb": 5, "8x": 1, "9": [8, 188, 424, 456, 459, 460, 461, 462, 464, 470, 473, 485], "90041": 404, "912766": 404, "916291": 422, "95": 6, "982273": 408, "99": [462, 467], "995016": 403, "999": [459, 460, 461], "A": [0, 2, 5, 7, 8, 9, 68, 82, 94, 141, 142, 143, 145, 164, 177, 178, 183, 185, 186, 188, 189, 190, 193, 202, 203, 204, 209, 220, 236, 239, 240, 241, 243, 244, 245, 246, 247, 250, 251, 274, 278, 281, 298, 301, 302, 306, 307, 308, 309, 310, 311, 312, 313, 314, 323, 328, 337, 343, 344, 348, 350, 363, 367, 368, 371, 377, 378, 383, 389, 392, 400, 403, 404, 406, 414, 435, 436, 453, 455, 459, 461, 463, 464, 466, 471, 480, 481, 482, 484, 485], "AS": 162, "And": [5, 401], "As": [6, 38, 286, 323], "At": 93, "But": 488, "By": [5, 306, 371, 422, 482, 485], "For": [0, 1, 2, 5, 8, 38, 145, 162, 177, 188, 236, 313, 323, 328, 337, 341, 359, 364, 373, 376, 382, 387, 392, 401, 403, 404, 405, 406, 422, 450, 455, 477, 480, 481, 482, 483, 484, 485, 486, 487, 488], "If": [0, 1, 2, 5, 8, 15, 16, 17, 18, 26, 27, 28, 29, 78, 82, 83, 93, 95, 105, 108, 109, 110, 111, 117, 118, 121, 122, 123, 125, 126, 127, 136, 141, 144, 155, 156, 157, 160, 161, 164, 171, 182, 183, 184, 188, 193, 202, 203, 204, 206, 207, 215, 216, 220, 224, 228, 231, 232, 234, 235, 240, 244, 246, 255, 258, 272, 273, 274, 
279, 283, 285, 286, 287, 290, 292, 293, 298, 299, 302, 304, 306, 310, 312, 328, 330, 331, 332, 333, 334, 335, 344, 350, 352, 364, 366, 376, 382, 384, 387, 389, 392, 401, 422, 424, 435, 457, 480, 481, 482, 484, 487, 488, 489], "In": [0, 1, 2, 5, 6, 38, 145, 203, 236, 310, 323, 337, 344, 453, 456, 458, 459, 461, 462, 463, 479, 480, 481, 482, 484, 487, 488], "It": [2, 5, 8, 126, 164, 267, 298, 308, 312, 323, 378, 382, 463, 475, 485, 487], "Its": 323, "No": [2, 5, 185, 186], "Not": [94, 227, 480], "ON": [3, 8], "Of": 482, "On": [1, 480, 482, 484], "One": [146, 149, 155, 231, 260, 480, 482], "THE": 8, "That": 5, "The": [0, 1, 2, 3, 5, 6, 7, 8, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 37, 47, 51, 61, 62, 68, 78, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 178, 179, 180, 181, 184, 185, 186, 188, 189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 211, 212, 215, 216, 217, 218, 220, 221, 222, 223, 225, 227, 228, 229, 230, 231, 232, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 262, 263, 268, 269, 270, 271, 272, 273, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 316, 318, 325, 326, 327, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 340, 342, 343, 344, 348, 349, 350, 352, 355, 356, 357, 359, 360, 364, 366, 370, 371, 372, 373, 376, 377, 378, 379, 381, 382, 383, 384, 387, 389, 392, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 
409, 415, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 447, 450, 453, 455, 456, 457, 458, 459, 460, 461, 462, 465, 467, 468, 469, 472, 475, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489], "Then": [4, 8], "There": [1, 2, 323, 401, 480], "These": [1, 2, 94, 235, 287, 424, 488], "To": [0, 2, 3, 4, 5, 6, 8, 215, 323, 450, 455, 480, 481, 482, 486], "With": 2, "_": [1, 3, 4, 5, 311, 323, 469, 470, 471, 472, 473, 477, 480, 484, 488], "__call__": [1, 5, 6, 323, 453], "__init__": [2, 5, 6, 9, 10, 11, 30, 112, 120, 314, 323, 453], "__main__": [2, 5], "__name__": [2, 5], "_a": 2, "_ext": 2, "_f": 188, "_in": [403, 404], "_out": [403, 404], "_p": 435, "_val": 417, "a1": 162, "a2": 162, "a_": 188, "a_max": [0, 93], "a_min": [0, 93], "a_ndim": 1, "a_shap": 1, "a_strid": 1, "a_view": 485, "ab": [0, 16, 171, 188, 298, 344, 348, 350, 358, 383, 414, 436, 480], "abil": 481, "abl": [2, 236], "about": [1, 2, 5, 6, 131, 209, 484, 488], "abov": [1, 2, 5, 236, 296, 323, 401, 460, 481, 482, 483, 484, 488], "absolut": [0, 12, 16, 171, 413, 414, 434], "acc": 312, "acceler": [2, 328], "access": [0, 5, 50, 323, 453, 464, 481, 484, 488], "accord": [0, 241, 303, 306, 379, 403, 404, 405, 406], "accordingli": 2, "accumul": [312, 383], "accuraci": 6, "accustom": 5, "achiev": [323, 481], "across": [1, 2, 8, 344, 481], "act": [2, 430], "action": 323, "activ": [2, 8, 210, 337, 398, 400, 416, 436, 446, 447, 449, 480], "actual": [5, 18, 366, 453, 484], "ad": [0, 1, 2, 4, 8, 141, 348, 453, 456, 457, 458, 459, 460, 461, 467, 481, 484, 487], "adadelta": 455, "adafactor": 455, "adagrad": 455, "adam": [455, 461, 462, 471, 472], "adamax": 455, "adamw": [455, 462], "adapt": [456, 457, 458, 481], "add": [0, 1, 2, 3, 5, 14, 38, 138, 198, 231, 236, 330, 331, 332, 333, 334, 335, 482, 488], "add_argu": 5, "add_depend": 2, "add_librari": 2, "addit": [0, 2, 5, 8, 13, 14, 141, 143, 145, 193, 328, 344, 350, 379, 383, 453, 482], "addmm": 0, "address": 2, "adjac": 337, "advanc": [5, 480], 
"advantag": 488, "advis": 485, "affin": [328, 344, 348, 350, 352, 382], "after": [2, 5, 6, 28, 158, 160, 163, 208, 232, 236, 328, 344, 350, 359, 360, 364, 366, 373, 376, 377, 378, 379, 400, 434, 480, 488], "after_1": 231, "after_2": 231, "after_i": 231, "after_n": 231, "afternoon": 5, "again": [5, 8, 323, 480], "against": 0, "aggreg": 379, "ago": 5, "ai": 112, "ainv": [187, 191], "albeit": 488, "algebra": 7, "algorithm": [401, 462], "alia": [96, 97, 341], "alibi": 323, "align": [183, 236, 343, 349], "align_corn": 401, "all": [0, 1, 2, 3, 6, 8, 16, 28, 38, 84, 85, 86, 94, 99, 100, 101, 103, 104, 112, 121, 122, 123, 140, 148, 151, 154, 157, 162, 163, 190, 203, 231, 232, 258, 277, 306, 323, 359, 360, 364, 367, 368, 369, 374, 376, 379, 392, 400, 401, 450, 453, 475, 477, 480, 483, 484, 486, 489], "all_avg": 481, "all_reduce_grad": 481, "all_sum": 481, "allclos": [0, 1, 142], "alloc": [2, 211, 215, 216, 453], "allow": [0, 1, 2, 177, 308, 323, 378, 453, 475, 481, 483, 486], "allow_col_major": 0, "almost": 5, "alon": [2, 485], "along": [0, 2, 26, 27, 94, 95, 108, 109, 110, 111, 121, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 162, 163, 167, 184, 188, 235, 246, 255, 258, 272, 274, 278, 286, 287, 290, 291, 292, 293, 300, 323, 342, 384, 415], "alpha": [0, 2, 14, 236, 329, 339, 410, 411, 435, 437, 440, 460, 467], "alpha_": 2, "alreadi": [2, 3, 5, 481], "also": [0, 1, 2, 5, 6, 7, 8, 11, 13, 87, 88, 89, 119, 128, 129, 133, 148, 151, 154, 157, 165, 166, 179, 180, 181, 198, 205, 221, 223, 227, 233, 236, 254, 257, 282, 306, 307, 318, 323, 363, 377, 379, 381, 382, 390, 412, 440, 442, 449, 455, 480, 481, 482, 483, 484, 485, 486, 489], "altern": 477, "alwai": [1, 83, 210, 309, 482], "am": 5, "among": 2, "amount": [5, 212, 325, 355], "amus": 5, "an": [0, 1, 2, 3, 5, 6, 8, 10, 15, 17, 30, 84, 85, 86, 91, 98, 99, 100, 101, 102, 103, 104, 120, 125, 126, 127, 136, 140, 141, 145, 158, 161, 168, 172, 182, 188, 193, 216, 217, 222, 228, 229, 231, 234, 235, 236, 237, 246, 255, 
256, 258, 259, 274, 277, 284, 286, 287, 290, 291, 295, 302, 304, 305, 309, 310, 311, 312, 323, 336, 341, 344, 349, 350, 352, 359, 379, 380, 382, 384, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 413, 437, 450, 455, 456, 466, 470, 475, 477, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489], "anaconda": 481, "anchor": 435, "angl": [115, 238, 351], "angular": [144, 387], "ani": [0, 1, 2, 5, 7, 18, 94, 309, 310, 311, 312, 313, 323, 341, 359, 360, 363, 372, 382, 400, 401, 450, 472, 479, 480, 482, 484, 486, 487, 488], "anonym": 480, "anoth": [0, 93, 177, 203, 282, 303, 316, 323, 359, 480, 482, 483, 488], "anwywher": 8, "anyhow": 5, "anymor": 5, "anyth": [5, 298, 484], "anytim": 484, "api": [1, 2, 341, 481, 482], "app": 8, "append": [5, 203, 480, 484], "appl": [2, 5, 7, 8, 488], "appli": [0, 38, 144, 145, 162, 190, 310, 311, 312, 323, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 337, 338, 339, 341, 342, 344, 345, 346, 347, 348, 350, 351, 352, 353, 354, 355, 356, 357, 358, 360, 373, 380, 382, 383, 384, 385, 386, 388, 390, 391, 393, 394, 395, 396, 397, 398, 399, 401, 410, 411, 412, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 450, 463, 466, 472, 475, 480, 481], "applic": [3, 8], "apply_fn": 360, "apply_gradi": 455, "apply_to_modul": [323, 364], "approach": [430, 482], "appropri": [2, 480], "approx": 341, "approxim": [16, 341, 412, 413, 414], "ar": [0, 1, 2, 4, 5, 6, 7, 8, 16, 18, 82, 90, 91, 93, 94, 101, 105, 112, 118, 125, 126, 136, 140, 142, 145, 147, 148, 150, 151, 153, 154, 156, 157, 158, 163, 164, 171, 172, 173, 174, 175, 176, 177, 178, 185, 186, 188, 189, 193, 203, 216, 230, 231, 232, 236, 237, 239, 240, 241, 246, 247, 250, 251, 258, 264, 265, 277, 278, 286, 298, 301, 302, 306, 309, 310, 316, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 344, 348, 350, 352, 366, 379, 382, 401, 422, 424, 425, 449, 453, 455, 462, 464, 479, 
480, 481, 482, 483, 484, 485, 486, 487, 488], "arang": [0, 1, 188, 246, 316, 401, 483, 485], "arbitrari": [309, 453], "arbitrarili": [1, 94, 323, 479, 482, 486], "arc": 0, "arcco": 0, "arccosh": 0, "architectur": [5, 8, 209, 323, 378, 488], "archiv": 487, "arcsin": 0, "arcsinh": 0, "arctan": 0, "arctan2": 0, "arctanh": 0, "arg": [2, 5, 10, 18, 120, 136, 264, 265], "arg1": 177, "arg2": 177, "argmax": [0, 6], "argmin": 0, "argnam": [164, 298], "argnum": [2, 164, 298, 482], "argpars": 5, "argpartit": 0, "argsort": 0, "argument": [1, 31, 65, 79, 94, 136, 164, 298, 310, 311, 312, 323, 401, 477, 481, 482, 487, 488, 489], "argumentpars": 5, "ari": [84, 85, 86], "aris": 485, "arm": 8, "arm64": 8, "around": 5, "arr": [0, 261, 483], "arr_0": 487, "arrai": [0, 1, 2, 5, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 220, 221, 222, 223, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 282, 283, 284, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 308, 323, 328, 349, 359, 366, 369, 374, 380, 401, 402, 403, 404, 405, 406, 407, 408, 409, 415, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 
447, 450, 453, 456, 457, 458, 459, 460, 461, 462, 467, 468, 469, 470, 471, 472, 473, 480, 481, 482, 484, 485, 486, 488], "array_equ": [0, 16, 171], "arrayfir": 7, "arxiv": [5, 344, 348, 350, 358, 383, 414, 436, 456, 462], "as_strid": 0, "ascend": [185, 186], "ask": [5, 481], "assert": [1, 2, 142], "assign": [0, 2, 38, 453], "associ": [2, 264, 265, 484], "assum": [0, 2, 5, 90, 184, 185, 186, 189, 310, 323, 344], "astyp": [0, 1, 2, 5, 142, 359, 485], "atleast": 0, "atleast_1d": 0, "atleast_2d": 0, "atleast_3d": 0, "atol": [0, 16, 171], "atom": [1, 142], "atomic_fetch_add_explicit": 1, "atomic_output": [1, 142], "attach": 2, "attempt": 94, "attend": 379, "attent": [145, 364, 379, 392, 400], "attention_norm": 5, "attribut": [1, 9, 10, 11, 30, 314, 372, 453, 475], "audio": 401, "auto": [0, 2, 8], "autom": 482, "automat": [1, 2, 7, 142, 193, 481, 486, 487, 488], "autoregress": 5, "avail": [2, 4, 5, 6, 8, 10, 124, 213, 318, 488], "averag": [325, 326, 327, 456, 457, 459, 460, 461, 481], "avgpool1d": 323, "avgpool2d": 323, "avgpool3d": 323, "avoid": [1, 2, 371, 480], "awai": [2, 5], "awar": [480, 484], "ax": [0, 2, 15, 17, 26, 27, 79, 112, 138, 147, 148, 150, 151, 153, 154, 156, 157, 158, 170, 188, 202, 204, 206, 220, 231, 234, 258, 272, 277, 279, 283, 284, 290, 294, 299, 482], "axes_a": 0, "axes_b": 0, "axi": [0, 2, 5, 6, 15, 17, 26, 27, 28, 29, 33, 34, 35, 36, 41, 42, 43, 44, 56, 57, 58, 59, 63, 71, 74, 75, 76, 80, 95, 108, 109, 110, 111, 118, 121, 138, 141, 143, 146, 149, 152, 153, 154, 155, 156, 157, 158, 167, 184, 188, 202, 204, 206, 220, 222, 231, 232, 234, 235, 240, 246, 255, 258, 272, 273, 274, 277, 278, 279, 283, 284, 286, 287, 291, 292, 293, 294, 299, 300, 302, 325, 326, 327, 342, 355, 356, 357, 384, 415, 421, 423, 424, 428, 433, 435, 443, 444, 483], "axis1": [0, 46, 77, 118, 284, 293], "axis2": [0, 46, 77, 118, 284, 293], "axpbi": 2, "axpby_": 2, "axpby_gener": 2, "axpby_general_": 2, "axpby_impl": 2, "axpby_impl_acceler": 2, "b": [0, 1, 2, 3, 5, 13, 14, 16, 24, 
82, 87, 88, 89, 90, 128, 129, 133, 142, 145, 160, 162, 165, 166, 170, 171, 179, 180, 181, 184, 188, 198, 199, 201, 203, 205, 221, 223, 227, 230, 233, 236, 243, 254, 257, 282, 290, 298, 311, 312, 342, 352, 384, 401, 415, 482, 483, 484, 485, 486, 487, 488], "b1": 162, "b2": 162, "b_": [343, 349], "b_stride": 1, "ba": [459, 461], "back": [5, 112, 213, 485], "backend": [1, 8, 123, 124], "backward": [1, 480, 482], "bad": 484, "balanc": 430, "baltimor": 188, "bandwidth": [480, 481], "bar": 481, "base": [0, 2, 144, 188, 195, 197, 233, 387, 400, 453, 455, 461, 475, 477, 480, 483], "base_idx": 1, "basi": 475, "basic": [4, 259, 482], "batch": [5, 14, 90, 145, 162, 163, 203, 244, 328, 330, 331, 332, 333, 334, 335, 337, 338, 343, 349, 379, 384, 401, 484], "batch_idx": 1, "batch_iter": [6, 455], "batch_siz": [6, 455], "batchnorm": 323, "becaus": [5, 210, 323, 484], "been": [0, 2, 5, 211, 484], "befor": [1, 2, 5, 8, 28, 142, 232, 363, 400, 464, 481, 483, 484], "before_1": 231, "before_2": 231, "before_i": 231, "before_n": 231, "beforehand": 230, "beggin": 258, "begin": [83, 183, 212, 236, 343, 349, 398, 416, 427, 434, 440, 446, 447], "behav": 112, "behavior": [244, 430, 483, 484], "behaviour": [112, 182, 183], "behind": 482, "being": [280, 323], "bell": 2, "below": [2, 8, 188, 295, 297, 316, 401, 484], "bench": 2, "benchmark": [2, 480], "benefici": [337, 338, 484], "best": 481, "beta": [0, 2, 14, 116, 236, 328, 344, 348, 350, 434, 455, 459, 460, 461, 462], "beta_": 2, "beta_1": [457, 459, 460, 461, 462], "beta_2": [459, 460, 461, 462], "better": [482, 488], "between": [0, 2, 7, 93, 158, 400, 423, 426, 427, 430, 471, 481, 484, 485, 488], "beyond": [258, 469, 472], "bfloat16": [2, 11, 167, 316, 485], "bfloat16_t": 2, "bia": [5, 116, 141, 163, 236, 237, 310, 323, 330, 331, 332, 333, 334, 335, 343, 349, 350, 352, 364, 366, 376, 379, 382, 384, 459, 460, 461, 464, 482], "bias": [0, 116, 163, 236, 237, 343, 349, 364, 376, 379], "bicub": 401, "big": [1, 480], "bigger": [5, 457], 
"bilinear": [1, 401], "binari": [193, 261, 262, 263, 264, 265, 300, 398, 422, 447, 480], "binary_cross_entropi": [323, 480], "bit": [0, 116, 163, 179, 236, 237, 257, 306, 316, 359, 381, 382, 383], "bitwis": [0, 87, 88, 89, 179, 257], "bitwise_and": 0, "bitwise_or": 0, "bitwise_xor": 0, "block": [0, 2, 5, 90, 400], "block_masked_mm": 0, "block_siz": [0, 90], "bn": 328, "bodi": [1, 142], "bool": [0, 1, 2, 15, 16, 17, 26, 27, 33, 34, 35, 36, 41, 42, 43, 44, 56, 57, 58, 59, 63, 75, 76, 78, 80, 82, 94, 101, 108, 109, 110, 111, 123, 124, 142, 144, 163, 171, 177, 182, 183, 188, 191, 193, 202, 204, 206, 207, 213, 216, 220, 234, 237, 279, 283, 299, 328, 330, 331, 332, 333, 334, 335, 343, 344, 348, 349, 350, 352, 359, 363, 364, 366, 371, 373, 376, 379, 382, 384, 387, 392, 400, 401, 422, 425, 457, 468], "bool_": [11, 316], "boolean": [0, 16, 82, 171, 172, 173, 174, 175, 176, 177, 199, 200, 201, 316, 375, 483], "both": [1, 2, 13, 87, 88, 89, 128, 129, 133, 165, 166, 177, 179, 180, 181, 188, 198, 205, 221, 223, 227, 233, 240, 254, 257, 282, 306, 325, 326, 327, 348, 349, 355, 356, 357, 455, 480, 481, 482, 486, 488], "bottom": 401, "bound": [0, 247, 250, 251, 341, 409, 480, 483, 488], "boundari": 471, "bracket": 5, "brain": 316, "break": 485, "bregler": 337, "broadcast": [0, 2, 13, 16, 87, 88, 89, 91, 93, 128, 129, 133, 145, 161, 165, 166, 171, 179, 180, 181, 198, 203, 205, 221, 223, 227, 233, 235, 239, 240, 244, 250, 251, 254, 257, 282, 287, 303, 379], "broadcast_arrai": [0, 2], "broadcast_to": 0, "broadcasted_input": 2, "brought": 7, "btl_tcp_link": 481, "buffer": [1, 2, 210, 485], "bui": 5, "build": [3, 5, 7, 405, 453, 480], "build_ext": [2, 8], "build_shared_lib": [2, 8], "built": [1, 2, 8, 484], "bundl": 5, "byte": [51, 61, 210, 211, 212, 215, 216, 217, 316], "c": [0, 1, 2, 5, 14, 188, 328, 330, 331, 332, 333, 334, 335, 337, 338, 348, 349, 485, 486, 488], "c_": [349, 462], "c_in": [98, 99, 100, 101, 102, 103, 104], "c_out": [98, 99, 100, 101, 102, 103, 104], "c_pad": 1, 
"c_t": [349, 462], "cach": [5, 8, 208, 210, 211, 215, 480], "calcul": [188, 422, 425, 431, 457], "call": [2, 3, 5, 6, 31, 126, 160, 208, 212, 323, 340, 364, 376, 381, 389, 453, 455, 464, 480, 481, 482, 484], "callabl": [94, 112, 142, 164, 178, 298, 301, 302, 306, 307, 309, 310, 311, 312, 359, 360, 363, 371, 384, 389, 400, 402, 403, 404, 405, 406, 407, 408, 409, 456, 457, 458, 459, 460, 461, 462, 467, 468, 469, 470, 471, 472, 473], "can": [1, 2, 3, 5, 7, 8, 13, 18, 65, 79, 83, 87, 88, 89, 94, 118, 119, 120, 128, 129, 133, 136, 145, 165, 166, 179, 180, 181, 188, 198, 205, 217, 221, 223, 227, 233, 239, 240, 247, 250, 251, 254, 257, 262, 282, 293, 298, 312, 323, 326, 327, 340, 341, 356, 357, 363, 376, 381, 389, 401, 424, 450, 453, 455, 463, 464, 477, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489], "cannot": [5, 93, 483, 485], "captur": [2, 3, 94, 218, 219, 323, 480], "care": [5, 484], "carri": 2, "cartesian": 207, "case": [2, 5, 121, 122, 123, 125, 126, 127, 148, 151, 152, 154, 155, 156, 157, 158, 182, 183, 184, 185, 186, 187, 189, 190, 191, 203, 256, 277, 326, 327, 337, 356, 357, 398, 416, 434, 440, 446, 447, 463, 464, 480, 482, 486, 487, 488, 489], "cast": [2, 37, 155, 156, 157, 193, 359, 371, 485], "caster": 2, "categor": 5, "categori": [11, 177, 316], "catlas_saxpbi": 2, "caus": [323, 480, 484], "causal": 5, "caution": 83, "cd": [3, 8], "cdf": [241, 341, 412], "cdot": [414, 423, 426, 442], "ceil": 0, "ceildiv": 1, "cell": 349, "celu": 323, "certain": [2, 373, 480], "chang": [83, 94, 267, 300, 377, 382, 401, 427, 434, 480, 485], "channel": [1, 98, 99, 100, 101, 102, 103, 104, 328, 330, 331, 332, 333, 334, 335, 337, 338], "channel_idx": 1, "charact": 309, "check": [0, 2, 8, 82, 124, 177, 185, 186, 213, 366, 482, 483], "checklist": 481, "checkout": [3, 480], "checkpoint": [400, 455], "chen": 462, "child": 378, "children": 323, "chip": 8, "choleski": 183, "choos": [5, 144, 387], "chosen": 131, "clamp": 158, "clang": 8, "clariti": 482, "class": [2, 5, 6, 9, 10, 
11, 30, 112, 120, 314, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 453, 456, 457, 458, 459, 460, 461, 462, 467, 468, 475], "class_pred": 306, "classif": [405, 406], "classifi": 6, "classmethod": [381, 382], "clear": 208, "click": 8, "clip": [0, 308, 422, 457], "clip_threshold": 457, "clipped_grad": 308, "clone": 8, "close": [4, 7, 8, 16, 171], "closer": 310, "cmake": [3, 8], "cmake_arg": 3, "cmake_build_parallel_level": 8, "cmake_build_typ": 8, "cmake_current_list_dir": 2, "cmake_host_system_processor": 8, "cmake_library_output_directori": 2, "cmakebuild": 2, "cmakeextens": 2, "cmakelist": 2, "cmdclass": 2, "co": [0, 2, 112, 392, 482], "code": [1, 142, 480, 481, 484], "coeffici": [2, 456, 457, 459, 460, 461, 462], "col": 295, "col_contigu": 2, "cold": 8, "collect": [2, 310, 311, 479], "column": [2, 140, 168, 185, 236], "com": [8, 481], "combin": [5, 190, 312], "come": [2, 5, 481, 482], "command": [2, 3, 8, 481], "command_buff": 2, "common": [2, 455, 480, 484], "commonli": [6, 377, 450, 480], "commun": [7, 120, 123, 124], "compar": [2, 82, 480], "comparison": [16, 133, 165, 166, 180, 181, 227], "compat": [5, 145, 240, 244, 341, 487], "compil": [0, 3, 7, 8, 119, 132, 142, 481, 482, 484], "compiled_fun": 480, "compiled_grad_fn": 480, "complet": [4, 5, 8, 216, 377, 378, 482, 488], "complex": [2, 96, 97, 153, 154, 155, 156, 157, 169, 185, 186, 252, 309, 316, 323, 378, 480, 482], "complex64": [2, 11, 316], "complex64_t": 2, "complexflo": 11, "compon": [2, 5], "compos": [7, 323, 480, 482, 486], "composit": 486, "compress": 265, 
"compromis": 5, "comput": [0, 1, 2, 4, 5, 6, 7, 8, 108, 109, 110, 111, 112, 116, 131, 139, 144, 164, 178, 182, 183, 184, 185, 186, 187, 188, 191, 198, 206, 230, 236, 254, 272, 279, 280, 290, 298, 299, 301, 307, 323, 328, 343, 344, 348, 349, 350, 364, 377, 382, 383, 387, 400, 403, 404, 405, 406, 413, 414, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 455, 456, 457, 459, 460, 461, 462, 466, 480, 481, 482, 486, 488], "computation": 484, "compute_encod": 2, "concaten": [0, 5, 121], "concept": 453, "concis": 5, "concret": [2, 343, 349, 352, 384, 484, 488], "conda": [8, 481], "condit": [0, 2, 303, 488], "config": [2, 481], "configu": 455, "configur": [116, 481], "confirm": 481, "confus": 6, "conj": 97, "conjug": [0, 96], "connect": 481, "consecut": [144, 236, 387], "consequ": 5, "consid": [5, 16, 82, 171, 309, 310, 311, 344, 479], "consider": 480, "const": [0, 1, 2, 425], "constant": [0, 2, 5, 8, 141, 143, 231, 323, 328, 344, 350, 383, 425, 435, 467, 469, 480, 485], "constant_valu": 231, "constitut": 310, "construct": [0, 2, 6, 45, 117, 161, 228, 291, 304], "consum": 484, "contain": [2, 5, 8, 28, 29, 68, 94, 118, 131, 152, 153, 154, 162, 163, 185, 188, 199, 200, 201, 236, 274, 303, 308, 323, 363, 365, 366, 372, 400, 431, 450, 453, 480, 481, 482], "content": [8, 363, 480], "context": 281, "contigu": [0, 1, 2, 83, 142], "continu": [329, 410, 482], "contract": [0, 131], "contrast": 460, "contribut": 2, "contriv": [482, 488], "control": [0, 351, 477, 484], "conv": 105, "conv1d": [0, 323], "conv2d": [0, 323], "conv3d": [0, 323], "conv_gener": 0, "conv_transpose1d": 0, "conv_transpose2d": 0, "conv_transpose3d": 0, "conveni": [1, 2, 6, 177], "convent": [18, 105, 130, 131, 401, 460], "convers": 7, "convert": [0, 1, 2, 78, 84, 85, 86, 115, 158, 238, 381, 382, 484, 485, 486], "convolut": [0, 98, 99, 100, 101, 102, 103, 104, 105, 330, 331, 332, 333, 334, 335, 337, 338], "convolv": [98, 99, 100, 101, 102, 103, 104], "convtranspose1d": 323, "convtranspose2d": 
323, "convtranspose3d": 323, "coordin": [0, 207], "copi": [0, 1, 2, 5, 7, 232, 273, 485], "copy_inplac": 2, "copytyp": 2, "core": [1, 2, 3, 4, 5, 6, 306, 323, 325, 326, 327, 328, 348, 355, 356, 357, 366, 369, 371, 374, 401, 402, 403, 404, 405, 406, 407, 408, 409, 422, 424, 431, 450, 453, 455, 480, 481, 485, 486], "corner": 401, "correct": [2, 8, 459, 460, 461, 483, 484], "correctli": 38, "correl": [101, 337], "correspond": [0, 1, 2, 15, 17, 78, 93, 116, 118, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 185, 202, 204, 220, 234, 283, 290, 302, 310, 482], "cos_first": 392, "cosh": [0, 430], "cosin": [0, 19, 20, 106, 107, 423, 469, 471, 482], "cosine_decai": [455, 471], "cosine_similarity_loss": 323, "cost": [8, 457, 481, 484], "costli": 484, "cot": 1, "cot_index": 1, "cotan": [2, 112], "cotang": [1, 2, 112, 301], "could": [5, 323], "count": [323, 471], "counter": 477, "cours": 482, "coursera": 467, "cov": 244, "covari": [244, 328], "cover": 2, "cpp": 2, "cpu": [7, 8, 185, 186, 189, 488], "cpython": 2, "crash": [83, 480], "creat": [0, 2, 5, 8, 83, 123, 140, 168, 281, 323, 453, 455, 471, 480, 483, 485], "create_additive_causal_mask": 5, "criteria": 2, "cross": [6, 101, 422, 424], "cross_entropi": [6, 323], "crowd": 5, "cry": 5, "cubic": 401, "cummax": 0, "cummin": 0, "cumprod": 0, "cumsum": 0, "cumul": [0, 83, 108, 109, 110, 111], "current": [5, 7, 8, 83, 90, 100, 103, 104, 127, 209, 211, 236, 312, 323, 457, 481, 484], "custom": [7, 112, 142, 400], "custom_decod": 400, "custom_encod": 400, "custom_funct": 1, "custom_kernel_myexp_float": 1, "custom_tim": 2, "cvpr": 337, "cycl": 479, "d": [0, 1, 2, 5, 100, 104, 117, 118, 145, 170, 188, 203, 207, 230, 286, 293, 295, 296, 297, 313, 332, 335, 338, 343, 349, 384, 456, 459, 461, 488], "d1": 488, "d2": 488, "d2fdx2": 482, "d_i": 352, "dampen": 468, "darwin": 2, "data": [0, 2, 6, 7, 10, 18, 125, 140, 155, 156, 161, 168, 192, 224, 228, 241, 250, 293, 295, 300, 304, 338, 402, 403, 404, 405, 406, 407, 408, 409, 480, 
481, 483, 485], "dataset": [4, 481, 484], "datatyp": 51, "dbuild_shared_lib": 8, "dcmake_build_typ": 8, "ddof": [0, 75, 80, 279, 299], "deal": 480, "debug": [1, 3, 481], "debugg": 7, "decai": [457, 460, 462, 468, 469, 470, 473], "decay_r": [457, 470, 473], "decay_step": 469, "decent": 6, "decid": [310, 363], "decim": [0, 66, 259], "declar": 2, "decltyp": 1, "decod": 400, "decomposit": [182, 183, 190], "decor": [1, 112], "decoupl": 460, "deep": [328, 403, 404, 405, 406], "def": [1, 2, 4, 5, 6, 112, 142, 298, 323, 453, 480, 481, 482, 483, 484, 485, 488], "default": [1, 2, 8, 14, 15, 16, 17, 18, 26, 27, 28, 29, 82, 83, 90, 94, 95, 98, 99, 100, 101, 102, 103, 104, 112, 113, 114, 116, 117, 118, 121, 122, 123, 125, 126, 127, 140, 142, 144, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 162, 163, 164, 167, 168, 171, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 202, 204, 206, 207, 215, 216, 217, 220, 224, 228, 231, 232, 234, 236, 237, 239, 240, 241, 243, 244, 245, 246, 247, 249, 250, 251, 255, 256, 259, 266, 267, 273, 274, 277, 278, 279, 281, 283, 285, 290, 292, 293, 294, 295, 296, 297, 298, 299, 302, 304, 306, 316, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 339, 342, 343, 345, 348, 349, 351, 352, 355, 356, 357, 359, 364, 366, 371, 373, 376, 379, 380, 381, 382, 384, 387, 392, 396, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 415, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 453, 456, 457, 458, 459, 460, 461, 462, 467, 468, 469, 477, 479, 480, 482, 485, 487, 489], "default_devic": 489, "default_stream": 489, "defin": [1, 2, 4, 5, 6, 8, 112, 126, 142, 163, 184, 188, 237, 306, 309, 485], "definit": [112, 182, 183, 244], "degre": [0, 238, 435], "delta": [427, 456], "delv": [405, 406], "demonstr": 485, "denomin": [348, 423, 456, 458, 459, 460, 461, 467], "dens": [207, 488], "depend": [0, 2, 3, 4, 8, 78, 188, 343, 349, 384, 481, 483, 487, 488], "depth": [309, 327, 332, 335, 338, 357, 482], "dequant": [0, 
236], "deriv": [2, 482, 484], "descend": 361, "descent": [468, 480, 484], "describ": [2, 484], "descript": [2, 5, 316], "design": [1, 4, 7, 477, 488], "destin": [0, 2, 60, 127, 222, 235], "destroi": 480, "detach": 482, "detail": [1, 2, 10, 215, 323, 337, 387, 392, 401, 403, 404, 405, 406, 456, 458, 459, 461, 462, 483, 486], "determin": [0, 2, 118, 244, 312, 316, 370, 487], "dev": [2, 8], "develop": [2, 8], "developer_dir": 8, "deviat": [0, 245, 279, 403, 405, 408], "deviatoin": 0, "devic": [1, 2, 7, 8, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 121, 122, 125, 126, 127, 128, 129, 130, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 209, 216, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 243, 244, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 299, 300, 303, 304, 305, 314, 488, 489], "device_info": 217, "devicetyp": 9, "df": 485, "dfdx": [482, 483], "dft": [146, 147, 148, 149, 150, 151, 155, 156, 157], "dhwc": 338, "diag": [0, 190], "diagon": [0, 45, 117, 140, 293, 295, 296, 297], "dict": [94, 136, 193, 209, 262, 263, 264, 308, 369, 374, 377, 378, 453, 455, 463, 464, 466, 
479, 482, 487], "dict_kei": [310, 464], "dictionari": [5, 94, 193, 209, 262, 263, 308, 309, 312, 323, 363, 372, 377, 378, 465, 479, 487], "did": 5, "diff": 2, "differ": [7, 177, 282, 300, 434, 482], "differenti": [1, 2, 7, 329, 410], "difficult": 482, "difficulti": [403, 404], "dilat": [0, 98, 99, 100, 101, 102, 103, 104, 330, 331, 333, 334], "dim": [1, 5, 144, 145, 340, 344, 348, 350, 379, 381, 383, 387, 392, 400], "dimens": [0, 1, 2, 5, 15, 17, 26, 27, 62, 68, 78, 84, 85, 86, 94, 99, 100, 101, 103, 104, 118, 138, 144, 145, 153, 154, 156, 157, 158, 162, 163, 170, 182, 183, 185, 186, 187, 188, 189, 190, 191, 202, 203, 204, 206, 220, 234, 235, 236, 240, 249, 279, 283, 287, 290, 294, 299, 328, 330, 331, 332, 333, 334, 335, 337, 338, 342, 343, 344, 348, 349, 350, 379, 383, 384, 387, 400, 401, 415, 424, 480, 482], "dimension": [30, 141, 143, 146, 147, 148, 149, 150, 151, 155, 156, 157, 325, 326, 327, 328, 330, 331, 332, 333, 334, 335, 340, 352, 355, 356, 357, 381, 382, 392, 483, 485], "direct": [2, 5, 361, 462, 488], "directli": [2, 5, 83], "directori": [2, 5, 8], "disabl": [119, 215, 480], "disable_compil": 480, "disappoint": 5, "discard": [5, 309], "discov": 8, "discoveri": 462, "discret": [105, 146, 147, 148, 149, 150, 151, 155, 156, 157, 340, 381], "discuss": 2, "disk": 5, "dispatch": 2, "dispatch_thread": 2, "dispatchthread": 1, "displai": 323, "distanc": [5, 435], "distribut": [7, 8, 239, 240, 241, 243, 244, 245, 250, 251, 352, 403, 404, 405, 406, 408, 409, 425, 428, 433, 435, 450], "diverg": 428, "divid": [0, 2, 38, 160, 236, 254, 481], "divis": [0, 128, 160, 236, 254], "divisor": [279, 299], "divmod": 0, "dloss_dw": 482, "dloss_dx": 482, "dlpack": 485, "dlvalu": 298, "dmlx_build_cpu": 8, "dmlx_build_gguf": 8, "dmlx_build_safetensor": 8, "dmlx_metal_debug": 3, "dmlx_metal_jit": 8, "do": [0, 2, 5, 8, 300, 323, 365, 376, 450, 453, 460, 480, 481, 482, 484], "doc": [2, 6, 481], "document": [2, 3, 65, 79, 142, 262, 263, 316, 480, 482, 483], "doe": [0, 2, 3, 5, 8, 
210, 300, 308, 323, 480, 483, 484, 485], "doesn": [2, 323], "domain": [250, 481], "don": [1, 8, 480, 488], "done": [323, 336, 383, 480, 481, 484, 485], "dot": [182, 187, 191, 290, 309, 368, 379], "doubl": [0, 5], "doubt": 5, "down": [5, 308], "downsampl": [325, 326, 327, 355, 356, 357], "dparam": 298, "draw": 240, "drop": 363, "dropout": [323, 337, 338, 373, 400, 480], "dropout2d": 323, "dropout3d": 323, "dst": 127, "dt": 134, "dtype": [0, 1, 2, 5, 11, 18, 30, 37, 38, 78, 81, 125, 126, 140, 142, 158, 161, 168, 177, 185, 186, 188, 189, 192, 228, 241, 243, 244, 245, 247, 250, 251, 293, 295, 300, 304, 316, 371, 401, 402, 403, 404, 405, 406, 407, 408, 409, 422, 424, 431, 469, 470, 471, 472, 473, 480, 481, 482, 483, 485, 486, 487], "dtypecategori": [177, 316], "dual": 430, "duchi": 458, "dure": [3, 94, 336, 337, 338, 401, 485], "dx": 112, "dy": 112, "dyld": 481, "dyld_library_path": 481, "dylib": 2, "dynam": 484, "e": [2, 6, 8, 112, 134, 142, 162, 163, 178, 268, 328, 330, 331, 332, 333, 334, 335, 337, 338, 344, 348, 350, 364, 383, 420, 421, 443, 444, 449, 455, 458, 480, 484, 489], "e5": 316, "e8": 316, "each": [0, 1, 2, 68, 116, 136, 144, 163, 177, 182, 183, 185, 186, 187, 190, 191, 203, 207, 231, 236, 237, 240, 255, 264, 265, 274, 291, 294, 300, 302, 303, 337, 338, 340, 343, 344, 349, 384, 387, 400, 422, 424, 477, 480, 481, 484], "eager": 484, "earli": 337, "earlier": 2, "eas": 5, "easi": [2, 323, 481], "easier": [1, 484], "edg": [93, 231, 401, 480], "edit": [8, 378], "effect": [337, 480, 484], "effici": [5, 7, 162, 337, 387, 484, 486], "eigenvalu": [185, 186], "eigenvector": 185, "einstein": [130, 131], "einsum": 131, "either": [8, 13, 65, 78, 79, 87, 88, 89, 93, 128, 129, 133, 160, 165, 166, 179, 180, 181, 188, 198, 203, 205, 221, 223, 227, 233, 254, 257, 282, 298, 326, 327, 356, 357, 389, 401, 405, 406, 485], "elem": [1, 142], "elem_to_loc": [1, 2], "element": [0, 1, 2, 12, 13, 19, 20, 21, 22, 23, 24, 25, 28, 70, 83, 87, 88, 89, 92, 106, 107, 108, 109, 110, 111, 
116, 128, 129, 133, 134, 135, 137, 139, 140, 159, 160, 163, 165, 166, 171, 172, 173, 174, 175, 176, 179, 180, 181, 194, 195, 196, 197, 198, 199, 200, 201, 205, 207, 221, 223, 225, 227, 232, 233, 236, 237, 253, 254, 255, 257, 258, 260, 268, 269, 270, 271, 275, 276, 282, 286, 288, 289, 292, 298, 300, 303, 329, 336, 337, 338, 343, 347, 349, 358, 380, 384, 387, 391, 410, 417, 418, 420, 421, 436, 437, 439, 442, 443, 444, 445, 480, 482], "elementwis": [1, 96, 97], "elif": 5, "ellipsi": 483, "elman": 384, "els": [0, 2, 5, 323, 364, 481, 484], "elsewher": [295, 483], "elu": [323, 440], "emb": [5, 340, 381, 392], "embed": [5, 306, 323, 381, 387, 392, 423], "empti": [127, 244], "enabl": [3, 5, 8, 94, 132, 468], "encod": [2, 144, 387, 392, 400, 424], "encount": [2, 482], "end": [118, 183, 213, 236, 258, 343, 349, 398, 416, 427, 434, 440, 446, 447, 469, 472], "end_axi": [0, 49, 158], "end_encod": 2, "endif": 2, "endswith": 364, "enhanc": [5, 387, 484], "enjoi": 2, "enough": [2, 484], "ensur": [0, 1, 2, 8, 142, 308, 430, 481], "ensure_row_contigu": [1, 142], "enter": 5, "entir": [15, 17, 26, 27, 202, 204, 206, 220, 234, 279, 283, 299, 337, 338], "entri": [0, 246, 337, 338], "entropi": [6, 422, 424], "enumer": 323, "environ": [8, 119, 132, 481], "ep": [4, 141, 143, 328, 344, 348, 350, 383, 423, 425, 435, 455, 456, 457, 458, 459, 460, 461, 467], "epoch": 6, "epsilon": [328, 344, 348, 350, 383, 423, 425, 456, 458, 459, 460, 461, 467], "epsilon_1": 457, "epsilon_2": 457, "equal": [0, 1, 16, 28, 82, 140, 166, 171, 181, 227, 232, 247, 274, 348, 352], "equal_nan": [0, 16, 82, 171], "equat": [130, 131], "equival": [0, 2, 31, 65, 79, 126, 129, 160, 163, 167, 286, 329, 339, 341, 345, 346, 347, 353, 354, 378, 380, 382, 385, 386, 388, 390, 393, 394, 395, 396, 397, 399], "erf": [0, 135, 480], "erfinv": 0, "error": [0, 2, 8, 123, 134, 135, 216, 217, 274, 341, 412, 413, 414, 430, 432, 482, 485], "error_norm": 4, "estim": [459, 461], "eta": 462, "etc": [2, 236, 323, 401, 481], "eval": [2, 3, 
4, 5, 6, 323, 453, 455, 480, 481, 482, 484, 486], "eval_cpu": 2, "eval_fn": 6, "eval_gpu": 2, "evalu": [2, 5, 6, 7, 127, 136, 178, 301, 323, 362, 373, 453, 455, 480, 486], "even": [1, 2, 5, 94, 480, 484, 485], "evenli": [0, 192], "everi": [236, 310, 455, 473, 482], "everyth": [5, 481], "everywher": 0, "exact": [413, 414], "exactli": [2, 5, 144, 366, 482], "exampl": [0, 3, 4, 5, 6, 8, 18, 38, 112, 142, 145, 158, 177, 185, 186, 188, 189, 281, 286, 308, 311, 312, 323, 325, 326, 327, 328, 348, 355, 356, 357, 364, 366, 373, 376, 401, 402, 403, 404, 405, 406, 407, 408, 409, 422, 424, 431, 450, 455, 464, 469, 470, 471, 472, 473, 477, 482, 483, 484, 485, 486, 487], "exce": 308, "exceed": 216, "except": [7, 140, 152, 153, 155, 156, 157, 344, 366, 483, 485], "exclud": [235, 287], "exclus": [0, 83, 89], "execut": [2, 8, 84, 85, 86, 212, 485, 488], "exist": [2, 3, 5, 364, 376], "exp": [0, 1, 139, 142, 198, 202, 241, 272, 329, 339, 391, 410, 411, 428, 440, 441, 445, 480, 488], "exp_elementwis": [1, 142], "expand_dim": 0, "expect": [2, 5, 330, 331, 332, 333, 334, 335, 336, 337, 338, 392, 400, 425, 480, 483], "expens": 400, "expensive_fun": 484, "experiment": 485, "explain": 2, "explicit": [2, 464, 477, 485], "explicitli": [162, 323, 477], "explor": 8, "expm1": 0, "exponenti": [0, 137, 139, 329, 339, 388, 410, 411, 440, 470], "exponential_decai": 455, "export": 8, "ext_modul": 2, "extend": [2, 231], "extens": [7, 193, 218, 370, 487], "extern": 485, "extra": [1, 310, 311], "extract": [0, 5, 45, 117, 118, 323, 363, 453, 481], "extras_requir": 2, "extrem": [483, 484], "ey": [0, 5, 187, 191], "f": [0, 2, 4, 6, 112, 188, 323, 349, 460, 480, 485], "f_jvp": 112, "f_t": 349, "f_vjp": 112, "f_vmap": 112, "face": 5, "factor": [2, 14, 167, 182, 183, 189, 401, 424, 470, 473], "fall": [2, 112], "fallback": 2, "fals": [0, 1, 2, 5, 15, 16, 17, 26, 27, 33, 34, 35, 36, 41, 42, 43, 44, 56, 57, 58, 59, 63, 75, 76, 80, 82, 94, 101, 108, 109, 110, 111, 123, 142, 171, 177, 182, 183, 188, 191, 193, 
202, 204, 206, 207, 216, 220, 234, 279, 283, 299, 303, 306, 309, 310, 311, 312, 316, 344, 348, 350, 352, 364, 366, 376, 379, 382, 387, 392, 400, 401, 422, 425, 457, 468, 485], "famili": 5, "fan": [403, 404, 405, 406], "fan_in": [403, 404, 405, 406], "fan_out": [403, 404, 405, 406], "far": 455, "fast": [1, 7, 341, 414, 481, 488], "faster": [1, 2, 8, 129, 412, 422, 480, 482], "featur": [1, 7, 98, 99, 100, 101, 102, 103, 104, 144, 328, 343, 344, 348, 349, 350, 352, 382, 383, 384, 387, 400, 401, 480, 484], "feed": 5, "feed_forward": 5, "feedforward": [403, 404], "feel": 5, "fetch": 1, "few": [1, 2, 5, 6, 7, 8, 481, 484, 486], "ffn": 5, "ffn_norm": 5, "fft": 7, "figur": 481, "file": [5, 8, 193, 261, 262, 263, 264, 265, 366, 370, 481, 482, 487], "file_or_weight": 366, "fill": [0, 2, 161, 229, 295, 305, 402, 403, 404, 405, 406, 408, 409], "filter": [0, 105, 330, 331, 332, 333, 334, 335, 359, 363], "filter_and_map": 323, "filter_fn": [359, 363], "final": [2, 4, 5, 6, 167, 469, 472], "find": [2, 4, 8, 481], "find_packag": 2, "finder": 8, "fine": [477, 484], "finetun": 323, "finish": 2, "finit": [0, 172, 224], "first": [0, 1, 2, 3, 4, 5, 6, 8, 118, 121, 158, 164, 177, 179, 190, 199, 201, 203, 232, 249, 257, 284, 290, 293, 298, 309, 311, 312, 323, 326, 327, 344, 356, 357, 401, 423, 431, 457, 459, 460, 461, 464, 480, 482, 485, 488], "first_lay": 484, "fit": [2, 236, 488], "five": 480, "fix": [2, 5, 8, 484], "flag": [2, 8, 480, 485], "flat": [162, 163, 309, 313], "flat_param": 264, "flatten": [0, 28, 29, 108, 109, 110, 111, 188, 230, 232, 235, 255, 258, 273, 286, 287, 292, 309], "flexibl": 7, "flexibli": 378, "flip": [0, 101, 105], "float": [0, 1, 2, 11, 14, 16, 18, 78, 141, 142, 143, 144, 145, 160, 161, 167, 171, 177, 188, 224, 237, 239, 243, 245, 308, 316, 328, 336, 337, 338, 344, 348, 350, 359, 371, 383, 387, 392, 398, 400, 401, 402, 403, 404, 405, 406, 408, 409, 423, 424, 425, 427, 431, 434, 435, 446, 447, 456, 457, 458, 459, 460, 461, 462, 467, 468, 469, 470, 472, 473], 
"float16": [1, 2, 11, 142, 167, 193, 316, 359, 484, 485], "float16_t": [1, 2], "float32": [0, 1, 2, 11, 18, 140, 142, 145, 167, 168, 177, 185, 186, 188, 189, 192, 228, 241, 243, 244, 245, 250, 251, 295, 304, 316, 401, 402, 403, 404, 405, 406, 407, 408, 409, 422, 424, 431, 469, 470, 471, 472, 473, 480, 481, 482, 483, 484, 485, 486, 487], "float64": 177, "floor": [0, 1, 160], "floor_divid": 0, "flow": [0, 280, 484], "flush": 2, "fn": [307, 310, 311, 312, 486], "follow": [1, 2, 5, 6, 7, 8, 18, 105, 116, 145, 162, 188, 231, 236, 311, 323, 413, 414, 428, 456, 457, 458, 459, 460, 461, 462, 468, 477, 480, 481, 482, 488], "foo": 481, "food": 5, "forc": [5, 6, 323, 481, 486], "forg": 8, "formal": [116, 236], "format": [5, 193, 261, 262, 263, 264, 265, 485], "formul": [329, 339], "formula": 434, "forth": 401, "forward": [1, 2, 298, 480, 484], "found": 363, "four": 328, "fourier": [146, 147, 148, 149, 150, 151, 155, 156, 157], "frac": [134, 236, 268, 328, 336, 337, 338, 344, 348, 350, 352, 383, 391, 403, 404, 405, 406, 423, 425, 427, 430, 441, 443, 444, 456, 458, 459, 460, 461, 467], "fraction": 18, "framework": [2, 7], "free": 215, "freez": [323, 376, 453], "freq": 144, "frequenc": [144, 387, 392], "frequent": [480, 484], "friend": 5, "fro": 188, "frobeniu": 188, "from": [0, 1, 2, 5, 6, 7, 83, 115, 116, 118, 121, 122, 125, 126, 127, 142, 153, 154, 156, 157, 161, 162, 167, 188, 193, 203, 207, 212, 215, 229, 236, 238, 239, 240, 241, 242, 243, 247, 250, 264, 277, 280, 282, 286, 287, 292, 293, 303, 305, 309, 310, 311, 312, 313, 323, 352, 364, 366, 379, 403, 404, 405, 406, 408, 409, 425, 434, 450, 455, 479, 480, 481, 482, 484, 485, 486, 487, 488], "from_embed": 381, "from_linear": 382, "front": 2, "frozen": [323, 364, 374, 376, 382, 453], "fuction": 129, "full": [0, 1, 2, 6, 65, 79, 105, 142, 272, 377, 378, 425, 480, 481, 484], "full_turn": 392, "fulli": [2, 7, 481, 485, 488], "fun": [94, 164, 178, 298, 301, 302, 480, 483, 484, 488], "fun1": 484, "func": 384, "function": [0, 1, 
2, 3, 4, 5, 6, 7, 16, 18, 83, 94, 112, 129, 134, 135, 142, 164, 171, 178, 182, 183, 185, 186, 187, 188, 189, 190, 191, 203, 217, 268, 298, 301, 302, 307, 308, 310, 311, 312, 323, 329, 339, 341, 342, 345, 346, 347, 353, 354, 358, 360, 364, 371, 376, 380, 384, 385, 386, 388, 389, 390, 391, 393, 394, 395, 396, 397, 398, 399, 400, 412, 413, 414, 415, 416, 417, 418, 420, 421, 422, 436, 441, 443, 444, 445, 446, 447, 448, 450, 455, 464, 477, 479, 481, 483, 484, 485, 487], "functool": 480, "further": [2, 8, 482], "fuse": [1, 480], "fusibl": 480, "futur": [5, 382, 483, 484], "g": [3, 8, 112, 142, 188, 236, 349, 449, 467, 468, 484, 489], "g_t": [349, 456, 458, 459, 460, 461, 462, 467, 468], "gain": [403, 404, 405, 406], "gamma": [328, 344, 348, 350, 383, 403, 404, 405, 406], "gap": 1, "gate": [342, 343, 415], "gather": [0, 121, 162, 163], "gather_mm": [0, 163], "gather_qmm": 0, "gaurante": 300, "gaussian": [4, 341, 412, 413, 414, 425], "gaussian_nll_loss": 323, "gelu": [323, 413, 414, 480], "gelu_approx": [323, 341, 412], "gelu_fast_approx": [323, 341, 412], "geluapprox": 341, "gelufast": 341, "gener": [0, 1, 2, 3, 4, 11, 18, 101, 140, 142, 153, 154, 192, 207, 239, 244, 245, 246, 247, 250, 251, 400, 477, 480, 483, 484, 489], "general_": 2, "generate_stub": 8, "geq": [398, 447], "get": [2, 4, 6, 8, 99, 100, 101, 103, 104, 113, 114, 209, 210, 211, 212, 242, 323, 480, 482, 484, 488], "get_cache_memori": 208, "get_command_encod": 2, "get_kernel": 2, "gguf": [8, 193, 262, 487], "gh": 1, "gii": 1, "git": 8, "github": [4, 6, 8, 480], "give": [2, 5, 6, 28, 480], "given": [0, 2, 8, 15, 17, 28, 38, 83, 91, 93, 95, 108, 109, 110, 111, 116, 118, 131, 136, 138, 145, 146, 147, 148, 149, 150, 151, 155, 156, 157, 161, 162, 188, 202, 204, 206, 215, 220, 224, 226, 234, 244, 246, 247, 258, 259, 267, 272, 274, 279, 283, 285, 291, 292, 293, 295, 296, 297, 299, 314, 336, 363, 379, 423, 425, 431], "gix": 1, "gix_mult": 1, "giy_mult": 1, "global": [119, 121, 122, 123, 125, 126, 127, 132, 248, 308, 
477, 480], "glorot": [403, 404], "glorot_norm": 323, "glorot_uniform": 323, "glu": [5, 323], "gm": 1, "gn": 1, "go": [2, 5, 482], "golub": 188, "good": [2, 8, 455, 480, 481, 488], "goroshin": 337, "gower": 5, "gpu": [1, 3, 7, 8, 209, 483, 488], "gputrac": [3, 218], "grad": [2, 4, 6, 298, 308, 455, 463, 480, 481, 482, 483, 484, 486], "grad_fn": [4, 480, 482], "gradient": [0, 4, 6, 112, 164, 280, 298, 307, 308, 323, 364, 377, 382, 400, 430, 453, 455, 456, 457, 459, 460, 461, 462, 463, 466, 468, 480, 481, 482, 483, 484, 485, 486], "grain": 477, "graph": [2, 5, 6, 7, 482], "great": 3, "greater": [0, 5, 28, 139, 166, 232, 308, 398, 447], "greater_equ": 0, "grep": 8, "grid": [2, 142, 207], "grid_dim": 2, "grid_grad": 1, "grid_idx": 1, "grid_sampl": 1, "grid_sample_grad": 1, "grid_sample_ref": 1, "grid_sample_vjp": 1, "grid_shap": 1, "grid_siz": 1, "ground": [4, 5, 424, 434], "group": [0, 1, 98, 99, 100, 101, 102, 103, 104, 116, 121, 122, 123, 125, 126, 127, 145, 163, 236, 237, 300, 306, 330, 331, 344, 381, 382, 481], "group_dim": 2, "group_siz": [0, 116, 163, 236, 237, 306, 381, 382], "groupnorm": 323, "grow": 484, "gru": 323, "guid": [2, 7], "gw": 1, "h": [1, 2, 98, 99, 100, 102, 103, 104, 188, 328, 331, 332, 334, 335, 337, 338, 343, 349, 384, 482, 484], "h_": [343, 349, 384], "h_in": 1, "h_stride": 1, "h_t": [343, 349, 384], "ha": [2, 3, 5, 6, 7, 8, 78, 94, 118, 127, 152, 153, 155, 156, 157, 164, 182, 183, 185, 186, 187, 190, 191, 207, 211, 240, 328, 343, 349, 352, 384, 453, 455, 480, 483, 484, 486, 488], "had": 5, "hadamard": [0, 167], "hadamard_transform": 0, "half": [2, 18, 247, 251, 387, 484], "halv": [342, 415], "hand": [5, 482, 484], "handi": 482, "handl": [2, 323, 480], "happen": [2, 5, 141, 400, 455, 480, 484], "happi": 5, "hard": 5, "hard_shrink": [323, 345], "hard_tanh": [323, 346], "hardshrink": [323, 416], "hardswish": 323, "hardtanh": [323, 417], "hat": [116, 236], "have": [0, 1, 2, 5, 8, 16, 82, 84, 85, 86, 90, 121, 145, 153, 154, 156, 157, 163, 171, 203, 
218, 240, 300, 309, 349, 379, 389, 462, 464, 479, 480, 481, 483, 484, 488], "haven": 5, "hazan": 458, "he": [5, 405, 406], "he_norm": 323, "he_uniform": 323, "head": [145, 379, 400], "header": [2, 142], "heart": 5, "heavi": 5, "height": [326, 327, 328, 331, 332, 334, 335, 337, 338, 356, 357], "hello": [309, 313], "help": [2, 5, 480, 488], "helper": [5, 142, 480], "henc": [0, 2, 236, 480], "hendryck": 414, "here": [2, 5, 455, 480, 482, 484, 487, 488], "hermitian": [185, 186], "hf": 349, "hg": 349, "hh": 384, "hi": [5, 349], "hidden": [343, 349, 384, 400], "hidden_dim": [6, 453, 455], "hidden_s": [343, 349, 384], "hierarchi": 316, "high": [247, 251, 323, 340, 409, 450], "high_pad_s": 0, "higher": [2, 170, 217, 431, 482], "highli": 8, "him": 5, "hing": 426, "hinge_loss": 323, "hinton": 467, "hit": 2, "hn": 343, "ho": 349, "hold": [2, 5, 10, 11, 188, 480], "homebrew": 481, "hopkin": 188, "host": 2, "host1": 481, "host2": 481, "host_nam": [1, 2], "hostfil": 481, "hostnam": 481, "hot": 424, "hour": 5, "how": [2, 5, 6, 323, 325, 326, 327, 330, 331, 332, 333, 334, 335, 340, 355, 356, 357, 381, 401, 463, 480, 483, 488], "howev": [2, 112, 323, 341, 344, 464, 477, 480, 481, 484, 485], "hr": 343, "http": [344, 348, 350, 358, 383, 414, 436], "huber": 427, "huber_loss": 323, "human": [405, 406], "hundr": 8, "hurri": 5, "hutter": 460, "hyperbol": [0, 20, 22, 25, 107, 271, 289, 399, 448], "hz": 343, "i": [0, 1, 2, 3, 5, 6, 7, 8, 16, 18, 28, 37, 78, 83, 93, 99, 100, 101, 103, 104, 105, 108, 109, 110, 111, 112, 117, 118, 121, 122, 124, 125, 126, 127, 129, 136, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 162, 163, 167, 171, 172, 177, 178, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 198, 202, 203, 207, 213, 216, 217, 231, 232, 235, 236, 237, 244, 245, 246, 256, 258, 261, 262, 263, 268, 272, 274, 279, 280, 285, 286, 287, 290, 293, 294, 298, 299, 300, 301, 302, 303, 306, 308, 309, 310, 311, 312, 316, 318, 323, 325, 326, 
327, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 341, 343, 344, 348, 349, 350, 352, 355, 356, 357, 363, 364, 370, 372, 373, 375, 376, 378, 379, 380, 382, 383, 384, 387, 392, 398, 400, 401, 405, 406, 412, 414, 422, 423, 425, 430, 431, 434, 435, 437, 442, 447, 453, 455, 457, 460, 462, 463, 464, 469, 471, 472, 477, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489], "i386": 8, "i_n": 1, "i_nw": 1, "i_s": 1, "i_sw": 1, "i_t": 349, "iclr": [459, 460, 461], "id": [6, 8], "idea": [482, 484], "idempot": [364, 376], "ident": [0, 112, 140, 280, 323, 373], "identifi": [2, 309, 479], "idim": 6, "idiom": [6, 480], "idx": [38, 483], "ie": [376, 481], "ieee": 316, "ignor": [5, 38, 93, 94, 136, 457], "ih": 384, "ii": 1, "ij": 207, "imag": [0, 331, 332, 334, 335, 337, 338, 401], "imagenet": [405, 406], "imaginari": 169, "immedi": [5, 359], "implement": [0, 1, 4, 6, 144, 145, 188, 340, 363, 379, 387, 389, 392, 398, 400, 401, 447, 456, 457, 458, 459, 461, 462, 463, 475, 480, 482], "impli": 300, "implicit": [477, 480, 482], "implicitli": 484, "import": [2, 3, 4, 5, 6, 8, 112, 167, 188, 264, 298, 309, 310, 311, 312, 313, 323, 325, 326, 327, 328, 348, 355, 356, 357, 366, 401, 422, 424, 431, 450, 453, 455, 480, 481, 482, 483, 484, 485, 486], "improv": [1, 2, 3, 5, 422, 456, 457, 458, 459, 460, 461, 467, 480, 481], "in_ax": [302, 482], "in_channel": [330, 331, 332, 333, 334, 335], "in_dim": [323, 453], "in_proj": 453, "inci": 2, "includ": [1, 2, 108, 109, 110, 111, 142, 210, 211, 216, 350, 360, 372, 382, 425, 455, 480, 482, 483, 486, 487, 489], "include_dir": 2, "inclus": [0, 41, 42, 43, 44, 108, 109, 110, 111, 158], "incom": 2, "inconveni": 480, "incorpor": 485, "incorrect": 485, "increas": 217, "increment": 18, "incur": [5, 8], "incx": 2, "independ": [120, 337, 338], "index": [0, 1, 2, 7, 9, 28, 38, 138, 140, 164, 207, 232, 286, 287, 298, 314], "indic": [0, 2, 16, 26, 27, 28, 29, 38, 162, 163, 164, 171, 172, 173, 174, 175, 176, 177, 190, 235, 274, 286, 287, 298, 373, 375, 424, 431, 
471, 483], "indices_or_sect": [71, 274], "indirectli": 485, "individu": [323, 337, 338], "ineffici": [483, 484], "inexact": [11, 177], "inf": [188, 224, 379], "infer": [7, 161, 193, 293, 481], "infin": [0, 173, 175, 176, 224, 355, 356, 357, 461], "infinit": [16, 171, 172], "info": [5, 8], "inform": [3, 5, 6, 8, 131, 209, 262, 263, 316, 323, 328, 341, 379, 482, 488], "inherit": [6, 479], "inifn": 173, "init": [323, 380, 450, 455, 469, 470, 472, 473, 481], "init_fn": [402, 403, 404, 405, 406, 407, 408, 409, 450], "init_valu": 1, "initi": [1, 3, 4, 5, 123, 312, 323, 328, 344, 348, 350, 352, 380, 383, 402, 403, 404, 405, 406, 407, 408, 409, 453, 464, 469, 470, 472, 473, 480, 481, 484], "initializer_list": 0, "inject": 0, "inlin": 0, "inner": [0, 480], "inorm": 348, "inp": [1, 142], "inp_ndim": 1, "inp_shap": 1, "inp_strid": 1, "inplac": [2, 8], "input": [0, 1, 2, 4, 5, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 115, 117, 118, 121, 122, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 220, 221, 222, 223, 224, 225, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 246, 249, 252, 253, 254, 255, 256, 257, 258, 259, 260, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 282, 283, 284, 286, 287, 288, 289, 290, 291, 292, 293, 294, 296, 297, 298, 299, 300, 302, 303, 305, 325, 326, 327, 328, 330, 331, 332, 333, 334, 335, 337, 338, 340, 342, 343, 344, 348, 349, 350, 352, 355, 356, 357, 379, 382, 383, 384, 387, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 415, 422, 423, 425, 
426, 427, 428, 430, 431, 433, 435, 447, 450, 480, 482, 483, 486, 487], "input_dil": [0, 101], "input_dim": [6, 323, 352, 382], "input_nam": [1, 142], "input_s": [343, 349, 384], "inputs1": 431, "inputs2": 431, "insert": [118, 138, 488], "insid": 480, "inspect": [3, 480, 486], "inspir": 7, "instabl": 435, "instal": 2, "instanc": [5, 38, 112, 236, 313, 323, 348, 359, 360, 361, 364, 366, 367, 368, 373, 376, 377, 378, 389, 453, 485], "instancenorm": 323, "instanti": [1, 2, 6, 484], "instantiate_axpbi": 2, "instead": [2, 8, 112, 323, 378, 392, 481, 482, 484], "int": [0, 1, 2, 5, 6, 9, 15, 17, 18, 26, 27, 28, 29, 33, 34, 35, 36, 41, 42, 43, 44, 45, 46, 49, 56, 57, 58, 59, 60, 63, 66, 68, 71, 74, 75, 76, 77, 78, 80, 83, 90, 91, 95, 98, 99, 100, 101, 102, 103, 104, 108, 109, 110, 111, 116, 117, 118, 125, 126, 127, 131, 138, 140, 144, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 163, 164, 168, 177, 184, 188, 192, 202, 204, 206, 209, 210, 211, 212, 215, 216, 217, 220, 222, 228, 231, 232, 234, 235, 236, 237, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 255, 256, 258, 259, 272, 273, 274, 277, 278, 279, 283, 284, 286, 287, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 302, 304, 306, 314, 323, 325, 326, 327, 328, 330, 331, 332, 333, 334, 335, 340, 342, 343, 344, 348, 349, 350, 352, 355, 356, 357, 379, 381, 382, 383, 384, 387, 392, 400, 415, 423, 424, 428, 433, 435, 453, 469, 471, 472, 473], "int16": 316, "int32": [0, 1, 11, 18, 38, 158, 177, 188, 247, 316, 401, 483, 486], "int64": [11, 316], "int8": [11, 316], "int_0": 134, "integ": [0, 11, 160, 162, 163, 177, 188, 209, 231, 236, 237, 239, 246, 247, 274, 286, 290, 302, 316, 340, 371, 471, 483], "integr": [18, 286, 484], "intend": [0, 480], "interact": 400, "interest": 488, "interfac": 2, "intermedi": 485, "intern": 328, "interpol": 401, "interv": [18, 192, 247, 251], "introduc": [0, 258], "intuit": 323, "invalid": [0, 83], "invers": [0, 19, 20, 21, 22, 23, 24, 25, 135, 149, 150, 
151, 152, 153, 154, 183, 187, 191], "invert": 0, "involv": [455, 480], "iogpu": 217, "ip": 481, "is_avail": 123, "is_equival": 2, "is_floating_point": 2, "is_leaf": [309, 310, 311, 312], "is_leaf_fn": 363, "isclos": 0, "isfinit": 0, "ish": 5, "ishmael": 5, "isinf": 0, "isnan": 0, "isneginf": 0, "isposinf": 0, "issu": [481, 482, 485], "issubdtyp": [11, 316], "item": [0, 2, 4, 5, 6, 310, 455, 484, 485, 486], "iter": [4, 6, 190, 310, 311, 477, 480, 484], "iterm": 8, "itertool": [5, 310], "its": [0, 1, 2, 8, 183, 203, 232, 249, 295, 307, 313, 323, 382, 455, 459, 460, 461, 481, 484, 485, 488], "itself": [2, 306, 464], "ix": 1, "ix_n": 1, "ix_nw": 1, "ix_s": 1, "ix_sw": 1, "iy_n": 1, "iy_nw": 1, "iy_s": 1, "iy_sw": 1, "j": [5, 8, 188, 337, 458, 459, 461], "j8": 2, "jacobian": [2, 178, 301, 486], "jain": 337, "jax": [7, 477], "jit": 142, "jmlr": 458, "jnp": 485, "john": 188, "join": 471, "join_schedul": 455, "jointli": 244, "just": [2, 6, 350, 480, 483], "jvp": [2, 112, 486], "k": [0, 5, 45, 90, 117, 140, 145, 162, 167, 292, 295, 296, 297, 352, 364], "kaim": 406, "keep": [2, 15, 17, 26, 27, 202, 204, 206, 220, 234, 279, 283, 299, 323, 363, 482, 484], "keepdim": [0, 15, 17, 26, 27, 33, 34, 35, 36, 56, 57, 58, 59, 63, 75, 76, 80, 188, 202, 204, 206, 220, 234, 272, 279, 283, 299], "kei": [1, 3, 5, 145, 209, 239, 240, 241, 243, 244, 245, 246, 247, 249, 250, 251, 309, 310, 363, 364, 376, 379, 464, 477, 479, 482], "kept": 217, "kernel": [2, 7, 8, 98, 99, 100, 101, 102, 103, 104, 142, 325, 355, 480, 483], "kernel_dil": [0, 101], "kernel_s": [325, 326, 327, 330, 331, 332, 333, 334, 335, 355, 356, 357], "key_cach": 5, "key_input_dim": 379, "key_proj": 5, "keyword": [164, 264, 265, 298, 310, 323, 477, 487, 489], "kind": 5, "kingma": [459, 461], "kl_div_loss": 323, "kname": 2, "know": [2, 5], "known": [390, 442], "kth": [0, 28, 232], "kullback": 428, "kw_onli": 2, "kwarg": [10, 120, 264, 265, 489], "l": [5, 6, 182, 183, 185, 186, 323, 328, 330, 333, 343, 349, 384, 434], "l1": [298, 
427, 429, 430, 434], "l1_loss": 323, "l2": [427, 430, 468], "l2_loss": 323, "l_": 427, "la": 188, "label": [3, 4, 424, 431], "label_smooth": 424, "lack": 483, "lambd": [345, 396, 416, 446], "lambda": [310, 311, 312, 323, 345, 359, 364, 371, 396, 416, 440, 446, 456, 457, 458, 459, 460, 461, 462, 467, 468, 480, 481, 482], "languag": [1, 2], "larg": [5, 323, 379, 430, 480, 481, 484], "larger": [1, 144, 217, 387, 462], "largest": [188, 224, 292], "lasso": 298, "last": [0, 1, 5, 29, 78, 141, 143, 148, 151, 153, 154, 156, 157, 158, 162, 163, 170, 182, 183, 185, 186, 187, 189, 190, 191, 203, 212, 240, 273, 290, 300, 330, 331, 332, 333, 334, 335, 337, 338, 344, 401, 485], "latenc": 481, "later": [3, 8, 455], "launch": [1, 2, 123, 481, 483], "layer": [7, 141, 306, 323, 325, 326, 327, 337, 338, 343, 344, 349, 350, 352, 355, 356, 357, 373, 378, 381, 382, 384, 389, 400, 449, 453], "layer_s": 6, "layernorm": 323, "layout": 1, "lazi": [7, 453, 486], "lazili": [5, 323], "lceil": 90, "ld": [343, 349, 384], "lead": [0, 18, 83, 480], "leaf": [94, 306, 309, 310, 311, 312, 363], "leaf_modul": 323, "leaki": [351, 419], "leaky_relu": 323, "leakyrelu": 323, "learn": [4, 6, 7, 328, 344, 348, 350, 380, 383, 455, 456, 457, 458, 459, 460, 461, 462, 467, 468], "learnabl": [330, 331, 332, 333, 334, 335, 389], "learning_r": [6, 455, 456, 457, 458, 459, 460, 461, 462, 464, 467, 468, 469, 470, 471, 472, 473, 480], "least": [5, 84, 85, 86, 93, 182, 183, 185, 186, 187, 189, 190, 191, 236], "leav": [2, 136, 310, 311, 312], "lectur": 467, "lecun": 337, "left": [0, 5, 144, 179, 188, 236, 258, 341, 387, 401, 413, 414, 425, 427, 435], "left_shift": 0, "leibler": 428, "len": [5, 148, 151, 154, 157, 167, 471], "length": [5, 277, 328, 330, 333, 343, 349, 384, 471], "leq": [427, 440], "less": [0, 1, 5, 28, 181, 217, 232, 387, 434], "less_equ": 0, "let": [1, 2, 4, 5, 183, 480, 482, 484, 485], "level": [0, 162, 163, 405, 406], "lh": [343, 349, 384], "lhs_indic": [0, 162, 163], "lhs_mask": 90, "lib": 481, 
"libmlx": 8, "libmlx_ext": 2, "libmpi": 481, "librari": [2, 8, 318, 323], "like": [2, 5, 7, 126, 177, 229, 305, 338, 430, 464, 466, 480, 481, 482, 484, 485, 486, 488], "likelihood": [425, 433], "limit": [0, 2, 93, 215, 216, 217, 483], "linalg": 167, "line": [5, 481, 484, 485], "linear": [0, 2, 5, 6, 7, 306, 310, 323, 329, 339, 341, 342, 351, 366, 382, 384, 385, 386, 388, 390, 401, 410, 411, 412, 413, 414, 415, 419, 438, 439, 440, 442, 450, 453, 464, 472, 480], "linear1": 5, "linear2": 5, "linear3": 5, "linear_schedul": [455, 471], "linearli": 379, "link": [2, 8], "linspac": 0, "lion": 455, "list": [1, 5, 10, 15, 17, 30, 71, 78, 83, 84, 85, 86, 91, 94, 95, 101, 131, 136, 142, 147, 148, 150, 151, 153, 154, 156, 157, 161, 164, 178, 188, 202, 204, 206, 207, 220, 228, 231, 234, 239, 240, 241, 243, 244, 245, 247, 250, 251, 262, 272, 274, 278, 279, 283, 290, 291, 294, 298, 299, 301, 304, 309, 312, 313, 323, 364, 366, 367, 368, 369, 374, 376, 377, 378, 453, 455, 459, 460, 461, 462, 471, 479, 480, 481, 482, 484], "liter": [2, 231, 401, 405, 406, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435], "littl": 5, "liu": 5, "live": [7, 142, 488], "ll": [1, 4, 6, 427, 480, 482], "llama": 5, "llamaattent": 5, "llamaencoderlay": 5, "llm": 7, "load": [6, 7, 318, 366, 481], "load_weight": [323, 484], "loader": 6, "loader_path": 2, "loan": 188, "loc": [1, 243, 245], "local": [323, 337, 481], "locat": [0, 2, 83, 377, 378, 481, 488], "log": [0, 196, 198, 202, 353, 354, 420, 421, 422, 425, 428, 430, 433, 445], "log10": 0, "log1p": 0, "log2": 0, "log_cosh_loss": 323, "log_sigmoid": [323, 353], "log_softmax": [323, 354], "logaddexp": 0, "logarithm": [0, 194, 195, 196, 197], "logcosh": 430, "logic": [0, 2, 199, 200, 201], "logical_and": 0, "logical_not": 0, "logical_or": 0, "logist": [0, 4, 268, 414, 442], "logit": [5, 240, 422, 424, 480], "logsigmoid": 323, "logsoftmax": 323, "logsumexp": 0, "long": 5, "longer": [5, 105, 482], "look": [2, 5, 481], "lookup": 340, "loop": 
[5, 6, 480, 481, 482, 484], "loshchilov": 460, "loss": [4, 6, 298, 323, 455, 480, 481, 482, 484], "loss_and_grad": 323, "loss_and_grad_fn": [6, 455, 480, 482], "loss_fn": [4, 6, 455, 480, 482], "loss_grad_fn": 481, "lot": [481, 482], "low": [247, 251, 409, 450], "low_pad_s": 0, "lower": [182, 183, 185, 186, 191, 236, 247, 250, 251, 295, 409], "lr": [4, 462], "lr_schedul": [469, 470, 471, 473], "lstm": 323, "lto": 2, "lu": 5, "luckili": 484, "lvalu": 298, "m": [0, 2, 5, 8, 90, 140, 162, 167, 188, 295, 456, 480], "m1": [1, 5, 480, 482, 488], "m10": 316, "m7": 316, "m_": [459, 460, 461, 462], "m_t": [459, 460, 461, 462], "mac": 481, "machin": [5, 7, 8, 467, 481], "maco": [8, 217], "macosx": 8, "made": [5, 318], "mai": [2, 188, 306, 337, 481, 482, 483], "main": [7, 118, 140, 142, 293, 310, 311, 323, 481], "maintain": [337, 338, 462], "major": [0, 2], "make": [1, 2, 3, 5, 6, 8, 203, 226, 267, 323, 469, 470, 472, 473, 480, 484, 486, 488], "make_shar": 2, "malloc_or_wait": 2, "man": 5, "manag": [281, 477, 481, 488], "mani": [2, 83, 274, 330, 331, 332, 333, 334, 335, 340, 381, 480, 481, 484], "manual": 323, "map": [2, 6, 38, 193, 310, 340, 359], "map_fn": [359, 363], "map_torch_to_mlx": 5, "margin": [431, 435], "margin_ranking_loss": 323, "mask": [0, 5, 90, 145, 373, 379, 483], "mask_lh": [0, 90], "mask_n": 1, "mask_nw": 1, "mask_out": [0, 90], "mask_rh": [0, 90], "mask_s": 1, "mask_sw": 1, "matadata": 193, "match": [8, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 210, 366, 401, 424, 481, 483, 485], "materi": [5, 7], "math": [5, 435, 480], "mathbf": 183, "mathcal": 352, "mathemat": 188, "mathrm": [134, 268, 348], "matmul": [0, 162, 488], "matric": [188, 189, 190], "matrix": [0, 4, 14, 45, 90, 116, 117, 140, 162, 163, 167, 168, 182, 183, 185, 186, 187, 188, 189, 190, 191, 203, 207, 236, 237, 244, 381, 382, 407, 450], "matter": [5, 323], "max": [0, 1, 2, 188, 205, 329, 355, 356, 357, 380, 410, 417, 418, 423, 425, 426, 431, 435, 437, 439, 457, 461, 480, 
482, 488], "max_buffer_s": 209, "max_freq": 392, "max_i": 236, "max_norm": 308, "max_recommended_working_set_s": [209, 217], "max_val": 417, "maximum": [0, 6, 26, 38, 93, 108, 212, 216, 308, 323, 351, 355, 356, 357, 385, 392, 413, 414, 419, 438, 453, 484], "maxpool1d": 323, "maxpool2d": 323, "maxpool3d": 323, "maxtotalthreadsperthreadgroup": 2, "mca": 481, "md": 188, "me": 5, "mean": [0, 1, 4, 5, 6, 143, 243, 244, 245, 298, 323, 328, 344, 364, 383, 408, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 480, 482, 485], "meant": 112, "measur": 488, "mechan": 400, "medic": 338, "meet": 8, "member": [2, 323, 369, 374], "memori": [0, 1, 2, 7, 83, 208, 210, 211, 212, 214, 215, 216, 217, 400, 453, 457, 480, 484, 485], "memory_order_relax": 1, "memory_s": [209, 217], "memoryview": [484, 485], "merg": 480, "meshgrid": 0, "metadata": [4, 193, 262, 263], "metal": [2, 7, 142], "metal_captur": 3, "metal_kernel": 1, "metal_path": 8, "metallib": [2, 8], "method": [2, 5, 9, 10, 30, 112, 120, 306, 314, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 370, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 453, 456, 457, 458, 459, 460, 461, 462, 464, 467, 468, 475], "millisecond": [8, 480, 488], "min": [0, 2, 188, 221, 329, 380, 410, 417, 418, 437, 439], "min_freq": 392, "min_i": 236, "min_val": 417, "mind": [2, 5], "mine": 5, "minibatch": 6, "minim": 481, "minimum": [0, 27, 38, 93, 109, 392, 422, 423], "minsizerel": 8, "minu": 139, "minut": 5, "mish": 323, "miss": [366, 487], "mix": 483, "mkdir": [3, 8], "ml": 8, "mlp": [6, 323, 400, 455], "mlp_dim": [5, 400], "mlx": [1, 3, 4, 5, 6, 8, 318, 323, 450, 453, 455, 477, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488], "mlx_build_benchmark": 8, "mlx_build_cpu": 8, "mlx_build_exampl": 8, "mlx_build_gguf": 8, "mlx_build_met": 
[2, 8], "mlx_build_metallib": 2, "mlx_build_python_bind": 8, "mlx_build_safetensor": 8, "mlx_build_test": 8, "mlx_disable_compil": [119, 132, 480], "mlx_ext": 2, "mlx_ext_metallib": 2, "mlx_include_dir": 2, "mlx_metal_debug": [3, 8], "mlx_metal_jit": 8, "mlx_sample_extens": 2, "mlx_trace": 3, "mnist": 6, "mode": [0, 1, 2, 105, 231, 362, 373, 375, 401, 405, 406, 481], "model": [4, 6, 7, 264, 306, 307, 310, 311, 323, 359, 362, 364, 366, 370, 373, 375, 376, 377, 379, 400, 450, 453, 455, 463, 464, 466, 480, 481, 484], "modest": 2, "modif": 485, "modifi": 485, "modul": [2, 5, 6, 306, 307, 389, 400, 450, 466, 479, 480, 484], "moment": [5, 457, 459, 460, 461], "momentum": [328, 462, 464, 468, 480], "monei": 5, "monoton": 436, "more": [1, 2, 3, 6, 10, 78, 118, 162, 182, 183, 185, 186, 187, 190, 191, 203, 215, 216, 262, 263, 316, 323, 328, 337, 387, 392, 400, 401, 403, 404, 405, 406, 422, 477, 480, 481, 482, 483, 486, 488], "most": [2, 145, 240, 323, 466, 480, 481, 482, 483, 484], "move": [0, 2, 222, 488], "moveaxi": 0, "mpi": 318, "mpiexec": 481, "mpirun": 481, "mse": 298, "mse_loss": 323, "mtl": 2, "mtl_capture_en": 3, "mtlcommandbuff": 2, "mu": 468, "much": [1, 2, 5, 325, 326, 327, 355, 356, 357, 480, 484], "multi": [7, 145, 330, 331, 332, 333, 334, 335, 483, 485], "multidimension": 207, "multiheadattent": [5, 323], "multipl": [0, 1, 8, 14, 90, 141, 143, 162, 163, 203, 223, 236, 237, 379, 392, 470, 471, 473, 480, 484, 487], "multipli": [0, 2, 38, 163, 236, 237, 336, 392, 401], "murtadha": 5, "must": [0, 1, 2, 3, 8, 90, 93, 144, 145, 161, 163, 185, 186, 188, 239, 240, 244, 247, 250, 251, 303, 401, 485], "mx": [1, 2, 3, 4, 5, 6, 38, 96, 97, 112, 123, 126, 142, 158, 177, 185, 186, 188, 189, 193, 246, 264, 298, 308, 323, 325, 326, 327, 328, 339, 348, 351, 355, 356, 357, 359, 366, 370, 385, 401, 402, 403, 404, 405, 406, 407, 408, 409, 411, 419, 422, 423, 424, 428, 431, 438, 448, 450, 453, 455, 477, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489], "my": [5, 8], "my_devic": 
489, "my_path": 264, "myexp": [1, 142], "myexp_strid": 1, "mymlp": 453, "n": [0, 1, 2, 5, 30, 90, 98, 99, 100, 101, 102, 103, 104, 140, 145, 146, 148, 149, 151, 152, 155, 157, 167, 168, 244, 279, 295, 299, 328, 330, 331, 332, 333, 334, 335, 337, 338, 343, 349, 384, 401, 430, 435, 481], "n_kv": 145, "n_q": 145, "n_t": 343, "naiv": [2, 482], "naive_add": 482, "name": [1, 2, 142, 163, 193, 236, 237, 262, 263, 264, 265, 323, 344, 363, 366, 368, 481, 483, 487], "named_modul": 323, "nan": [0, 16, 82, 171, 172, 174, 224], "nan_to_num": 0, "nanobind": [2, 400], "nanobind_add_modul": 2, "nativ": 8, "natur": [0, 194, 196, 484], "nb": 2, "nb_domain": 2, "nb_func": 400, "nb_modul": 2, "nb_static": 2, "nbyte": 2, "nc": 328, "ndarrai": [30, 483, 484, 486], "ndhwc": [332, 335, 338], "ndim": [0, 1, 2, 158, 188, 190, 401], "ne": 1, "nearest": [1, 401], "necessari": 323, "necessarili": 292, "need": [1, 2, 5, 6, 7, 8, 82, 236, 323, 377, 378, 392, 400, 477, 481, 482, 484, 485, 486, 488], "neg": [0, 118, 158, 175, 224, 258, 293, 351, 355, 356, 357, 379, 425, 433, 435, 483], "negat": [0, 225], "negative_slop": [351, 419], "neginf": [0, 224], "neighbor": 401, "neither": [164, 298], "nelem": 2, "nervou": 5, "nest": [78, 94, 312, 323, 453, 479, 482], "nesterov": 468, "network": [5, 7, 328, 337, 340, 403, 404, 450, 453, 467, 481], "neural": [5, 7, 340, 403, 404, 436, 450, 453, 467], "never": [5, 484], "new": [0, 2, 6, 91, 118, 222, 226, 256, 278, 294, 300, 310, 311, 371, 379, 453, 455, 466, 471, 480, 483, 484, 485], "new_tre": 311, "next": [2, 5, 6, 215], "nh": [343, 349, 384], "nhwc": [328, 331, 334], "nice": [482, 484], "nlc": [328, 330, 333], "nld": [343, 349, 384], "nlh": [343, 349, 384], "nll": [425, 433], "nll_loss": 323, "nn": [2, 5, 6, 264, 310, 323, 450, 453, 455, 464, 466, 480, 484], "nobodi": 5, "node": [94, 136, 302, 311, 312], "nois": 4, "noisi": 4, "nomins": 2, "non": [0, 1, 2, 8, 207, 374, 384, 436, 453], "none": [1, 2, 5, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 119, 121, 122, 125, 126, 127, 128, 129, 130, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 214, 218, 219, 220, 221, 222, 223, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 263, 264, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 302, 303, 304, 305, 306, 309, 310, 311, 312, 314, 325, 326, 327, 341, 355, 356, 357, 359, 363, 364, 371, 376, 379, 384, 392, 400, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 457, 475, 483], "nonlinear": [384, 480], "nonzero": 483, "noop": 376, "nor": [2, 164, 298], "norm": [5, 143, 308, 344, 435, 461, 462], "norm1": 5, "norm2": 5, "norm_first": 400, "normal": [1, 2, 4, 5, 141, 142, 143, 185, 244, 250, 323, 325, 326, 327, 328, 344, 348, 350, 355, 356, 357, 383, 400, 403, 405, 485, 488], "not_equ": 0, "notabl": [5, 7], "notat": [116, 309, 368], "note": [0, 1, 2, 5, 8, 16, 18, 83, 90, 94, 100, 103, 104, 112, 145, 153, 154, 163, 171, 188, 210, 236, 240, 300, 306, 323, 383, 401, 455, 485, 487], "noth": [5, 323, 484], "notic": [5, 482, 487], "now": [1, 2, 5, 8, 382, 480, 481, 485], "np": [1, 5, 6, 481, 485, 
486], "npy": [193, 261, 487], "npz": [5, 193, 264, 265, 366, 370, 487], "nuclear": 188, "nullopt": 0, "num": [0, 5, 192, 249], "num_class": [6, 455], "num_decoder_lay": 400, "num_embed": [340, 381], "num_encoder_lay": 400, "num_epoch": [6, 455], "num_exampl": 4, "num_featur": [4, 328], "num_group": 344, "num_head": [5, 379, 400], "num_it": 4, "num_lay": [5, 6, 455], "num_param": 323, "num_paramet": 380, "num_sampl": 240, "num_split": 0, "number": [0, 2, 11, 18, 61, 70, 94, 99, 100, 101, 103, 104, 116, 140, 145, 163, 164, 168, 178, 192, 224, 231, 236, 237, 240, 243, 245, 249, 251, 255, 258, 259, 290, 291, 295, 298, 301, 302, 306, 323, 328, 330, 331, 332, 333, 334, 335, 337, 338, 344, 348, 379, 380, 400, 401, 403, 404, 405, 406, 469, 471, 472, 477, 480, 482, 489], "number_of_el": 0, "numer": [5, 141, 143, 188, 198, 202, 272, 328, 344, 348, 350, 383, 422, 423, 425, 435, 456, 457, 458, 459, 460, 461, 467, 480, 484], "numpi": [2, 5, 6, 7, 13, 16, 18, 87, 88, 89, 91, 128, 129, 133, 165, 166, 171, 179, 180, 181, 198, 203, 205, 221, 223, 227, 233, 254, 257, 282, 484, 486, 487], "nw": 1, "nwhc": 337, "o": [2, 8, 145, 349], "o_t": 349, "obj": 262, "object": [3, 10, 30, 50, 78, 94, 142, 177, 264, 302, 309, 310, 311, 312, 316, 337, 400, 479], "observ": 5, "occupi": [116, 163, 236, 237], "occur": 485, "odim": 6, "odot": [343, 349], "off": [5, 8, 484], "offer": 430, "offset": [0, 1, 2, 5, 46, 83, 118, 141, 144, 293], "often": 338, "ok": [366, 482], "okai": [480, 484], "old": 5, "omit": [459, 461, 481], "onc": [2, 8, 480], "one": [0, 2, 5, 8, 38, 78, 84, 93, 99, 100, 101, 103, 104, 138, 140, 141, 143, 144, 188, 196, 203, 237, 240, 277, 282, 316, 376, 401, 424, 481, 488], "ones": [0, 2, 5, 229, 264, 295, 377, 378, 455, 481, 483], "ones_lik": 0, "onli": [1, 2, 5, 7, 8, 82, 90, 99, 100, 101, 103, 104, 185, 186, 188, 217, 236, 244, 300, 323, 363, 364, 366, 371, 373, 376, 377, 378, 453, 480, 481, 482, 487, 488], "onlin": 458, "op": [1, 2, 230, 300, 364, 484], "open": [3, 8, 18, 247, 
251], "openmpi": 481, "oper": [3, 5, 7, 9, 37, 84, 85, 86, 101, 145, 162, 163, 233, 235, 272, 280, 287, 314, 323, 400, 462, 480, 481, 482, 483, 484, 485, 486, 488, 489], "operand": [130, 131, 162], "opportun": 480, "opt": [463, 481], "optim": [1, 3, 4, 6, 7, 377, 480, 481, 482, 484], "option": [0, 3, 5, 14, 15, 17, 18, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 83, 84, 85, 86, 90, 94, 95, 98, 99, 100, 101, 102, 103, 104, 105, 108, 109, 110, 111, 112, 116, 117, 118, 121, 122, 123, 125, 126, 127, 140, 141, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 168, 175, 176, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 202, 204, 206, 207, 216, 220, 224, 228, 231, 232, 234, 236, 237, 239, 240, 241, 243, 244, 245, 246, 247, 249, 250, 251, 255, 256, 258, 272, 273, 274, 277, 278, 279, 283, 285, 286, 290, 292, 293, 294, 295, 296, 297, 298, 299, 302, 304, 306, 309, 310, 311, 312, 325, 326, 327, 328, 330, 331, 332, 333, 334, 335, 343, 349, 352, 355, 356, 357, 359, 363, 364, 366, 371, 376, 379, 381, 382, 384, 387, 392, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 456, 457, 458, 459, 460, 461, 462, 464, 467, 468, 469, 477, 480, 487, 489], "ord": 188, "order": [0, 1, 28, 83, 101, 131, 185, 186, 188, 232, 236, 292, 323, 344, 377, 389, 464, 480, 482], "ordinari": 170, "org": [344, 348, 350, 358, 383, 414, 436], "origin": [5, 118, 308, 328, 372, 403, 404, 405, 406, 456, 457, 458, 459, 461, 462, 485], "orthonorm": 167, "ostream": 2, "ostringstream": 2, "other": [0, 2, 5, 7, 177, 188, 323, 365, 453, 462, 480, 481, 483, 484, 486], "other_input": 323, "otherwis": [18, 101, 123, 216, 246, 306, 309, 310, 311, 312, 364, 366, 376, 398, 400, 401, 416, 422, 427, 434, 446, 447, 484, 485], "our": [1, 2, 5, 6, 389, 456, 457, 458, 
459, 461, 462, 481], "out": [0, 1, 2, 8, 90, 142, 337, 338, 373, 480, 481, 482, 483], "out_ax": [302, 482], "out_channel": [330, 331, 332, 333, 334, 335], "out_dim": [323, 453], "out_dtyp": 2, "out_idx": 2, "out_mask": 90, "out_proj": [5, 453], "out_ptr": 2, "out_shap": [1, 2], "outer": [0, 480, 484], "outlier": 430, "output": [0, 1, 2, 5, 8, 15, 16, 17, 18, 28, 83, 90, 91, 94, 96, 97, 108, 109, 110, 111, 112, 130, 140, 141, 142, 143, 144, 145, 152, 155, 156, 157, 161, 162, 164, 167, 168, 171, 188, 192, 202, 204, 206, 207, 220, 224, 228, 229, 232, 234, 235, 239, 240, 241, 243, 244, 245, 247, 250, 251, 264, 265, 272, 277, 279, 283, 287, 293, 295, 298, 299, 300, 301, 302, 303, 304, 305, 328, 330, 331, 332, 333, 334, 335, 348, 352, 379, 382, 398, 400, 401, 403, 404, 405, 406, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 447, 450, 480, 481, 482, 483, 484, 485, 486, 487, 488], "output_dim": [6, 323, 352, 382], "output_directori": 2, "output_dtyp": [1, 142], "output_fil": 5, "output_nam": [1, 142], "output_shap": [1, 142], "outsid": [142, 158], "over": [0, 2, 5, 6, 15, 17, 26, 27, 28, 29, 98, 99, 100, 101, 102, 103, 104, 108, 109, 110, 111, 148, 151, 154, 157, 170, 188, 190, 192, 202, 204, 206, 220, 232, 234, 260, 272, 273, 279, 283, 290, 292, 299, 328, 330, 331, 332, 333, 334, 335, 344, 350, 383, 424, 469, 472, 481, 482], "overal": 2, "overhead": [480, 484, 488], "overlap": 1, "overload": 18, "overrid": [2, 132], "overview": 3, "overwrit": 5, "own": [8, 485], "owndata": 485, "p": [8, 239, 323, 336, 337, 338, 435, 459, 461], "pack": [163, 236, 237], "packag": [2, 4, 6, 8, 318, 450, 481], "package_data": 2, "pad": [0, 1, 98, 99, 100, 101, 102, 103, 104, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 325, 326, 327, 330, 331, 332, 333, 334, 335, 355, 356, 357], "pad_valu": 0, "pad_width": [0, 231], "padding_hi": 0, "padding_lo": 0, "page": 486, "pain": 5, "pair": [0, 2, 231, 366, 387], "pairwis": 435, "pan": 5, "paper": [328, 392, 456, 
457, 458, 459, 461, 462], "parallel": [481, 488], "param": [298, 323, 450, 482], "paramet": [0, 1, 2, 4, 5, 6, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 37, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 215, 216, 217, 218, 220, 221, 222, 223, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 348, 349, 350, 351, 352, 355, 356, 357, 359, 360, 363, 364, 366, 371, 372, 373, 376, 377, 378, 379, 380, 381, 382, 383, 384, 387, 389, 392, 396, 398, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 415, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 447, 449, 450, 453, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 466, 467, 468, 469, 470, 471, 472, 473, 475, 480, 481, 482, 484], "parameter_scal": 457, "parametr": [380, 437], "pars": 5, "parse_arg": 5, "parser": 5, "part": [1, 2, 169, 252, 482, 483], "partial": [377, 378, 480, 484], "particip": [121, 122, 125, 126, 127], "particular": [236, 344], "particularli": 480, 
"partit": [0, 28], "pass": [1, 2, 5, 6, 65, 79, 230, 231, 298, 307, 309, 310, 311, 323, 364, 376, 377, 378, 389, 480, 481, 484], "password": 481, "path": [3, 8, 131, 218, 264, 265, 306, 311, 366, 481], "pattern": [323, 484], "peak": [212, 214], "penalti": 468, "pep": 485, "per": [5, 6, 116, 145, 163, 236, 237, 306, 328, 344, 348, 350, 383, 475, 480, 481, 484], "perceptron": 7, "perf_count": 480, "perfectli": 484, "perform": [0, 1, 2, 3, 5, 7, 14, 90, 101, 108, 109, 110, 111, 127, 130, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 162, 163, 167, 185, 186, 203, 237, 259, 272, 286, 323, 344, 400, 405, 406, 455, 480, 481, 483, 484, 488], "perhap": [2, 5], "perm": 6, "permtuat": 246, "permut": [0, 6], "persist": 8, "pg": 188, "phi": [341, 412], "physic": 481, "pi": [134, 341, 392, 413, 482], "pick": 2, "pip": [2, 8], "pipelin": 2, "pixel": 337, "place": [2, 5, 38, 258, 259, 306, 481, 484, 485], "placehold": 480, "plai": [2, 5], "plain": 389, "plan": [2, 480], "platform": 8, "plu": [0, 196], "point": [0, 2, 4, 5, 8, 83, 160, 237, 316], "pointer": 2, "pool": [325, 326, 327, 355, 356, 357, 488], "popul": 2, "portion": 336, "posinf": [0, 224], "posit": [0, 5, 28, 118, 144, 158, 164, 176, 182, 183, 222, 224, 232, 244, 258, 293, 298, 310, 323, 330, 331, 332, 333, 334, 335, 379, 387, 392, 425, 435], "possibl": [274, 340, 381, 480, 481, 483, 488], "possibli": [5, 14, 90, 162, 203, 308], "postur": 5, "potenti": 216, "power": [0, 482, 485], "practic": [2, 480], "pre": [8, 145, 422], "preced": 344, "precis": [0, 2, 5, 139, 145, 323, 341, 383, 422, 463, 480], "preclud": 323, "pred": [426, 430], "predic": [306, 371], "predict": [422, 425, 426, 427, 428, 429, 430, 432, 433, 434], "prefix": [302, 309], "prelu": 323, "prepar": [2, 5], "prepend": [3, 203], "preprint": [5, 456, 462], "preprocessor": 8, "present": 1, "preserv": [256, 482], "press": [5, 188], "pressur": 2, "pretti": [480, 484], "prevent": [280, 435, 485], "previou": [215, 216, 217], "primal": [1, 2, 112, 
178, 301], "primit": 482, "print": [1, 2, 4, 5, 6, 8, 308, 309, 310, 311, 313, 323, 477, 480, 481, 482, 483, 484, 485, 486], "prior": [235, 286, 287], "priorit": 482, "privat": 2, "prng": [239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 477], "prob": 422, "probabl": [8, 247, 336, 337, 338, 382, 422, 424, 428, 488], "problem": [4, 6, 323], "process": [5, 101, 105, 120, 121, 122, 123, 125, 126, 127, 310, 311, 338, 340, 400, 479, 481], "processor": 8, "prod": [0, 1], "produc": [0, 2, 8, 94, 379, 450], "product": [0, 2, 14, 83, 110, 170, 178, 184, 203, 230, 234, 290, 301, 379, 486], "profil": 3, "program": [212, 481], "programmat": 378, "project": [3, 5, 379], "project_source_dir": 2, "promot": 2, "promote_typ": 2, "promoted_dtyp": 2, "prompt": 5, "propag": [482, 483], "properti": [31, 38, 47, 51, 61, 62, 68, 70, 372, 375, 465, 482], "proportion": 308, "protocol": 485, "provid": [0, 2, 5, 83, 116, 164, 246, 258, 290, 298, 310, 312, 318, 323, 359, 364, 366, 376, 377, 378, 381, 382, 400, 401, 449, 453, 481, 487, 489], "pseudo": 477, "pth": 5, "public": [2, 323], "pun": 0, "pure": [1, 323, 455], "purpos": [1, 188], "purs": 5, "push": 2, "push_back": 2, "put": [0, 1, 6, 235, 480, 481], "put_along_axi": 0, "py": [2, 5, 8, 481], "pypi": 8, "python": [1, 3, 5, 50, 68, 78, 136, 309, 310, 311, 312, 313, 453, 463, 464, 466, 479, 481, 482, 485], "python_requir": 2, "pytorch": [5, 7, 341, 344, 482], "pytorch_compat": 344, "q": [145, 189], "qualifi": 481, "quantiz": [0, 116, 163, 193, 237, 381, 382], "quantized_matmul": 0, "quantizedembed": 323, "quantizedlinear": 323, "quarter": 5, "queri": [5, 145, 217, 379], "query_input_dim": 379, "query_proj": 5, "question": [5, 484], "queue": 3, "quick": [2, 7], "quit": [482, 485], "quotient": [0, 128, 129, 160], "r": [2, 5, 189, 298, 337, 343], "r_t": 343, "race": 488, "radian": [0, 115], "rag": 5, "rain": 5, "rais": [0, 5, 188, 216, 233, 274, 366], "ram": 5, "random": [1, 2, 3, 4, 5, 6, 7, 142, 325, 326, 327, 328, 348, 355, 
356, 357, 366, 373, 480, 482, 488, 489], "randomli": [4, 5, 246, 336, 337, 338], "rang": [0, 2, 3, 4, 5, 6, 8, 18, 158, 162, 192, 404, 406, 413, 414, 455, 469, 470, 471, 472, 473, 477, 480, 482, 484, 488], "rank": [0, 125, 126, 127, 431, 481], "rate": [4, 455, 456, 457, 458, 459, 460, 461, 462, 467, 468], "rather": [2, 482, 488], "ratio": [0, 24], "rceil": 90, "re": [6, 8, 450], "readabl": 3, "readi": 2, "real": [0, 152, 153, 154, 155, 156, 157, 182, 183, 185, 186], "realli": 350, "reason": [1, 5, 483], "reboot": 8, "receiv": [125, 126, 306, 471, 485], "reciproc": [0, 260], "reclaim": 215, "recommend": [8, 216, 462], "recompil": [94, 480], "record": [3, 212, 484], "recreat": [313, 455], "rectifi": [351, 385, 386, 405, 406, 419, 438, 439], "recurr": [343, 349, 384], "recurs": [323, 363, 364, 369, 374, 376, 453], "recv": 126, "redirect": 2, "reduc": [0, 1, 8, 15, 17, 26, 27, 122, 202, 204, 206, 220, 234, 279, 283, 299, 312, 328, 400, 430], "reduct": [15, 17, 122, 202, 204, 220, 234, 312, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 481], "redund": 482, "refer": [188, 348, 358, 372, 403, 404, 405, 406, 414, 436, 483], "reflect": [372, 480, 483, 485], "regard": 341, "regardless": [83, 145], "regist": [2, 6], "register_librari": 2, "regress": [7, 430], "regular": [38, 337, 436, 460, 480, 483], "regularli": 2, "reimplement": 2, "rel": [16, 171, 457, 480], "relative_step": 457, "relax": 216, "relev": 2, "reli": [1, 2], "relu": [323, 380, 400, 437, 450], "relu6": 323, "remain": [0, 5, 217, 298, 311, 336, 337, 338, 481], "remaind": [0, 129], "remov": [0, 118, 203, 240, 277, 424], "rep": [0, 291], "repeat": [0, 291], "repeatedli": 4, "repetit": 255, "replac": [0, 5, 224, 377, 378, 400, 434], "replai": 3, "repli": 5, "repo": [4, 6, 8, 480], "report": [210, 216], "repres": [2, 5, 120, 123, 163, 431, 435, 485], "represent": [5, 236, 300, 309, 313], "request": 2, "requir": [1, 2, 5, 323, 481, 484, 485], "requires_grad": 482, "rerun": [480, 484], 
"rescal": 308, "research": 7, "reset": 214, "reset_peak_memori": 212, "reshap": [0, 5, 188, 401, 483], "resid": 217, "resolv": 2, "resourc": 2, "respect": [2, 4, 6, 141, 143, 162, 163, 164, 236, 298, 310, 323, 328, 341, 344, 348, 350, 453, 482, 486], "respons": 2, "rest": [5, 144, 310, 311, 387], "restart": 8, "restor": 258, "result": [0, 5, 14, 18, 38, 78, 83, 94, 141, 143, 163, 188, 203, 237, 244, 255, 278, 310, 311, 312, 392, 422, 480, 482, 485], "resum": 5, "return": [0, 1, 2, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 37, 50, 68, 78, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 211, 215, 216, 217, 220, 221, 222, 223, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 259, 260, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 323, 343, 349, 359, 360, 361, 363, 364, 365, 366, 367, 368, 369, 373, 374, 376, 377, 378, 384, 402, 403, 404, 405, 406, 407, 408, 409, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 450, 453, 463, 479, 480, 481, 482, 483, 484, 485, 487, 488], "return_metadata": 193, "revers": [0, 2, 41, 42, 43, 44, 83, 108, 109, 110, 111, 294, 392], "rf": 8, "rfft": 152, "rfft2": 153, 
"rfftn": 154, "rho": 456, "rhs_indic": [0, 162, 163], "rhs_mask": 90, "right": [0, 1, 2, 8, 236, 257, 258, 341, 401, 413, 414, 425, 427, 435], "right_shift": 0, "rm": [5, 8, 143, 457], "rmsnorm": [5, 323], "rmsprop": 455, "rnn": [323, 343], "roadcast": 247, "robust": 430, "roform": [5, 387], "roll": 0, "root": [0, 5, 143, 260, 275, 383], "rope": [5, 323], "rosetta": 8, "rotari": [5, 144, 387], "rotat": [144, 387], "round": [0, 236], "routin": 2, "row": [0, 1, 2, 83, 140, 142, 168, 236, 295], "row_contigu": 2, "rpath": 2, "rsqrt": 0, "rtol": [0, 16, 171], "rule": [2, 455], "run": [1, 2, 3, 5, 6, 7, 8, 9, 142, 230, 314, 328, 359, 456, 457, 459, 460, 461, 480, 481, 484, 488, 489], "runtim": [5, 123, 318, 480, 481], "runtime_error": 2, "safetensor": [8, 193, 263, 366, 370, 455, 484, 487], "sai": [2, 5, 450, 484], "said": 5, "sake": 482, "same": [0, 2, 5, 8, 16, 38, 82, 91, 94, 99, 100, 101, 103, 104, 105, 121, 141, 143, 152, 155, 156, 157, 163, 164, 171, 178, 231, 240, 258, 259, 300, 301, 303, 311, 323, 326, 327, 328, 336, 344, 348, 356, 357, 381, 402, 403, 404, 405, 406, 407, 408, 409, 424, 435, 453, 463, 477, 480, 481, 483, 488], "sampl": [2, 4, 5, 192, 239, 240, 241, 243, 244, 247, 250, 251, 403, 404, 405, 406, 408, 409, 425, 431, 435, 477, 480], "sat": 5, "save": [3, 5, 7, 193, 218, 236, 262, 263, 264, 265, 370, 484], "save_gguf": 487, "save_safetensor": [370, 455, 487], "save_weight": 323, "savez": [5, 370, 487], "savez_compress": 487, "saw": [5, 482], "scalar": [0, 2, 13, 14, 16, 30, 50, 78, 82, 87, 88, 89, 90, 91, 93, 128, 129, 133, 160, 161, 164, 165, 166, 167, 171, 179, 180, 181, 192, 198, 199, 200, 201, 203, 205, 221, 223, 224, 227, 231, 233, 239, 247, 250, 251, 254, 257, 262, 282, 298, 300, 303, 307, 435, 482, 484, 486], "scale": [0, 2, 5, 14, 116, 141, 143, 144, 145, 163, 167, 236, 237, 243, 245, 308, 337, 338, 350, 379, 387, 388, 392, 401, 440, 457], "scale_arr": 2, "scale_factor": 401, "scale_paramet": 457, "scatter": 0, "scatter_add": 0, "scatter_max": 
0, "scatter_min": 0, "scatter_prod": 0, "schedul": [2, 216, 455, 469, 470, 471, 472, 473, 475, 488], "schema": 3, "scipi": 167, "scope": 323, "score": [5, 145, 431], "sdk": 8, "se": 1, "second": [5, 8, 118, 177, 179, 199, 201, 203, 257, 284, 293, 298, 326, 327, 356, 357, 423, 431, 457, 459, 460, 461, 482, 488], "second_layer_a": 484, "second_layer_b": 484, "secret": 5, "section": [1, 5, 8, 274, 435, 480, 481, 482], "see": [1, 2, 5, 6, 8, 10, 11, 32, 33, 34, 35, 36, 39, 40, 41, 42, 43, 44, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 188, 215, 262, 263, 306, 316, 323, 328, 329, 337, 339, 341, 345, 346, 347, 353, 354, 362, 380, 381, 382, 385, 386, 387, 388, 390, 392, 393, 394, 395, 396, 397, 399, 401, 403, 404, 405, 406, 412, 413, 414, 440, 480, 481, 482, 483, 486, 488], "seed": 242, "seen": 485, "select": [0, 3, 8, 185, 186, 292, 303, 359, 363, 371], "self": [5, 6, 9, 30, 31, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 112, 314, 323, 436, 453], "selu": 323, "semant": [13, 87, 88, 89, 91, 128, 129, 133, 165, 166, 179, 180, 181, 198, 203, 205, 221, 223, 227, 233, 254, 257, 282, 488], "semi": [182, 183, 244], "send": 481, "sennrich": 5, "sensit": 430, "sentencepiec": 5, "separ": [5, 65, 79, 344, 431], "sequenc": [5, 15, 17, 33, 34, 56, 57, 58, 59, 63, 71, 74, 75, 76, 80, 83, 91, 101, 125, 138, 142, 147, 148, 150, 151, 153, 154, 156, 157, 161, 202, 204, 206, 220, 228, 234, 239, 240, 241, 243, 244, 245, 247, 250, 251, 256, 272, 274, 277, 279, 283, 290, 291, 294, 299, 304, 328, 330, 333, 343, 349, 384, 400, 477, 488], "sequenti": [323, 450], "seri": 8, "serial": 455, "set": [2, 5, 6, 8, 94, 112, 119, 121, 122, 123, 125, 126, 127, 132, 141, 144, 209, 215, 216, 217, 266, 267, 281, 341, 350, 352, 362, 364, 371, 372, 373, 376, 377, 382, 387, 398, 423, 435, 447, 453, 455, 457, 464, 
477, 482, 484], "set_byt": 2, "set_compute_pipeline_st": 2, "set_data": 2, "set_default_devic": 2, "set_dtyp": 323, "set_input_arrai": 2, "set_memory_limit": 215, "set_output_arrai": 2, "set_vector_byt": 2, "setup": [2, 4, 6, 8, 480], "sever": [5, 8, 98, 99, 100, 101, 102, 103, 104, 264, 265, 480, 487], "sgd": [4, 6, 455, 462, 464, 469, 470, 473, 480], "shade": [1, 2], "shall": 5, "shape": [0, 2, 3, 5, 6, 65, 82, 83, 90, 91, 94, 98, 99, 100, 101, 102, 103, 104, 118, 121, 125, 126, 142, 145, 146, 149, 152, 155, 156, 157, 161, 162, 167, 178, 187, 191, 203, 228, 229, 239, 240, 241, 243, 244, 245, 247, 250, 251, 256, 258, 300, 301, 303, 304, 305, 323, 325, 326, 327, 328, 330, 331, 332, 333, 334, 335, 337, 338, 343, 348, 349, 352, 355, 356, 357, 366, 384, 402, 403, 404, 405, 406, 407, 408, 409, 424, 435, 455, 480, 482, 483, 486, 488], "shapeless": [0, 94], "share": [7, 116, 163, 236, 237, 300, 481], "shazeer": 5, "shift": [0, 179, 257, 258, 328], "shop": 5, "should": [1, 2, 4, 5, 6, 8, 83, 118, 121, 141, 142, 143, 145, 178, 208, 217, 218, 235, 236, 287, 293, 298, 301, 306, 309, 323, 330, 331, 332, 333, 334, 335, 337, 338, 373, 379, 389, 424, 426, 431, 453, 479, 480, 481, 482, 484, 485, 489], "show": [8, 316, 480], "shown": 2, "shuffl": 6, "side": [0, 231, 325, 326, 327, 355, 356, 357, 480], "sigma": [341, 342, 343, 349, 391, 403, 404, 405, 406, 414, 415, 420, 441, 442], "sigmoid": [0, 5, 323, 353, 390, 414, 420, 422, 442], "sign": [0, 16, 171, 316, 462], "signal": [105, 401], "signatur": [1, 142], "signedinteg": [11, 177], "signific": 236, "silent": [155, 156, 157], "silicon": [2, 5, 7, 8, 488], "silu": 323, "simd": 1, "simd_sum": 1, "simdgroup": 1, "simdgroup_s": 1, "similar": [5, 163, 177, 310, 377, 378, 379, 423, 485, 487], "similarli": [2, 8, 203, 482, 484], "simpl": [2, 5, 6, 323, 340, 449, 455, 480, 481, 482, 484], "simple_axpbi": 2, "simple_tim": 2, "simplest": [2, 323, 481], "simpli": [2, 5, 8, 339, 351, 385, 411, 419, 438, 448, 453, 480, 481, 482], "simplic": 
0, "simultan": 1, "sin": [0, 112, 392, 482, 486], "sinc": [1, 2, 5, 6, 163, 212, 453, 462, 471, 485, 488], "sine": [0, 21, 22, 270, 271, 482], "sing": 188, "singer": 458, "singl": [2, 6, 136, 178, 193, 207, 231, 301, 326, 327, 356, 357, 480, 483, 487], "singleton": [0, 15, 17, 26, 27, 123, 202, 203, 204, 206, 220, 234, 279, 283, 299], "singular": [188, 190], "sinh": 0, "sinusoid": 392, "sinusoidalpositionalencod": 323, "size": [0, 1, 2, 5, 6, 51, 68, 90, 99, 100, 103, 104, 116, 138, 141, 142, 143, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 161, 163, 167, 168, 177, 184, 188, 211, 216, 217, 236, 237, 240, 256, 274, 277, 300, 306, 323, 325, 326, 327, 330, 331, 332, 333, 334, 335, 340, 348, 355, 356, 357, 381, 382, 401, 457, 481, 484, 485], "size_in_megabyt": 217, "size_t": [0, 2], "skip": [3, 83], "slice": [0, 483], "slice_s": 0, "slice_upd": 0, "slide": [325, 326, 327, 355, 356, 357], "slight": [5, 484], "slightli": [387, 488], "slope": 351, "slot": 481, "slow": 480, "slowli": 5, "small": [5, 139, 141, 143, 328, 344, 350, 383, 425, 430, 435, 480, 481, 488], "smaller": [0, 8, 232, 462, 480], "smallest": 188, "smile": 5, "smooth": [424, 434, 467], "smooth_l1_loss": 323, "sned": 127, "snippet": 481, "so": [1, 2, 5, 8, 164, 167, 298, 336, 401, 455, 480, 481, 484, 488], "softmax": [0, 5, 145, 323, 354, 421, 424], "softmin": 323, "softplu": [323, 358, 436], "softshrink": 323, "softsign": 323, "solv": 323, "some": [0, 2, 4, 5, 6, 364, 376, 455, 464, 480, 482, 484], "someon": 5, "someth": [4, 5, 483], "sonoma": 8, "soon": 5, "sort": [0, 28, 29, 232, 292], "sourc": [0, 1, 2, 3, 60, 125, 126, 142, 222, 294, 481], "space": [0, 2, 192, 422, 433], "spars": [0, 207], "spatial": [99, 100, 101, 103, 104, 325, 326, 327, 344, 355, 356, 357, 401], "speak": [5, 188], "special": 2, "specif": [1, 2, 8, 481, 482], "specifi": [0, 2, 18, 37, 99, 100, 101, 103, 104, 118, 153, 154, 161, 164, 184, 188, 192, 222, 228, 235, 240, 255, 284, 286, 287, 290, 293, 294, 298, 302, 
304, 328, 398, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 447, 481, 482, 488], "speed": [1, 2], "spent": 5, "split": [0, 342, 344, 415], "splittabl": 477, "sqrt": [0, 5, 134, 145, 167, 328, 341, 344, 348, 350, 352, 383, 392, 403, 404, 405, 406, 413, 456, 458, 459, 460, 467, 480], "squar": [0, 4, 5, 143, 168, 187, 191, 260, 275, 298, 310, 323, 383, 432, 434, 456, 457, 459, 460, 461, 482, 485], "squeez": [0, 401, 480], "src": [0, 125, 126], "ssh": 481, "stabil": [141, 143, 328, 344, 348, 350, 383, 422, 423, 425, 456, 457, 458, 459, 460, 461, 467], "stabl": [198, 202, 272, 430], "stable_abi": 2, "stack": [0, 480], "standard": [0, 1, 6, 50, 78, 203, 241, 245, 279, 400, 403, 405, 408, 481, 486], "starmap": [5, 310], "start": [0, 1, 2, 4, 5, 7, 8, 18, 144, 192, 218, 274, 312, 480, 483, 488], "start_axi": [0, 49, 158], "start_captur": 3, "state": [5, 6, 323, 343, 349, 384, 455, 464, 477, 480], "static": 8, "static_cast": 2, "std": [0, 2, 408], "step": [0, 3, 5, 6, 18, 323, 343, 349, 384, 457, 464, 469, 471, 472, 473, 480, 481], "step_decai": 455, "step_siz": 473, "still": [5, 8, 188, 480, 484], "stochast": [458, 459, 461, 468, 484], "stood": 5, "stop": [0, 2, 5, 18, 192, 219, 280, 482, 483], "stop_captur": 3, "stop_gradi": [0, 482], "storag": 83, "store": 5, "str": [2, 105, 130, 131, 142, 164, 185, 186, 188, 193, 207, 209, 218, 261, 262, 263, 264, 265, 298, 309, 313, 359, 360, 363, 364, 366, 368, 370, 376, 401, 405, 406, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435], "straight": 5, "strang": 5, "stream": [2, 7, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 114, 115, 116, 117, 118, 121, 122, 125, 126, 127, 128, 
129, 130, 133, 134, 135, 137, 138, 139, 140, 141, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 243, 244, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 299, 300, 303, 304, 305, 481, 488], "streamcontext": 281, "streamordevic": [0, 2], "street": 5, "strength": [462, 468], "strict": [123, 165, 180, 364, 366, 376], "strictli": [188, 217], "stride": [0, 2, 83, 98, 99, 100, 101, 102, 103, 104, 325, 326, 327, 330, 331, 332, 333, 334, 335, 355, 356, 357, 387, 483], "string": [0, 2, 131, 142, 209, 231, 485, 487], "structur": [2, 463, 482], "stub": 8, "style": [2, 13, 16, 87, 88, 89, 128, 129, 133, 165, 166, 171, 179, 180, 181, 198, 203, 205, 221, 223, 227, 233, 254, 257, 282], "su": 5, "sub": [0, 6, 118, 249, 293, 306], "subarrai": [118, 274], "subclass": 453, "subdivid": 1, "subdtyp": 177, "subgradi": 458, "sublinear": 457, "submodul": [5, 6, 323, 360, 364, 365, 376, 378], "subscript": [130, 131], "subsect": 5, "subsequ": 455, "subset": [323, 363], "substanti": 8, "subtract": [0, 38], "subtyp": [177, 316], "sudo": [8, 217], "sum": [0, 2, 4, 13, 111, 122, 170, 188, 202, 272, 290, 293, 323, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 481, 483, 485], "sum_": [188, 430], "sum_i": 421, "sum_j": [443, 444], "summat": [130, 131], "super": [5, 6, 323, 453], "superset": [310, 463], "support": [1, 2, 5, 7, 8, 16, 90, 100, 103, 104, 145, 158, 167, 171, 182, 183, 185, 186, 187, 189, 190, 191, 193, 203, 236, 244, 
481, 482, 483, 485, 487], "suppos": [482, 488], "sure": [2, 3, 5, 8, 323, 480], "surpass": [405, 406], "surpris": 5, "sw": 1, "swap": [0, 105, 216, 284, 378], "swapax": [0, 112], "swiglu": 5, "swish": [390, 442], "switch": 8, "symbol": 462, "symmetr": [99, 100, 103, 104, 182, 183, 185, 186], "symmetri": [185, 186], "synchron": [2, 480], "syntax": [38, 483], "synthet": 4, "sysctl": 217, "system": [5, 8, 209, 210, 211, 217], "t": [0, 1, 2, 5, 8, 134, 142, 145, 163, 182, 183, 237, 298, 323, 343, 349, 384, 456, 457, 458, 459, 460, 461, 462, 467, 468, 480, 482, 488], "t_kv": 145, "t_q": 145, "tabl": [1, 188, 316, 340], "take": [0, 2, 5, 6, 87, 88, 89, 94, 162, 164, 178, 205, 221, 229, 237, 287, 298, 301, 302, 305, 311, 312, 325, 326, 327, 355, 356, 357, 379, 422, 477, 481, 482, 483, 487, 488, 489], "take_along_axi": [0, 483], "taken": [118, 286, 293], "talk": 481, "tan": 0, "tangent": [0, 2, 23, 24, 25, 112, 178, 288, 289, 399, 448], "tangent_i": 2, "tangent_x": 2, "tanh": [0, 323, 341, 343, 349, 358, 384, 413, 436], "target": [2, 298, 422, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 480], "target_include_directori": 2, "target_link_librari": 2, "target_link_opt": 2, "target_sourc": 2, "task": [216, 430], "tau": 468, "tcp": 481, "tell": [5, 480, 485], "temp": 5, "templat": [0, 1, 2, 142], "ten": 484, "tend": 462, "tensor": [193, 290, 435, 485], "tensordot": 0, "term": [2, 425, 456, 457, 458, 459, 460, 461, 467], "termin": 8, "test": [6, 8, 481], "test_imag": 6, "test_label": 6, "text": [5, 341, 343, 349, 358, 384, 391, 398, 403, 404, 405, 406, 413, 416, 417, 418, 425, 426, 427, 430, 431, 434, 436, 437, 440, 441, 446, 447, 457, 462], "textrm": [236, 341, 342, 412, 415], "tf": 485, "tgp_size": 2, "th": [108, 109, 110, 111, 117, 140, 185, 471], "than": [1, 2, 5, 78, 105, 118, 129, 144, 162, 165, 166, 180, 181, 182, 183, 185, 186, 187, 190, 191, 203, 215, 217, 308, 310, 387, 398, 401, 431, 434, 447, 457, 462, 480, 482, 488], "thank": 484, "thei": [1, 2, 4, 5, 8, 
16, 105, 163, 171, 389, 426, 453, 462, 479, 480, 481, 484, 486, 487, 488], "them": [0, 2, 5, 121, 323, 364, 376, 481, 488], "themselv": [2, 480], "thi": [0, 1, 2, 5, 6, 8, 15, 16, 17, 18, 26, 27, 28, 29, 83, 112, 132, 142, 162, 163, 167, 171, 178, 182, 183, 185, 186, 187, 188, 189, 190, 191, 198, 202, 203, 204, 206, 208, 210, 217, 220, 232, 234, 240, 267, 272, 273, 274, 279, 283, 286, 292, 299, 308, 311, 312, 323, 336, 337, 338, 342, 343, 349, 360, 361, 363, 364, 367, 368, 369, 374, 376, 377, 378, 379, 382, 384, 398, 403, 404, 405, 406, 413, 414, 415, 422, 430, 447, 453, 464, 479, 480, 481, 482, 484, 485, 487], "thing": [2, 5, 481], "third": [184, 327, 357], "thompson": 337, "those": [2, 5, 323], "though": [2, 5, 480, 484, 485], "thousand": 484, "thread": [1, 2], "thread_index_in_simdgroup": 1, "thread_position_in_grid": [1, 2, 142], "threadgroup": [1, 2, 142], "threads_per_simdgroup": 1, "three": [5, 86, 327, 357, 401], "threefri": 477, "threshold": [398, 427, 434, 447], "through": [1, 2, 280, 400, 462, 480, 482, 485], "throw": [2, 94, 123], "thu": [5, 323], "thumb": 455, "tic": 480, "tieleman": 467, "tile": [0, 145], "time": [2, 5, 8, 216, 291, 323, 343, 349, 384, 480, 482, 484, 488], "timeit": [480, 482], "titl": 2, "tmp": [1, 142], "to_quant": 306, "to_stream": 2, "toc": 480, "togeth": [0, 1, 2, 6, 236, 310, 311, 481], "tok_embed": 5, "token": [5, 340, 381], "told": 5, "toler": [0, 16, 171], "too": [177, 480, 484], "took": 5, "tool": 8, "top": [2, 292, 352, 401], "topk": 0, "torch": [5, 485], "torch_weight": 5, "total": [217, 482], "total_norm": 308, "tpi": 480, "trace": [0, 3, 480], "trace_fil": 3, "tracer": 377, "track": [2, 323, 328], "track_running_stat": 328, "trade": 484, "tradit": [5, 144, 337, 338, 387], "train": [5, 6, 323, 328, 336, 337, 338, 362, 364, 376, 403, 404], "train_imag": [6, 455], "train_label": [6, 455], "trainabl": [6, 307, 323, 453], "trainable_paramet": [323, 363, 464], "transform": [1, 5, 7, 112, 146, 147, 148, 149, 150, 151, 152, 153, 
154, 155, 156, 157, 167, 307, 323, 328, 344, 350, 352, 363, 364, 376, 382, 387, 483], "transformerencod": 264, "transit": 471, "translat": [141, 350], "transpos": [0, 5, 31, 102, 103, 104, 163, 237, 333, 334, 335], "treat": [0, 2, 153, 154, 156, 157, 286, 401, 480], "tree": [7, 94, 136, 164, 298, 302, 309, 310, 311, 312, 313, 463, 464, 466, 475, 482], "tree_flatten": [264, 310, 313, 323, 455], "tree_map": [311, 323, 481], "tree_unflatten": [5, 455], "trembl": 5, "tri": 0, "triangl": [185, 186, 295], "triangular": [182, 183, 191], "tril": 0, "trilinear": 401, "triplet": 435, "triplet_loss": 323, "triu": 0, "true": [0, 1, 2, 4, 5, 16, 41, 42, 43, 44, 82, 94, 108, 109, 110, 111, 142, 144, 163, 171, 177, 182, 183, 188, 193, 207, 216, 237, 272, 303, 306, 309, 310, 311, 312, 316, 323, 328, 330, 331, 332, 333, 334, 335, 343, 344, 348, 349, 350, 352, 363, 364, 366, 373, 376, 382, 384, 387, 392, 400, 401, 422, 430, 457], "truncat": [146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 250], "truth": [4, 424, 434], "try": [2, 8], "tupl": [0, 30, 65, 68, 79, 95, 99, 100, 101, 103, 104, 125, 129, 131, 136, 138, 178, 185, 188, 189, 190, 231, 236, 256, 258, 277, 298, 301, 309, 310, 311, 312, 313, 325, 326, 327, 331, 332, 334, 335, 355, 356, 357, 366, 368, 389, 401, 457, 459, 460, 461, 462, 479, 482], "tutori": 2, "twice": 488, "two": [0, 2, 13, 14, 16, 24, 82, 85, 87, 88, 89, 90, 118, 128, 133, 147, 150, 156, 162, 163, 165, 166, 171, 180, 181, 182, 183, 184, 185, 186, 187, 189, 190, 191, 198, 203, 205, 221, 223, 227, 230, 284, 312, 326, 342, 349, 356, 415, 423, 480, 481, 482, 483, 488], "txt": 2, "type": [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 37, 68, 78, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 115, 116, 117, 118, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 133, 134, 135, 137, 138, 139, 140, 141, 143, 144, 145, 146, 
147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 209, 215, 216, 217, 220, 221, 222, 223, 224, 225, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 259, 260, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 282, 283, 284, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 308, 309, 312, 323, 371, 400, 402, 403, 404, 405, 406, 407, 408, 409, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 480, 483], "type_nam": 2, "type_to_nam": 2, "typenam": [0, 1, 2], "typic": [0, 145, 340, 455, 480, 484], "u": [1, 2, 182, 185, 186, 190, 352, 378, 475, 484], "u_": 456, "u_t": 456, "uint": [1, 2, 142], "uint16": [11, 316], "uint3": 1, "uint32": [11, 26, 27, 28, 29, 240, 316], "uint64": [11, 316], "uint8": [11, 316], "ultra": 5, "unabl": 8, "unam": 8, "unari": 480, "unchang": [144, 280, 387], "uncheck": 8, "uncompress": 264, "undefin": [0, 28, 112, 182, 183, 232, 244, 483], "under": [2, 188], "underli": [2, 300], "understand": [5, 403, 404], "unexpect": [2, 18], "unfreez": [323, 364], "unfrozen": 376, "unifi": 7, "uniform": [3, 323, 352, 366, 404, 406, 450, 477, 480, 482, 488], "uniformli": 251, "unintend": 0, "union": [18, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 79, 80, 81, 84, 85, 86, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 175, 176, 177, 185, 186, 209, 262, 281], "uniqu": [2, 477], "unique_ptr": 2, "unit": [329, 339, 341, 342, 343, 351, 385, 386, 388, 390, 403, 404, 405, 406, 410, 411, 412, 413, 414, 415, 419, 438, 
439, 440, 442], "unittest": 8, "univers": 188, "unless": [5, 16, 171, 188, 453], "unlik": [5, 16, 171, 337, 338, 372], "unnecessari": [2, 5], "unnorm": [240, 422, 424], "unscal": 457, "unsign": [163, 236, 237, 316], "unsignedinteg": 11, "unspecifi": [15, 17, 18, 26, 27, 28, 29, 95, 108, 109, 110, 111, 161, 202, 204, 206, 220, 228, 232, 234, 255, 272, 273, 279, 283, 286, 292, 293, 299, 304, 489], "unsqueez": 5, "unsupport": 193, "until": [2, 484, 486], "unus": 2, "up": [1, 2, 5, 112, 480], "upcast": 2, "updat": [0, 1, 2, 4, 5, 6, 38, 94, 306, 310, 312, 328, 359, 360, 366, 371, 372, 373, 378, 455, 457, 460, 462, 463, 464, 468, 469, 470, 471, 472, 473, 480, 481, 484], "update_modul": 323, "uplo": [185, 186], "upon": [5, 310, 311], "upper": [182, 183, 185, 186, 191, 236, 247, 250, 251, 409], "upsampl": 323, "us": [0, 3, 4, 5, 6, 7, 8, 18, 38, 83, 112, 116, 119, 121, 122, 125, 126, 127, 129, 142, 144, 158, 163, 179, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 203, 210, 211, 212, 215, 217, 236, 237, 255, 256, 257, 258, 285, 309, 312, 316, 318, 323, 326, 327, 337, 340, 341, 343, 349, 352, 356, 357, 359, 363, 370, 377, 379, 381, 382, 384, 387, 392, 400, 401, 405, 406, 413, 414, 423, 450, 453, 455, 456, 457, 459, 460, 461, 462, 463, 464, 477, 479, 480, 481, 482, 483, 486, 488], "usag": [112, 400, 480], "user": [2, 5, 323], "usual": [340, 381, 479, 484], "util": [1, 2, 5, 7, 8, 264, 323, 455, 481], "v": [5, 105, 145, 185, 323, 364, 485], "v_": [456, 458, 459, 460, 461, 467, 468], "v_t": [456, 458, 459, 460, 461, 467, 468], "val": [0, 30, 161], "valid": [6, 105, 158, 302, 309, 364, 376, 479], "valid_parameter_filt": 359, "valu": [0, 1, 4, 5, 11, 12, 16, 18, 26, 27, 50, 78, 82, 93, 140, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 171, 184, 188, 190, 192, 209, 217, 224, 231, 235, 239, 240, 241, 243, 244, 245, 247, 250, 251, 258, 262, 286, 287, 298, 302, 307, 309, 310, 311, 312, 316, 326, 327, 329, 336, 337, 338, 339, 345, 348, 352, 356, 357, 
363, 379, 380, 396, 398, 400, 402, 422, 423, 424, 425, 426, 427, 429, 430, 431, 432, 433, 434, 447, 453, 457, 460, 469, 470, 472, 473, 482], "value_and_grad": [6, 112, 323, 377, 453, 455, 466, 480, 482, 485, 486], "value_and_grad_fn": 484, "value_cach": 5, "value_dim": 379, "value_input_dim": 379, "value_output_dim": 379, "value_proj": 5, "valueerror": [188, 366, 482], "values_hat": 5, "van": 188, "var": [0, 328, 344, 348, 350, 425], "variabl": [8, 94, 119, 132, 164, 178, 298, 301, 302, 481], "varianc": [0, 279, 299, 328, 344, 425], "variant": [5, 434, 461], "variou": 188, "vector": [0, 2, 4, 7, 170, 178, 188, 286, 301, 302, 340, 424, 486], "verbos": [1, 142], "veri": [5, 379, 481, 484, 488], "verifi": [4, 8], "versa": 258, "version": [2, 8, 116, 198, 202, 236, 272, 302, 477, 482, 483], "versu": 480, "via": [8, 112, 463, 466, 481, 484, 485], "vice": 258, "video": 338, "view": [0, 3, 83, 485], "virtual": 2, "vjp": [2, 112, 486], "vmap": [2, 112, 482, 484, 486], "vmap_add": 482, "vocab_s": 5, "vocabulari": [340, 381], "void": [1, 2], "vt": 190, "w": [0, 1, 4, 99, 100, 103, 104, 116, 163, 185, 236, 237, 298, 311, 328, 331, 332, 334, 335, 337, 338, 352, 455, 468, 482], "w1": [5, 308], "w2": [5, 308], "w3": 5, "w_": [343, 349, 384, 456, 457, 458, 459, 460, 461, 462, 467, 468], "w_1": 236, "w_g": 236, "w_i": [116, 236], "w_in": 1, "w_q": 236, "w_star": 4, "w_stride": 1, "w_t": [456, 458, 459, 460, 461, 462, 467, 468], "wa": [5, 83, 125, 126, 481, 484], "wai": [2, 5, 8, 323, 401, 480, 481, 482, 483], "wait": [2, 5, 216], "walk": 5, "walkthrough": 2, "walsh": 167, "want": [1, 5, 481, 482, 488], "warm": [2, 480], "warmup": [471, 472], "warmup_init": 457, "watch": [5, 480], "wd": 462, "we": [0, 1, 2, 4, 5, 6, 116, 125, 126, 163, 236, 237, 323, 340, 381, 389, 460, 462, 477, 479, 480, 481, 482, 484, 488], "weight": [0, 4, 98, 99, 100, 101, 102, 103, 104, 141, 143, 310, 323, 366, 370, 381, 382, 422, 424, 453, 457, 460, 462, 464, 468, 482, 484], "weight_decai": [457, 460, 462, 
468], "weight_fil": 5, "weights_fp16": 484, "well": [5, 323, 364, 376, 379, 484], "wen": 5, "went": 5, "were": [5, 488], "wet": 5, "what": [2, 5, 310], "whatsoev": 5, "whc": 337, "when": [0, 1, 2, 5, 7, 8, 94, 101, 112, 127, 182, 183, 185, 186, 187, 188, 190, 191, 193, 330, 331, 332, 333, 334, 335, 401, 405, 406, 422, 428, 434, 453, 455, 471, 477, 480, 481, 488], "where": [0, 6, 140, 171, 183, 236, 298, 302, 328, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 341, 343, 344, 348, 349, 350, 352, 363, 380, 383, 384, 398, 405, 406, 411, 412, 414, 425, 431, 437, 440, 442, 447, 464, 481, 482, 483], "wherea": 482, "whether": [142, 163, 185, 186, 191, 237, 343, 349, 363, 379, 384, 422, 425, 431], "which": [0, 1, 2, 5, 6, 7, 8, 18, 37, 83, 94, 101, 118, 121, 122, 125, 126, 127, 136, 144, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 164, 172, 173, 174, 175, 176, 178, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 207, 218, 236, 240, 241, 255, 256, 258, 261, 262, 263, 264, 265, 277, 278, 286, 293, 298, 301, 302, 306, 326, 327, 337, 338, 341, 356, 357, 359, 363, 387, 422, 424, 427, 431, 434, 450, 463, 464, 477, 480, 481, 482, 483, 484, 488, 489], "while": [2, 3, 5, 8, 256, 387, 484, 485], "whistl": 2, "who": 5, "whose": [140, 306, 307], "why": 5, "wide": 484, "width": [326, 327, 328, 331, 332, 334, 335, 337, 338, 356, 357, 381, 382], "window": [8, 325, 326, 327, 355, 356, 357], "wipe": 8, "wire": 217, "wired_limit_mb": 217, "wise": [0, 2, 12, 13, 19, 20, 21, 22, 23, 24, 25, 87, 88, 89, 92, 106, 107, 128, 129, 133, 134, 135, 137, 139, 159, 160, 165, 166, 171, 179, 180, 181, 194, 195, 196, 197, 198, 199, 200, 201, 205, 221, 223, 225, 227, 233, 253, 254, 257, 260, 268, 269, 270, 271, 275, 276, 282, 288, 289, 329, 337, 338, 347, 358, 380, 391, 410, 417, 418, 420, 421, 436, 437, 439, 442, 443, 444, 445, 480], "wish": 8, "with_logit": 422, "within": [0, 3, 28, 171], "without": [1, 5, 7, 280, 379, 449, 479, 480, 481, 484, 485, 488], "wk": 5, "wl": 2, 
"wo": 5, "word": 0, "work": [2, 3, 5, 216, 480, 481, 482, 483, 484], "workhors": 323, "world": [313, 481], "worri": [1, 484], "would": [2, 5, 401, 481, 483, 484, 485, 488], "wq": 5, "wrap": [112, 323], "write": [0, 1, 2, 5, 323, 485], "written": 2, "wrt": 307, "wv": 5, "x": [0, 1, 2, 4, 5, 6, 38, 90, 112, 121, 122, 126, 127, 134, 139, 141, 142, 143, 163, 167, 168, 188, 237, 241, 246, 259, 264, 268, 296, 297, 303, 310, 312, 323, 325, 326, 327, 328, 329, 339, 341, 342, 344, 348, 350, 351, 352, 355, 356, 357, 358, 359, 380, 383, 385, 391, 392, 398, 401, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 434, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 453, 455, 462, 480, 481, 482, 483, 484, 485, 486, 488], "x1": 423, "x2": 423, "x86_64": 8, "x_1": [423, 431], "x_2": [423, 431], "x_cast": 2, "x_grad": 1, "x_i": [421, 443, 444], "x_j": [443, 444], "x_offset": 2, "x_ptr": 2, "x_shape": 1, "x_stride": 2, "x_t": [343, 349, 384], "x_view": 485, "xcode": 8, "xcodeproj": 3, "xcrun": 8, "xf": 349, "xg": 349, "xi": 349, "xn": 343, "xo": 349, "xor": 89, "xr": 343, "xy": [0, 207], "xz": 343, "x\u00b2": 485, "y": [0, 2, 4, 5, 6, 38, 112, 167, 303, 323, 328, 337, 344, 348, 350, 352, 383, 426, 431, 434, 455, 458, 480, 481, 482, 484, 485], "y_": [426, 430], "y_cast": 2, "y_hat": 323, "y_offset": 2, "y_ptr": 2, "y_stride": 2, "ye": 5, "year": 5, "yet": [5, 188, 323, 453, 464, 482, 483, 484, 486], "yield": [5, 6, 477], "you": [2, 3, 5, 6, 7, 8, 217, 323, 392, 400, 450, 477, 480, 481, 482, 483, 485, 487, 488], "your": [2, 5, 8, 453, 482, 484], "z": [2, 343, 480, 484], "z_t": 343, "zeiler": 456, "zero": [0, 140, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 184, 207, 214, 295, 296, 297, 305, 323, 325, 326, 327, 336, 337, 338, 366, 402, 403, 404, 405, 406, 407, 408, 409, 450, 455, 457, 483], "zero_grad": 482, "zeros_lik": 0, "zhang": 5, "zip": [5, 6], "zip_saf": 2}, "titles": ["Operations", "Custom Metal Kernels", "Custom Extensions in MLX", 
"Metal Debugger", "Linear Regression", "LLM inference", "Multi-Layer Perceptron", "MLX", "Build and Install", "mlx.core.Device", "mlx.core.Dtype", "mlx.core.DtypeCategory", "mlx.core.abs", "mlx.core.add", "mlx.core.addmm", "mlx.core.all", "mlx.core.allclose", "mlx.core.any", "mlx.core.arange", "mlx.core.arccos", "mlx.core.arccosh", "mlx.core.arcsin", "mlx.core.arcsinh", "mlx.core.arctan", "mlx.core.arctan2", "mlx.core.arctanh", "mlx.core.argmax", "mlx.core.argmin", "mlx.core.argpartition", "mlx.core.argsort", "mlx.core.array", "mlx.core.array.T", "mlx.core.array.abs", "mlx.core.array.all", "mlx.core.array.any", "mlx.core.array.argmax", "mlx.core.array.argmin", "mlx.core.array.astype", "mlx.core.array.at", "mlx.core.array.conj", "mlx.core.array.cos", "mlx.core.array.cummax", "mlx.core.array.cummin", "mlx.core.array.cumprod", "mlx.core.array.cumsum", "mlx.core.array.diag", "mlx.core.array.diagonal", "mlx.core.array.dtype", "mlx.core.array.exp", "mlx.core.array.flatten", "mlx.core.array.item", "mlx.core.array.itemsize", "mlx.core.array.log", "mlx.core.array.log10", "mlx.core.array.log1p", "mlx.core.array.log2", "mlx.core.array.logsumexp", "mlx.core.array.max", "mlx.core.array.mean", "mlx.core.array.min", "mlx.core.array.moveaxis", "mlx.core.array.nbytes", "mlx.core.array.ndim", "mlx.core.array.prod", "mlx.core.array.reciprocal", "mlx.core.array.reshape", "mlx.core.array.round", "mlx.core.array.rsqrt", "mlx.core.array.shape", "mlx.core.array.sin", "mlx.core.array.size", "mlx.core.array.split", "mlx.core.array.sqrt", "mlx.core.array.square", "mlx.core.array.squeeze", "mlx.core.array.std", "mlx.core.array.sum", "mlx.core.array.swapaxes", "mlx.core.array.tolist", "mlx.core.array.transpose", "mlx.core.array.var", "mlx.core.array.view", "mlx.core.array_equal", "mlx.core.as_strided", "mlx.core.atleast_1d", "mlx.core.atleast_2d", "mlx.core.atleast_3d", "mlx.core.bitwise_and", "mlx.core.bitwise_or", "mlx.core.bitwise_xor", "mlx.core.block_masked_mm", "mlx.core.broadcast_to", 
"mlx.core.ceil", "mlx.core.clip", "mlx.core.compile", "mlx.core.concatenate", "mlx.core.conj", "mlx.core.conjugate", "mlx.core.conv1d", "mlx.core.conv2d", "mlx.core.conv3d", "mlx.core.conv_general", "mlx.core.conv_transpose1d", "mlx.core.conv_transpose2d", "mlx.core.conv_transpose3d", "mlx.core.convolve", "mlx.core.cos", "mlx.core.cosh", "mlx.core.cummax", "mlx.core.cummin", "mlx.core.cumprod", "mlx.core.cumsum", "mlx.core.custom_function", "mlx.core.default_device", "mlx.core.default_stream", "mlx.core.degrees", "mlx.core.dequantize", "mlx.core.diag", "mlx.core.diagonal", "mlx.core.disable_compile", "mlx.core.distributed.Group", "mlx.core.distributed.all_gather", "mlx.core.distributed.all_sum", "mlx.core.distributed.init", "mlx.core.distributed.is_available", "mlx.core.distributed.recv", "mlx.core.distributed.recv_like", "mlx.core.distributed.send", "mlx.core.divide", "mlx.core.divmod", "mlx.core.einsum", "mlx.core.einsum_path", "mlx.core.enable_compile", "mlx.core.equal", "mlx.core.erf", "mlx.core.erfinv", "mlx.core.eval", "mlx.core.exp", "mlx.core.expand_dims", "mlx.core.expm1", "mlx.core.eye", "mlx.core.fast.layer_norm", "mlx.core.fast.metal_kernel", "mlx.core.fast.rms_norm", "mlx.core.fast.rope", "mlx.core.fast.scaled_dot_product_attention", "mlx.core.fft.fft", "mlx.core.fft.fft2", "mlx.core.fft.fftn", "mlx.core.fft.ifft", "mlx.core.fft.ifft2", "mlx.core.fft.ifftn", "mlx.core.fft.irfft", "mlx.core.fft.irfft2", "mlx.core.fft.irfftn", "mlx.core.fft.rfft", "mlx.core.fft.rfft2", "mlx.core.fft.rfftn", "mlx.core.flatten", "mlx.core.floor", "mlx.core.floor_divide", "mlx.core.full", "mlx.core.gather_mm", "mlx.core.gather_qmm", "mlx.core.grad", "mlx.core.greater", "mlx.core.greater_equal", "mlx.core.hadamard_transform", "mlx.core.identity", "mlx.core.imag", "mlx.core.inner", "mlx.core.isclose", "mlx.core.isfinite", "mlx.core.isinf", "mlx.core.isnan", "mlx.core.isneginf", "mlx.core.isposinf", "mlx.core.issubdtype", "mlx.core.jvp", "mlx.core.left_shift", "mlx.core.less", 
"mlx.core.less_equal", "mlx.core.linalg.cholesky", "mlx.core.linalg.cholesky_inv", "mlx.core.linalg.cross", "mlx.core.linalg.eigh", "mlx.core.linalg.eigvalsh", "mlx.core.linalg.inv", "mlx.core.linalg.norm", "mlx.core.linalg.qr", "mlx.core.linalg.svd", "mlx.core.linalg.tri_inv", "mlx.core.linspace", "mlx.core.load", "mlx.core.log", "mlx.core.log10", "mlx.core.log1p", "mlx.core.log2", "mlx.core.logaddexp", "mlx.core.logical_and", "mlx.core.logical_not", "mlx.core.logical_or", "mlx.core.logsumexp", "mlx.core.matmul", "mlx.core.max", "mlx.core.maximum", "mlx.core.mean", "mlx.core.meshgrid", "mlx.core.metal.clear_cache", "mlx.core.metal.device_info", "mlx.core.metal.get_active_memory", "mlx.core.metal.get_cache_memory", "mlx.core.metal.get_peak_memory", "mlx.core.metal.is_available", "mlx.core.metal.reset_peak_memory", "mlx.core.metal.set_cache_limit", "mlx.core.metal.set_memory_limit", "mlx.core.metal.set_wired_limit", "mlx.core.metal.start_capture", "mlx.core.metal.stop_capture", "mlx.core.min", "mlx.core.minimum", "mlx.core.moveaxis", "mlx.core.multiply", "mlx.core.nan_to_num", "mlx.core.negative", "mlx.core.new_stream", "mlx.core.not_equal", "mlx.core.ones", "mlx.core.ones_like", "mlx.core.outer", "mlx.core.pad", "mlx.core.partition", "mlx.core.power", "mlx.core.prod", "mlx.core.put_along_axis", "mlx.core.quantize", "mlx.core.quantized_matmul", "mlx.core.radians", "mlx.core.random.bernoulli", "mlx.core.random.categorical", "mlx.core.random.gumbel", "mlx.core.random.key", "mlx.core.random.laplace", "mlx.core.random.multivariate_normal", "mlx.core.random.normal", "mlx.core.random.permutation", "mlx.core.random.randint", "mlx.core.random.seed", "mlx.core.random.split", "mlx.core.random.truncated_normal", "mlx.core.random.uniform", "mlx.core.real", "mlx.core.reciprocal", "mlx.core.remainder", "mlx.core.repeat", "mlx.core.reshape", "mlx.core.right_shift", "mlx.core.roll", "mlx.core.round", "mlx.core.rsqrt", "mlx.core.save", "mlx.core.save_gguf", 
"mlx.core.save_safetensors", "mlx.core.savez", "mlx.core.savez_compressed", "mlx.core.set_default_device", "mlx.core.set_default_stream", "mlx.core.sigmoid", "mlx.core.sign", "mlx.core.sin", "mlx.core.sinh", "mlx.core.softmax", "mlx.core.sort", "mlx.core.split", "mlx.core.sqrt", "mlx.core.square", "mlx.core.squeeze", "mlx.core.stack", "mlx.core.std", "mlx.core.stop_gradient", "mlx.core.stream", "mlx.core.subtract", "mlx.core.sum", "mlx.core.swapaxes", "mlx.core.synchronize", "mlx.core.take", "mlx.core.take_along_axis", "mlx.core.tan", "mlx.core.tanh", "mlx.core.tensordot", "mlx.core.tile", "mlx.core.topk", "mlx.core.trace", "mlx.core.transpose", "mlx.core.tri", "mlx.core.tril", "mlx.core.triu", "mlx.core.value_and_grad", "mlx.core.var", "mlx.core.view", "mlx.core.vjp", "mlx.core.vmap", "mlx.core.where", "mlx.core.zeros", "mlx.core.zeros_like", "mlx.nn.quantize", "mlx.nn.value_and_grad", "mlx.optimizers.clip_grad_norm", "mlx.utils.tree_flatten", "mlx.utils.tree_map", "mlx.utils.tree_map_with_path", "mlx.utils.tree_reduce", "mlx.utils.tree_unflatten", "mlx.core.Stream", "Array", "Data Types", "Devices and Streams", "Distributed Communication", "Fast", "FFT", "Linear Algebra", "Metal", "Neural Networks", "mlx.nn.ALiBi", "mlx.nn.AvgPool1d", "mlx.nn.AvgPool2d", "mlx.nn.AvgPool3d", "mlx.nn.BatchNorm", "mlx.nn.CELU", "mlx.nn.Conv1d", "mlx.nn.Conv2d", "mlx.nn.Conv3d", "mlx.nn.ConvTranspose1d", "mlx.nn.ConvTranspose2d", "mlx.nn.ConvTranspose3d", "mlx.nn.Dropout", "mlx.nn.Dropout2d", "mlx.nn.Dropout3d", "mlx.nn.ELU", "mlx.nn.Embedding", "mlx.nn.GELU", "mlx.nn.GLU", "mlx.nn.GRU", "mlx.nn.GroupNorm", "mlx.nn.HardShrink", "mlx.nn.HardTanh", "mlx.nn.Hardswish", "mlx.nn.InstanceNorm", "mlx.nn.LSTM", "mlx.nn.LayerNorm", "mlx.nn.LeakyReLU", "mlx.nn.Linear", "mlx.nn.LogSigmoid", "mlx.nn.LogSoftmax", "mlx.nn.MaxPool1d", "mlx.nn.MaxPool2d", "mlx.nn.MaxPool3d", "mlx.nn.Mish", "mlx.nn.Module.apply", "mlx.nn.Module.apply_to_modules", "mlx.nn.Module.children", "mlx.nn.Module.eval", 
"mlx.nn.Module.filter_and_map", "mlx.nn.Module.freeze", "mlx.nn.Module.leaf_modules", "mlx.nn.Module.load_weights", "mlx.nn.Module.modules", "mlx.nn.Module.named_modules", "mlx.nn.Module.parameters", "mlx.nn.Module.save_weights", "mlx.nn.Module.set_dtype", "mlx.nn.Module.state", "mlx.nn.Module.train", "mlx.nn.Module.trainable_parameters", "mlx.nn.Module.training", "mlx.nn.Module.unfreeze", "mlx.nn.Module.update", "mlx.nn.Module.update_modules", "mlx.nn.MultiHeadAttention", "mlx.nn.PReLU", "mlx.nn.QuantizedEmbedding", "mlx.nn.QuantizedLinear", "mlx.nn.RMSNorm", "mlx.nn.RNN", "mlx.nn.ReLU", "mlx.nn.ReLU6", "mlx.nn.RoPE", "mlx.nn.SELU", "mlx.nn.Sequential", "mlx.nn.SiLU", "mlx.nn.Sigmoid", "mlx.nn.SinusoidalPositionalEncoding", "mlx.nn.Softmax", "mlx.nn.Softmin", "mlx.nn.Softplus", "mlx.nn.Softshrink", "mlx.nn.Softsign", "mlx.nn.Step", "mlx.nn.Tanh", "mlx.nn.Transformer", "mlx.nn.Upsample", "mlx.nn.init.constant", "mlx.nn.init.glorot_normal", "mlx.nn.init.glorot_uniform", "mlx.nn.init.he_normal", "mlx.nn.init.he_uniform", "mlx.nn.init.identity", "mlx.nn.init.normal", "mlx.nn.init.uniform", "mlx.nn.celu", "mlx.nn.elu", "mlx.nn.gelu", "mlx.nn.gelu_approx", "mlx.nn.gelu_fast_approx", "mlx.nn.glu", "mlx.nn.hard_shrink", "mlx.nn.hard_tanh", "mlx.nn.hardswish", "mlx.nn.leaky_relu", "mlx.nn.log_sigmoid", "mlx.nn.log_softmax", "mlx.nn.losses.binary_cross_entropy", "mlx.nn.losses.cosine_similarity_loss", "mlx.nn.losses.cross_entropy", "mlx.nn.losses.gaussian_nll_loss", "mlx.nn.losses.hinge_loss", "mlx.nn.losses.huber_loss", "mlx.nn.losses.kl_div_loss", "mlx.nn.losses.l1_loss", "mlx.nn.losses.log_cosh_loss", "mlx.nn.losses.margin_ranking_loss", "mlx.nn.losses.mse_loss", "mlx.nn.losses.nll_loss", "mlx.nn.losses.smooth_l1_loss", "mlx.nn.losses.triplet_loss", "mlx.nn.mish", "mlx.nn.prelu", "mlx.nn.relu", "mlx.nn.relu6", "mlx.nn.selu", "mlx.nn.sigmoid", "mlx.nn.silu", "mlx.nn.softmax", "mlx.nn.softmin", "mlx.nn.softplus", "mlx.nn.softshrink", "mlx.nn.step", "mlx.nn.tanh", 
"Functions", "Initializers", "Layers", "Loss Functions", "Module", "Operations", "Optimizers", "mlx.optimizers.AdaDelta", "mlx.optimizers.Adafactor", "mlx.optimizers.Adagrad", "mlx.optimizers.Adam", "mlx.optimizers.AdamW", "mlx.optimizers.Adamax", "mlx.optimizers.Lion", "mlx.optimizers.Optimizer.apply_gradients", "mlx.optimizers.Optimizer.init", "mlx.optimizers.Optimizer.state", "mlx.optimizers.Optimizer.update", "mlx.optimizers.RMSprop", "mlx.optimizers.SGD", "mlx.optimizers.cosine_decay", "mlx.optimizers.exponential_decay", "mlx.optimizers.join_schedules", "mlx.optimizers.linear_schedule", "mlx.optimizers.step_decay", "Common Optimizers", "Optimizer", "Schedulers", "Random", "Transforms", "Tree Utils", "Compilation", "Distributed Communication", "Function Transforms", "Indexing Arrays", "Lazy Evaluation", "Conversion to NumPy and Other Frameworks", "Quick Start Guide", "Saving and Loading Arrays", "Unified Memory", "Using Streams"], "titleterms": {"A": 488, "In": 483, "The": 323, "ab": [12, 32], "adadelta": 456, "adafactor": 457, "adagrad": 458, "adam": 459, "adamax": 461, "adamw": 460, "add": 13, "addmm": 14, "algebra": 321, "alibi": 324, "all": [5, 15, 33, 481], "all_gath": 121, "all_sum": 122, "allclos": 16, "ani": [17, 34], "api": [7, 8], "appli": 359, "apply_gradi": 463, "apply_to_modul": 360, "arang": 18, "arcco": 19, "arccosh": 20, "arcsin": 21, "arcsinh": 22, "arctan": 23, "arctan2": 24, "arctanh": 25, "argmax": [26, 35], "argmin": [27, 36], "argpartit": 28, "argsort": 29, "arrai": [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 315, 483, 487], "array_equ": 82, "as_strid": 83, "astyp": 37, "atleast_1d": 84, "atleast_2d": 85, "atleast_3d": 86, "attent": 5, "automat": 482, "avgpool1d": 325, "avgpool2d": 326, "avgpool3d": 327, "back": 2, "basic": [480, 486], "batchnorm": 328, "benchmark": 5, 
"bernoulli": 239, "binari": 8, "binary_cross_entropi": 422, "bind": 2, "bitwise_and": 87, "bitwise_or": 88, "bitwise_xor": 89, "block_masked_mm": 90, "broadcast_to": 91, "build": [2, 8], "c": [7, 8], "categor": 240, "ceil": 92, "celu": [329, 410], "children": 361, "choleski": 182, "cholesky_inv": 183, "class": 323, "clear_cach": 208, "clip": 93, "clip_grad_norm": 308, "cmake": 2, "co": [40, 106], "code": [2, 5], "common": 474, "commun": [318, 481], "compil": [94, 480], "complex": 1, "comput": 484, "concaten": 95, "conj": [39, 96], "conjug": 97, "constant": 402, "conv1d": [98, 330], "conv2d": [99, 331], "conv3d": [100, 332], "conv_gener": 101, "conv_transpose1d": 102, "conv_transpose2d": 103, "conv_transpose3d": 104, "convers": 485, "convert": 5, "convolv": 105, "convtranspose1d": 333, "convtranspose2d": 334, "convtranspose3d": 335, "core": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 
256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 314], "cosh": 107, "cosine_decai": 469, "cosine_similarity_loss": 423, "cpu": 2, "cross": 184, "cross_entropi": 424, "cummax": [41, 108], "cummin": [42, 109], "cumprod": [43, 110], "cumsum": [44, 111], "custom": [1, 2], "custom_funct": 112, "data": 316, "debug": 480, "debugg": 3, "default_devic": 113, "default_stream": 114, "degre": 115, "dequant": 116, "devic": [9, 317], "device_info": 209, "diag": [45, 117], "diagon": [46, 118], "differ": 483, "differenti": 482, "disable_compil": 119, "distribut": [120, 121, 122, 123, 124, 125, 126, 127, 318, 481], "divid": 128, "divmod": 129, "download": [2, 5], "dropout": 336, "dropout2d": 337, "dropout3d": 338, "dtype": [10, 47], "dtypecategori": 11, "eigh": 185, "eigvalsh": 186, "einsum": 130, "einsum_path": 131, "elu": [339, 411], "embed": 340, "enable_compil": 132, "encod": 5, "end": 2, "equal": 133, "erf": 134, "erfinv": 135, "eval": [136, 362], "evalu": 484, "exampl": [1, 2, 7, 480, 481, 488], "exp": [48, 137], "expand_dim": 138, "expm1": 139, "exponential_decai": 470, "extens": 2, "ey": 140, "fast": [141, 142, 143, 144, 145, 319], "fft": [146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 320], "fft2": 147, "fftn": 148, "filter_and_map": 363, "flatten": [49, 158], "floor": 159, "floor_divid": 160, "format": 487, "found": 8, "framework": 485, "freez": 364, "from": [8, 483], "full": [5, 161], "function": [449, 452, 480, 482, 486], "further": 7, "gather_mm": 162, "gather_qmm": 163, "gaussian_nll_loss": 425, "gelu": [341, 412], "gelu_approx": 413, "gelu_fast_approx": 414, "gener": 5, "get": 481, "get_active_memori": 210, "get_cache_memori": 211, "get_peak_memori": 212, "glorot_norm": 403, "glorot_uniform": 404, "glu": [342, 415], "gpu": 2, "grad": [164, 323], "graph": 
[480, 484, 486], "greater": 165, "greater_equ": 166, "grid": 1, "group": 120, "groupnorm": 344, "gru": 343, "guid": 486, "gumbel": 241, "hadamard_transform": 167, "hard_shrink": 416, "hard_tanh": 417, "hardshrink": 345, "hardswish": [347, 418], "hardtanh": 346, "he_norm": 405, "he_uniform": 406, "hinge_loss": 426, "host": 481, "huber_loss": 427, "ident": [168, 407], "ifft": 149, "ifft2": 150, "ifftn": 151, "imag": 169, "implement": [2, 5], "index": 483, "infer": 5, "init": [123, 402, 403, 404, 405, 406, 407, 408, 409, 464], "initi": 450, "inner": 170, "inspect": 323, "instal": [7, 8, 481], "instancenorm": 348, "introduc": 2, "inv": 187, "irfft": 152, "irfft2": 153, "irfftn": 154, "is_avail": [124, 213], "isclos": 171, "isfinit": 172, "isinf": 173, "isnan": 174, "isneginf": 175, "isposinf": 176, "issubdtyp": 177, "item": 50, "items": 51, "jax": 485, "join_schedul": 471, "jvp": 178, "kei": 242, "kernel": 1, "kl_div_loss": 428, "l1_loss": 429, "laplac": 243, "layer": [5, 6, 451], "layer_norm": 141, "layernorm": 350, "lazi": 484, "leaf_modul": 365, "leaky_relu": 419, "leakyrelu": 351, "left_shift": 179, "less": 180, "less_equ": 181, "linalg": [182, 183, 184, 185, 186, 187, 188, 189, 190, 191], "linear": [4, 321, 352], "linear_schedul": 472, "linspac": 192, "lion": 462, "llm": 5, "load": [5, 193, 455, 487], "load_weight": 366, "log": [52, 194], "log10": [53, 195], "log1p": [54, 196], "log2": [55, 197], "log_cosh_loss": 430, "log_sigmoid": 420, "log_softmax": 421, "logaddexp": 198, "logical_and": 199, "logical_not": 200, "logical_or": 201, "logsigmoid": 353, "logsoftmax": 354, "logsumexp": [56, 202], "loss": [422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 452], "lstm": 349, "margin_ranking_loss": 431, "matmul": 203, "max": [57, 204], "maximum": 205, "maxpool1d": 355, "maxpool2d": 356, "maxpool3d": 357, "mean": [58, 206], "memori": 488, "meshgrid": 207, "metal": [1, 3, 8, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 322], 
"metal_kernel": 142, "min": [59, 220], "minim": 8, "minimum": 221, "mish": [358, 436], "mlx": [2, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 
416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473], "model": 5, "modul": [323, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 453], "moveaxi": [60, 222], "mpi": 481, "mse_loss": 432, "multi": 6, "multiheadattent": 379, "multipli": 223, "multivariate_norm": 244, "named_modul": 368, "nan_to_num": 224, "nbyte": 61, "ndim": 62, "neg": 225, "network": 323, "neural": 323, "new_stream": 226, "nll_loss": 433, "nn": [306, 307, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448], "norm": 188, "normal": [245, 408], "not_equ": 227, "numpi": [483, 485], "ones": 228, "ones_lik": 229, "onli": 484, "oper": [0, 2, 454], "optim": [308, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475], "option": 8, "other": 485, "outer": 230, "pad": 231, "paramet": [323, 369], "partit": 232, "perceptron": 6, "permut": 246, "place": 483, "power": 233, "prelu": [380, 437], "primit": 2, "prod": [63, 234], "pure": 480, "put": 5, "put_along_axi": 235, "python": [2, 7, 8], "pytorch": 485, "qr": 189, "quantiz": [236, 306], "quantized_matmul": 237, "quantizedembed": 381, "quantizedlinear": 382, "quick": [323, 486], "radian": 238, 
"randint": 247, "random": [239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 477], "read": 7, "real": 252, "reciproc": [64, 253], "recv": 125, "recv_lik": 126, "reduc": 481, "refer": 7, "regress": 4, "relu": [385, 438], "relu6": [386, 439], "remaind": 254, "remot": 481, "repeat": 255, "requir": 8, "reset_peak_memori": 214, "reshap": [65, 256], "result": 2, "rfft": 155, "rfft2": 156, "rfftn": 157, "right_shift": 257, "rms_norm": 143, "rmsnorm": 383, "rmsprop": 467, "rnn": 384, "roll": 258, "rope": [144, 387], "round": [66, 259], "rsqrt": [67, 260], "sampl": 1, "save": [261, 455, 487], "save_gguf": 262, "save_safetensor": 263, "save_weight": 370, "savez": 264, "savez_compress": 265, "scaled_dot_product_attent": 145, "schedul": 476, "script": [2, 5], "seed": 248, "selu": [388, 440], "send": 127, "sequenti": 389, "serial": 487, "set": 481, "set_cache_limit": 215, "set_default_devic": 266, "set_default_stream": 267, "set_dtyp": 371, "set_memory_limit": 216, "set_wired_limit": 217, "setuptool": 2, "sgd": 468, "shape": [1, 68], "shell": 8, "sigmoid": [268, 391, 441], "sign": 269, "silu": [390, 442], "simpl": [1, 488], "sin": [69, 270], "sinh": 271, "sinusoidalpositionalencod": 392, "size": [8, 70], "smooth_l1_loss": 434, "softmax": [272, 393, 443], "softmin": [394, 444], "softplu": [395, 445], "softshrink": [396, 446], "softsign": 397, "sort": 273, "sourc": 8, "specifi": 489, "speedup": 480, "split": [71, 249, 274], "sqrt": [72, 275], "squar": [73, 276], "squeez": [74, 277], "stack": 278, "start": [323, 481, 486], "start_captur": 218, "state": [372, 465], "std": [75, 279], "step": [398, 447], "step_decai": 473, "stop_captur": 219, "stop_gradi": 280, "stream": [281, 314, 317, 489], "stride": 1, "subtract": 282, "sum": [76, 283], "support": 316, "svd": 190, "swapax": [77, 284], "synchron": 285, "t": 31, "take": 286, "take_along_axi": 287, "tan": 288, "tanh": [289, 399, 448], "tensordot": 290, "tensorflow": 485, "tile": 291, "togeth": 5, "tolist": 78, "topk": 
292, "trace": 293, "train": [373, 375, 480, 481], "trainable_paramet": 374, "transform": [2, 400, 478, 480, 482, 484, 486], "transpos": [79, 294], "tree": 479, "tree_flatten": 309, "tree_map": 310, "tree_map_with_path": 311, "tree_reduc": 312, "tree_unflatten": 313, "tri": 295, "tri_inv": 191, "tril": 296, "triplet_loss": 435, "triu": 297, "troubleshoot": 8, "truncated_norm": 250, "tune": 481, "type": 316, "unfreez": 376, "unifi": 488, "uniform": [251, 409], "up": 481, "updat": [323, 377, 466, 483], "update_modul": 378, "upsampl": 401, "us": [1, 2, 484, 489], "usag": [2, 7], "util": [309, 310, 311, 312, 313, 479], "valu": 323, "value_and_grad": [298, 307], "var": [80, 299], "vector": 482, "view": [81, 300], "vjp": [1, 301], "vmap": 302, "weight": 5, "what": 484, "when": 484, "where": 303, "why": 484, "workflow": 3, "x86": 8, "xcode": 3, "you": 484, "zero": 304, "zeros_lik": 305}})
\ No newline at end of file
diff --git a/docs/build/html/sort_8h_source.html b/docs/build/html/sort_8h_source.html
index 374bc71d8..8fbbbc12e 100644
--- a/docs/build/html/sort_8h_source.html
+++ b/docs/build/html/sort_8h_source.html
@@ -464,8 +464,8 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00352" name="l00352"></a><span class="lineno">  352</span>  <span class="keyword">using </span>val_t = <span class="keyword">typename</span> sort_kernel::val_t;</div>
 <div class="line"><a id="l00353" name="l00353"></a><span class="lineno">  353</span>  <span class="keyword">using </span>idx_t = <span class="keyword">typename</span> sort_kernel::idx_t;</div>
 <div class="line"><a id="l00354" name="l00354"></a><span class="lineno">  354</span> </div>
-<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>  <span class="keyword">auto</span> in_block_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.y, nc_shape, in_nc_strides, nc_dim);</div>
-<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>  <span class="keyword">auto</span> out_block_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.y, nc_shape, out_nc_strides, nc_dim);</div>
+<div class="line"><a id="l00355" name="l00355"></a><span class="lineno">  355</span>  <span class="keyword">auto</span> in_block_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.y, nc_shape, in_nc_strides, nc_dim);</div>
+<div class="line"><a id="l00356" name="l00356"></a><span class="lineno">  356</span>  <span class="keyword">auto</span> out_block_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.y, nc_shape, out_nc_strides, nc_dim);</div>
 <div class="line"><a id="l00357" name="l00357"></a><span class="lineno">  357</span>  inp += in_block_idx;</div>
 <div class="line"><a id="l00358" name="l00358"></a><span class="lineno">  358</span>  out += out_block_idx;</div>
 <div class="line"><a id="l00359" name="l00359"></a><span class="lineno">  359</span> </div>
@@ -616,7 +616,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00496" name="l00496"></a><span class="lineno">  496</span>      BLOCK_THREADS,</div>
 <div class="line"><a id="l00497" name="l00497"></a><span class="lineno">  497</span>      N_PER_THREAD&gt;;</div>
 <div class="line"><a id="l00498" name="l00498"></a><span class="lineno">  498</span> </div>
-<div class="line"><a id="l00499" name="l00499"></a><span class="lineno">  499</span>  <span class="keyword">auto</span> block_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.y, nc_shape, nc_strides, nc_dim);</div>
+<div class="line"><a id="l00499" name="l00499"></a><span class="lineno">  499</span>  <span class="keyword">auto</span> block_idx = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.y, nc_shape, nc_strides, nc_dim);</div>
 <div class="line"><a id="l00500" name="l00500"></a><span class="lineno">  500</span>  inp += block_idx;</div>
 <div class="line"><a id="l00501" name="l00501"></a><span class="lineno">  501</span>  out_vals += tid.y * size_sorted_axis;</div>
 <div class="line"><a id="l00502" name="l00502"></a><span class="lineno">  502</span>  out_idxs += tid.y * size_sorted_axis;</div>
@@ -818,11 +818,11 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00694" name="l00694"></a><span class="lineno">  694</span>  }</div>
 <div class="line"><a id="l00695" name="l00695"></a><span class="lineno">  695</span>}</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
 <div class="ttc" id="asort_8h_html_a0386011c52d03e60885a31e6fbd903dd"><div class="ttname"><a href="sort_8h.html#a0386011c52d03e60885a31e6fbd903dd">MLX_MTL_CONST</a></div><div class="ttdeci">#define MLX_MTL_CONST</div><div class="ttdef"><b>Definition</b> sort.h:3</div></div>
 <div class="ttc" id="asort_8h_html_a32cbe4163b8b0f5cb2c97b256119a4b2"><div class="ttname"><a href="sort_8h.html#a32cbe4163b8b0f5cb2c97b256119a4b2">mb_block_partition</a></div><div class="ttdeci">void mb_block_partition(device idx_t *block_partitions, const device val_t *dev_vals, const device idx_t *dev_idxs, const constant int &amp;size_sorted_axis, const constant int &amp;merge_tiles, const constant int &amp;n_blocks, uint3 tid, uint3 lid, uint3 tgp_dims)</div><div class="ttdef"><b>Definition</b> sort.h:525</div></div>
 <div class="ttc" id="asort_8h_html_a4ee3de195a6f9c33aa91ac52461808ad"><div class="ttname"><a href="sort_8h.html#a4ee3de195a6f9c33aa91ac52461808ad">block_sort_nc</a></div><div class="ttdeci">void block_sort_nc(const device T *inp, device U *out, const constant int &amp;size_sorted_axis, const constant int &amp;in_stride_sorted_axis, const constant int &amp;out_stride_sorted_axis, const constant int &amp;nc_dim, const constant int *nc_shape, const constant size_t *in_nc_strides, const constant size_t *out_nc_strides, uint3 tid, uint3 lid)</div><div class="ttdef"><b>Definition</b> sort.h:338</div></div>
@@ -848,7 +848,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="astruct_less_than_html"><div class="ttname"><a href="struct_less_than.html">LessThan</a></div><div class="ttdef"><b>Definition</b> sort.h:23</div></div>
 <div class="ttc" id="astruct_less_than_html_a2798eb377b411c93a4ed30cf35caade2"><div class="ttname"><a href="struct_less_than.html#a2798eb377b411c93a4ed30cf35caade2">LessThan::operator()</a></div><div class="ttdeci">METAL_FUNC bool operator()(T a, T b)</div><div class="ttdef"><b>Definition</b> sort.h:26</div></div>
 <div class="ttc" id="astruct_less_than_html_abf97a6b0163048e4ba96460939dbd3a3"><div class="ttname"><a href="struct_less_than.html#abf97a6b0163048e4ba96460939dbd3a3">LessThan::init</a></div><div class="ttdeci">static constexpr constant T init</div><div class="ttdef"><b>Definition</b> sort.h:24</div></div>
-<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:17</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
 <div class="ttc" id="astruct_thread_sort_html"><div class="ttname"><a href="struct_thread_sort.html">ThreadSort</a></div><div class="ttdef"><b>Definition</b> sort.h:37</div></div>
 <div class="ttc" id="astruct_thread_sort_html_ad9ab3e6b47f7e9b91c0f3b773596986d"><div class="ttname"><a href="struct_thread_sort.html#ad9ab3e6b47f7e9b91c0f3b773596986d">ThreadSort::sort</a></div><div class="ttdeci">static METAL_FUNC void sort(thread val_t(&amp;vals)[N_PER_THREAD], thread idx_t(&amp;idxs)[N_PER_THREAD])</div><div class="ttdef"><b>Definition</b> sort.h:38</div></div>
 </div><!-- fragment --></div><!-- contents -->
diff --git a/docs/build/html/steel__attention_8h.html b/docs/build/html/steel__attention_8h.html
new file mode 100644
index 000000000..3fc196451
--- /dev/null
+++ b/docs/build/html/steel__attention_8h.html
@@ -0,0 +1,226 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h File Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li><li class="navelem"><a class="el" href="dir_5aea41cce495e77a0857a0aecf063e33.html">kernels</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#nested-classes">Classes</a> &#124;
+<a href="#func-members">Functions</a> &#124;
+<a href="#var-members">Variables</a>  </div>
+  <div class="headertitle"><div class="title">steel_attention.h File Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><a href="steel__attention_8h_source.html">Go to the source code of this file.</a></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
+Classes</h2></td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_transform_scale.html">TransformScale&lt; T &gt;</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_max_op.html">MaxOp</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_sum_op.html">SumOp</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_mul_op.html">MulOp</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_sub_op.html">SubOp</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_exp_sub_op.html">ExpSubOp</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">struct &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="struct_div_op.html">DivOp</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
+Functions</h2></td></tr>
+<tr class="memitem:a5423b2a414f5e3c14166d568dedfbd33" id="r_a5423b2a414f5e3c14166d568dedfbd33"><td class="memTemplParams" colspan="2">template&lt;typename T , int BQ, int BK, int BD, int WM, int WN, typename AccumType  = float&gt; </td></tr>
+<tr class="memitem:a5423b2a414f5e3c14166d568dedfbd33"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a5423b2a414f5e3c14166d568dedfbd33">attention</a> (const device T *Q, const device T *K, const device T *V, device T *O, const constant <a class="el" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a> *params, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</td></tr>
+<tr class="separator:a5423b2a414f5e3c14166d568dedfbd33"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="var-members" name="var-members"></a>
+Variables</h2></td></tr>
+<tr class="memitem:a171fdea1b23976453f5dc5e6b3161982" id="r_a171fdea1b23976453f5dc5e6b3161982"><td class="memItemLeft" align="right" valign="top">constant bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a171fdea1b23976453f5dc5e6b3161982">align_Q</a></td></tr>
+<tr class="separator:a171fdea1b23976453f5dc5e6b3161982"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8bdd2cecf97aa5b033152b1d0f0d2416" id="r_a8bdd2cecf97aa5b033152b1d0f0d2416"><td class="memItemLeft" align="right" valign="top">constant bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8bdd2cecf97aa5b033152b1d0f0d2416">align_K</a></td></tr>
+<tr class="separator:a8bdd2cecf97aa5b033152b1d0f0d2416"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Function Documentation</h2>
+<a id="a5423b2a414f5e3c14166d568dedfbd33" name="a5423b2a414f5e3c14166d568dedfbd33"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a5423b2a414f5e3c14166d568dedfbd33">&#9670;&#160;</a></span>attention()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int BQ, int BK, int BD, int WM, int WN, typename AccumType  = float&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">void attention </td>
+          <td>(</td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>Q</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>K</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>V</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">device T *</td>          <td class="paramname"><span class="paramname"><em>O</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant <a class="el" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a> *</td>          <td class="paramname"><span class="paramname"><em>params</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_lane_id</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_group_id</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>tid</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>lid</em></span>&#160;)</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<h2 class="groupheader">Variable Documentation</h2>
+<a id="a8bdd2cecf97aa5b033152b1d0f0d2416" name="a8bdd2cecf97aa5b033152b1d0f0d2416"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a8bdd2cecf97aa5b033152b1d0f0d2416">&#9670;&#160;</a></span>align_K</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">constant bool align_K</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a171fdea1b23976453f5dc5e6b3161982" name="a171fdea1b23976453f5dc5e6b3161982"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a171fdea1b23976453f5dc5e6b3161982">&#9670;&#160;</a></span>align_Q</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">constant bool align_Q</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/steel__attention_8h_source.html b/docs/build/html/steel__attention_8h_source.html
new file mode 100644
index 000000000..d38622d02
--- /dev/null
+++ b/docs/build/html/steel__attention_8h_source.html
@@ -0,0 +1,508 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h Source File</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="dir_938ab0ecf10b8b860ff766c820f665fd.html">mlx</a></li><li class="navelem"><a class="el" href="dir_1d446c9bd3c99228254c9484e0bc5c06.html">backend</a></li><li class="navelem"><a class="el" href="dir_d0c977ea65824390717cdb7efc36c157.html">metal</a></li><li class="navelem"><a class="el" href="dir_70a37effa88bcbd6b791977fa1e64356.html">kernels</a></li><li class="navelem"><a class="el" href="dir_76215a6c54e2b67053e723fc2395583c.html">steel</a></li><li class="navelem"><a class="el" href="dir_e1756c7634b0c14aead026895ad71c6d.html">attn</a></li><li class="navelem"><a class="el" href="dir_5aea41cce495e77a0857a0aecf063e33.html">kernels</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">steel_attention.h</div></div>
+</div><!--header-->
+<div class="contents">
+<a href="steel__attention_8h.html">Go to the documentation of this file.</a><div class="fragment"><div class="line"><a id="l00001" name="l00001"></a><span class="lineno">    1</span><span class="comment">// Copyright © 2024 Apple Inc.</span></div>
+<div class="line"><a id="l00002" name="l00002"></a><span class="lineno">    2</span> </div>
+<div class="line"><a id="l00003" name="l00003"></a><span class="lineno">    3</span><span class="keyword">using namespace </span><a class="code hl_namespace" href="namespacemlx_1_1steel.html">mlx::steel</a>;</div>
+<div class="line"><a id="l00004" name="l00004"></a><span class="lineno">    4</span> </div>
+<div class="line"><a id="l00006" name="l00006"></a><span class="lineno">    6</span><span class="comment">// GEMM kernels</span></div>
+<div class="line"><a id="l00008" name="l00008"></a><span class="lineno">    8</span> </div>
+<div class="line"><a id="l00009" name="l00009"></a><span class="lineno"><a class="line" href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982">    9</a></span>constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982">align_Q</a> [[function_constant(200)]];</div>
+<div class="line"><a id="l00010" name="l00010"></a><span class="lineno"><a class="line" href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">   10</a></span>constant <span class="keywordtype">bool</span> <a class="code hl_variable" href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">align_K</a> [[function_constant(201)]];</div>
+<div class="line"><a id="l00011" name="l00011"></a><span class="lineno">   11</span> </div>
+<div class="line"><a id="l00012" name="l00012"></a><span class="lineno">   12</span><span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00013" data-start="{" data-end="};">
+<div class="line"><a id="l00013" name="l00013"></a><span class="lineno"><a class="line" href="struct_transform_scale.html">   13</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_transform_scale.html">TransformScale</a> {</div>
+<div class="line"><a id="l00014" name="l00014"></a><span class="lineno"><a class="line" href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">   14</a></span>  T <a class="code hl_variable" href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">scale</a>;</div>
+<div class="line"><a id="l00015" name="l00015"></a><span class="lineno"><a class="line" href="struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70">   15</a></span>  METAL_FUNC <a class="code hl_function" href="struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70">TransformScale</a>(T scale_) : <a class="code hl_variable" href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">scale</a>(scale_) {}</div>
+<div class="line"><a id="l00016" name="l00016"></a><span class="lineno">   16</span> </div>
+<div class="foldopen" id="foldopen00017" data-start="{" data-end="}">
+<div class="line"><a id="l00017" name="l00017"></a><span class="lineno"><a class="line" href="struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16">   17</a></span>  METAL_FUNC T <a class="code hl_function" href="struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16">apply</a>(T x)<span class="keyword"> const </span>{</div>
+<div class="line"><a id="l00018" name="l00018"></a><span class="lineno">   18</span>    <span class="keywordflow">return</span> <a class="code hl_variable" href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">scale</a> * x;</div>
+<div class="line"><a id="l00019" name="l00019"></a><span class="lineno">   19</span>  }</div>
+</div>
+<div class="line"><a id="l00020" name="l00020"></a><span class="lineno">   20</span>};</div>
+</div>
+<div class="line"><a id="l00021" name="l00021"></a><span class="lineno">   21</span> </div>
+<div class="foldopen" id="foldopen00022" data-start="{" data-end="};">
+<div class="line"><a id="l00022" name="l00022"></a><span class="lineno"><a class="line" href="struct_max_op.html">   22</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_max_op.html">MaxOp</a> {</div>
+<div class="line"><a id="l00023" name="l00023"></a><span class="lineno">   23</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00024" data-start="{" data-end="}">
+<div class="line"><a id="l00024" name="l00024"></a><span class="lineno"><a class="line" href="struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e">   24</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> T <a class="code hl_function" href="struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e">apply</a>(T x, T y) {</div>
+<div class="line"><a id="l00025" name="l00025"></a><span class="lineno">   25</span>    <span class="keywordflow">return</span> <a class="code hl_function" href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a>(x, y);</div>
+<div class="line"><a id="l00026" name="l00026"></a><span class="lineno">   26</span>  }</div>
+</div>
+<div class="line"><a id="l00027" name="l00027"></a><span class="lineno">   27</span>};</div>
+</div>
+<div class="line"><a id="l00028" name="l00028"></a><span class="lineno">   28</span> </div>
+<div class="foldopen" id="foldopen00029" data-start="{" data-end="};">
+<div class="line"><a id="l00029" name="l00029"></a><span class="lineno"><a class="line" href="struct_sum_op.html">   29</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_sum_op.html">SumOp</a> {</div>
+<div class="line"><a id="l00030" name="l00030"></a><span class="lineno">   30</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00031" data-start="{" data-end="}">
+<div class="line"><a id="l00031" name="l00031"></a><span class="lineno"><a class="line" href="struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d">   31</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> T <a class="code hl_function" href="struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d">apply</a>(T x, T y) {</div>
+<div class="line"><a id="l00032" name="l00032"></a><span class="lineno">   32</span>    <span class="keywordflow">return</span> x + y;</div>
+<div class="line"><a id="l00033" name="l00033"></a><span class="lineno">   33</span>  }</div>
+</div>
+<div class="line"><a id="l00034" name="l00034"></a><span class="lineno">   34</span>};</div>
+</div>
+<div class="line"><a id="l00035" name="l00035"></a><span class="lineno">   35</span> </div>
+<div class="foldopen" id="foldopen00036" data-start="{" data-end="};">
+<div class="line"><a id="l00036" name="l00036"></a><span class="lineno"><a class="line" href="struct_mul_op.html">   36</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_mul_op.html">MulOp</a> {</div>
+<div class="line"><a id="l00037" name="l00037"></a><span class="lineno">   37</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00038" data-start="{" data-end="}">
+<div class="line"><a id="l00038" name="l00038"></a><span class="lineno"><a class="line" href="struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756">   38</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> T <a class="code hl_function" href="struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756">apply</a>(T x, T y) {</div>
+<div class="line"><a id="l00039" name="l00039"></a><span class="lineno">   39</span>    <span class="keywordflow">return</span> x * y;</div>
+<div class="line"><a id="l00040" name="l00040"></a><span class="lineno">   40</span>  }</div>
+</div>
+<div class="line"><a id="l00041" name="l00041"></a><span class="lineno">   41</span>};</div>
+</div>
+<div class="line"><a id="l00042" name="l00042"></a><span class="lineno">   42</span> </div>
+<div class="foldopen" id="foldopen00043" data-start="{" data-end="};">
+<div class="line"><a id="l00043" name="l00043"></a><span class="lineno"><a class="line" href="struct_sub_op.html">   43</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_sub_op.html">SubOp</a> {</div>
+<div class="line"><a id="l00044" name="l00044"></a><span class="lineno">   44</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00045" data-start="{" data-end="}">
+<div class="line"><a id="l00045" name="l00045"></a><span class="lineno"><a class="line" href="struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143">   45</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> T <a class="code hl_function" href="struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143">apply</a>(T x, T y) {</div>
+<div class="line"><a id="l00046" name="l00046"></a><span class="lineno">   46</span>    <span class="keywordflow">return</span> x - y;</div>
+<div class="line"><a id="l00047" name="l00047"></a><span class="lineno">   47</span>  }</div>
+</div>
+<div class="line"><a id="l00048" name="l00048"></a><span class="lineno">   48</span>};</div>
+</div>
+<div class="line"><a id="l00049" name="l00049"></a><span class="lineno">   49</span> </div>
+<div class="foldopen" id="foldopen00050" data-start="{" data-end="};">
+<div class="line"><a id="l00050" name="l00050"></a><span class="lineno"><a class="line" href="struct_exp_sub_op.html">   50</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_exp_sub_op.html">ExpSubOp</a> {</div>
+<div class="line"><a id="l00051" name="l00051"></a><span class="lineno">   51</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00052" data-start="{" data-end="}">
+<div class="line"><a id="l00052" name="l00052"></a><span class="lineno"><a class="line" href="struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334">   52</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> T <a class="code hl_function" href="struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334">apply</a>(T x, T y) {</div>
+<div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>    <span class="keywordflow">return</span> fast::exp(x - y);</div>
+<div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span>  }</div>
+</div>
+<div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span>};</div>
+</div>
+<div class="line"><a id="l00056" name="l00056"></a><span class="lineno">   56</span> </div>
+<div class="foldopen" id="foldopen00057" data-start="{" data-end="};">
+<div class="line"><a id="l00057" name="l00057"></a><span class="lineno"><a class="line" href="struct_div_op.html">   57</a></span><span class="keyword">struct </span><a class="code hl_struct" href="struct_div_op.html">DivOp</a> {</div>
+<div class="line"><a id="l00058" name="l00058"></a><span class="lineno">   58</span>  <span class="keyword">template</span> &lt;<span class="keyword">typename</span> T&gt;</div>
+<div class="foldopen" id="foldopen00059" data-start="{" data-end="}">
+<div class="line"><a id="l00059" name="l00059"></a><span class="lineno"><a class="line" href="struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221">   59</a></span>  METAL_FUNC <span class="keyword">static</span> <span class="keyword">constexpr</span> T <a class="code hl_function" href="struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221">apply</a>(T x, T y) {</div>
+<div class="line"><a id="l00060" name="l00060"></a><span class="lineno">   60</span>    <span class="keywordflow">return</span> x / y;</div>
+<div class="line"><a id="l00061" name="l00061"></a><span class="lineno">   61</span>  }</div>
+</div>
+<div class="line"><a id="l00062" name="l00062"></a><span class="lineno">   62</span>};</div>
+</div>
+<div class="line"><a id="l00063" name="l00063"></a><span class="lineno">   63</span> </div>
+<div class="line"><a id="l00064" name="l00064"></a><span class="lineno">   64</span><span class="comment">// clang-format off</span></div>
+<div class="line"><a id="l00065" name="l00065"></a><span class="lineno">   65</span><span class="keyword">template</span> &lt;</div>
+<div class="line"><a id="l00066" name="l00066"></a><span class="lineno">   66</span>    <span class="keyword">typename</span> T,</div>
+<div class="line"><a id="l00067" name="l00067"></a><span class="lineno">   67</span>    <span class="keywordtype">int</span> BQ,</div>
+<div class="line"><a id="l00068" name="l00068"></a><span class="lineno">   68</span>    <span class="keywordtype">int</span> BK,</div>
+<div class="line"><a id="l00069" name="l00069"></a><span class="lineno">   69</span>    <span class="keywordtype">int</span> BD,</div>
+<div class="line"><a id="l00070" name="l00070"></a><span class="lineno">   70</span>    <span class="keywordtype">int</span> WM,</div>
+<div class="line"><a id="l00071" name="l00071"></a><span class="lineno">   71</span>    <span class="keywordtype">int</span> WN,</div>
+<div class="line"><a id="l00072" name="l00072"></a><span class="lineno">   72</span>    <span class="keyword">typename</span> AccumType = <span class="keywordtype">float</span>&gt;</div>
+<div class="foldopen" id="foldopen00073" data-start="{" data-end="}">
+<div class="line"><a id="l00073" name="l00073"></a><span class="lineno"><a class="line" href="steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33">   73</a></span>[[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] <span class="keywordtype">void</span> <a class="code hl_function" href="steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33">attention</a>(</div>
+<div class="line"><a id="l00074" name="l00074"></a><span class="lineno">   74</span>    <span class="keyword">const</span> device T* Q [[buffer(0)]],</div>
+<div class="line"><a id="l00075" name="l00075"></a><span class="lineno">   75</span>    <span class="keyword">const</span> device T* K [[buffer(1)]],</div>
+<div class="line"><a id="l00076" name="l00076"></a><span class="lineno">   76</span>    <span class="keyword">const</span> device T* V [[buffer(2)]],</div>
+<div class="line"><a id="l00077" name="l00077"></a><span class="lineno">   77</span>    device T* O [[buffer(3)]],</div>
+<div class="line"><a id="l00078" name="l00078"></a><span class="lineno">   78</span>    <span class="keyword">const</span> constant <a class="code hl_struct" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a>* params [[buffer(4)]],</div>
+<div class="line"><a id="l00079" name="l00079"></a><span class="lineno">   79</span>    uint simd_lane_id [[thread_index_in_simdgroup]],</div>
+<div class="line"><a id="l00080" name="l00080"></a><span class="lineno">   80</span>    uint simd_group_id [[simdgroup_index_in_threadgroup]],</div>
+<div class="line"><a id="l00081" name="l00081"></a><span class="lineno">   81</span>    uint3 tid [[threadgroup_position_in_grid]],</div>
+<div class="line"><a id="l00082" name="l00082"></a><span class="lineno">   82</span>    uint3 lid [[thread_position_in_threadgroup]]) { <span class="comment">// clang-format on</span></div>
+<div class="line"><a id="l00083" name="l00083"></a><span class="lineno">   83</span> </div>
+<div class="line"><a id="l00084" name="l00084"></a><span class="lineno">   84</span>  <span class="comment">// Pacifying compiler</span></div>
+<div class="line"><a id="l00085" name="l00085"></a><span class="lineno">   85</span>  (void)lid;</div>
+<div class="line"><a id="l00086" name="l00086"></a><span class="lineno">   86</span> </div>
+<div class="line"><a id="l00087" name="l00087"></a><span class="lineno">   87</span>  <span class="comment">// Move to correct block</span></div>
+<div class="line"><a id="l00088" name="l00088"></a><span class="lineno">   88</span>  ulong3 tidl{tid.x, tid.y, tid.z};</div>
+<div class="line"><a id="l00089" name="l00089"></a><span class="lineno">   89</span> </div>
+<div class="line"><a id="l00090" name="l00090"></a><span class="lineno">   90</span>  Q += tidl.z * params-&gt;Q_strides[0] + <span class="comment">// Batch</span></div>
+<div class="line"><a id="l00091" name="l00091"></a><span class="lineno">   91</span>      tidl.y * params-&gt;Q_strides[1] + <span class="comment">// Head</span></div>
+<div class="line"><a id="l00092" name="l00092"></a><span class="lineno">   92</span>      tidl.x * BQ * params-&gt;Q_strides[2]; <span class="comment">// Sequence</span></div>
+<div class="line"><a id="l00093" name="l00093"></a><span class="lineno">   93</span> </div>
+<div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span>  ulong kv_head_idx = int(tid.y) / params-&gt;gqa_factor;</div>
+<div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span>  K += tidl.z * params-&gt;K_strides[0] + <span class="comment">// Batch</span></div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno">   96</span>      kv_head_idx * params-&gt;K_strides[1]; <span class="comment">// Head</span></div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno">   97</span> </div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno">   98</span>  V += tidl.z * params-&gt;V_strides[0] + <span class="comment">// Batch</span></div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno">   99</span>      kv_head_idx * params-&gt;V_strides[1]; <span class="comment">// Head</span></div>
+<div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span> </div>
+<div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span>  O += tidl.z * params-&gt;O_strides[0] + <span class="comment">// Batch</span></div>
+<div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span>      tidl.y * params-&gt;O_strides[1] + <span class="comment">// Head</span></div>
+<div class="line"><a id="l00103" name="l00103"></a><span class="lineno">  103</span>      tidl.x * BQ * params-&gt;O_strides[2]; <span class="comment">// Sequence</span></div>
+<div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span> </div>
+<div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>  <span class="comment">// Prepare threadgroup memory</span></div>
+<div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> padQ = 0; <span class="comment">// 16 / sizeof(T);</span></div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> padK = 0; <span class="comment">// 16 / sizeof(T);</span></div>
+<div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> padV = 0; <span class="comment">// 16 / sizeof(T);</span></div>
+<div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span> </div>
+<div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> LDQ_tgp = BD + padQ;</div>
+<div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> LDK_tgp = BK + padK;</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> LDV_tgp = BD + padV;</div>
+<div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span> </div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span>  threadgroup T Qs[BQ * (BD + padQ)];</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>  threadgroup T Ks[(BK + padK) * BD];</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno">  116</span>  threadgroup T Vs[BK * (BD + padV)];</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno">  117</span> </div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno">  118</span>  <span class="comment">// Prepare block loaders</span></div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno">  119</span>  <span class="keyword">using </span>QBlockLoader = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a>&lt;</div>
+<div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span>      <span class="comment">/* typename T = */</span> T,</div>
+<div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>      <span class="comment">/* short BROWS = */</span> BQ,</div>
+<div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>      <span class="comment">/* short BCOLS = */</span> BD,</div>
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>      <span class="comment">/* short kDstStrRow = */</span> LDQ_tgp,</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>      <span class="comment">/* short kDstStrCol = */</span> 1,</div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span>      <span class="comment">/* short reduction_dim = */</span> 1,</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>      <span class="comment">/* short tgp_size = */</span> WM * WN * 32&gt;;</div>
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span> </div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>  <span class="comment">// K is loaded transposed</span></div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>  <span class="keyword">using </span>KBlockLoader = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a>&lt;</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>      <span class="comment">/* typename T = */</span> T,</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>      <span class="comment">/* short BROWS = */</span> BK,</div>
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>      <span class="comment">/* short BCOLS = */</span> BD,</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>      <span class="comment">/* short kDstStrRow = */</span> 1,</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>      <span class="comment">/* short kDstStrCol = */</span> LDK_tgp,</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>      <span class="comment">/* short reduction_dim = */</span> 0,</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>      <span class="comment">/* short tgp_size = */</span> WM * WN * 32&gt;;</div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span> </div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  <span class="keyword">using </span>VBlockLoader = <a class="code hl_struct" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a>&lt;</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>      <span class="comment">/* typename T = */</span> T,</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno">  140</span>      <span class="comment">/* short BROWS = */</span> BK,</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno">  141</span>      <span class="comment">/* short BCOLS = */</span> BD,</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno">  142</span>      <span class="comment">/* short kDstStrRow = */</span> LDV_tgp,</div>
+<div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span>      <span class="comment">/* short kDstStrCol = */</span> 1,</div>
+<div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span>      <span class="comment">/* short reduction_dim = */</span> 0,</div>
+<div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span>      <span class="comment">/* short tgp_size = */</span> WM * WN * 32&gt;;</div>
+<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span> </div>
+<div class="line"><a id="l00147" name="l00147"></a><span class="lineno">  147</span>  QBlockLoader loader_q(</div>
+<div class="line"><a id="l00148" name="l00148"></a><span class="lineno">  148</span>      Q, params-&gt;Q_strides[2], Qs, simd_group_id, simd_lane_id);</div>
+<div class="line"><a id="l00149" name="l00149"></a><span class="lineno">  149</span>  KBlockLoader loader_k(</div>
+<div class="line"><a id="l00150" name="l00150"></a><span class="lineno">  150</span>      K, params-&gt;K_strides[2], Ks, simd_group_id, simd_lane_id);</div>
+<div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span>  VBlockLoader loader_v(</div>
+<div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>      V, params-&gt;V_strides[2], Vs, simd_group_id, simd_lane_id);</div>
+<div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span> </div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>  <a class="code hl_struct" href="struct_transform_scale.html">TransformScale&lt;T&gt;</a> ts(<span class="keyword">static_cast&lt;</span>T<span class="keyword">&gt;</span>(params-&gt;scale));</div>
+<div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span> </div>
+<div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>  <span class="comment">// Prepare MMA tiles</span></div>
+<div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kFragSize = 8; <span class="comment">// MMAFrag size</span></div>
+<div class="line"><a id="l00158" name="l00158"></a><span class="lineno">  158</span>  <span class="keyword">using </span>MMAFrag_acc_t = <a class="code hl_struct" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag&lt;AccumType, kFragSize, kFragSize&gt;</a>;</div>
+<div class="line"><a id="l00159" name="l00159"></a><span class="lineno">  159</span> </div>
+<div class="line"><a id="l00160" name="l00160"></a><span class="lineno">  160</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> kNWarps = WM * WN;</div>
+<div class="line"><a id="l00161" name="l00161"></a><span class="lineno">  161</span>  <span class="keyword">static_assert</span>(</div>
+<div class="line"><a id="l00162" name="l00162"></a><span class="lineno">  162</span>      BQ &gt;= (kNWarps * kFragSize) &amp;&amp; BQ % (kNWarps * kFragSize) == 0,</div>
+<div class="line"><a id="l00163" name="l00163"></a><span class="lineno">  163</span>      <span class="stringliteral">&quot;Each simdgroup must host atleast 1 simdgroup matrix along Q sequence.&quot;</span>);</div>
+<div class="line"><a id="l00164" name="l00164"></a><span class="lineno">  164</span> </div>
+<div class="line"><a id="l00165" name="l00165"></a><span class="lineno">  165</span>  <span class="comment">// Q seq frags per warp</span></div>
+<div class="line"><a id="l00166" name="l00166"></a><span class="lineno">  166</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> TQ = BQ / (kNWarps * kFragSize);</div>
+<div class="line"><a id="l00167" name="l00167"></a><span class="lineno">  167</span>  <span class="comment">// KV sequence frags (all warps load the same frags)</span></div>
+<div class="line"><a id="l00168" name="l00168"></a><span class="lineno">  168</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> TK = BK / kFragSize;</div>
+<div class="line"><a id="l00169" name="l00169"></a><span class="lineno">  169</span>  <span class="comment">// HeadDim frags (all warps load the same frags)</span></div>
+<div class="line"><a id="l00170" name="l00170"></a><span class="lineno">  170</span>  <span class="keyword">constexpr</span> <span class="keywordtype">int</span> TD = BD / kFragSize;</div>
+<div class="line"><a id="l00171" name="l00171"></a><span class="lineno">  171</span> </div>
+<div class="line"><a id="l00172" name="l00172"></a><span class="lineno">  172</span>  <span class="keyword">static_assert</span>(TQ == 1, <span class="stringliteral">&quot;Check TQ&quot;</span>);</div>
+<div class="line"><a id="l00173" name="l00173"></a><span class="lineno">  173</span> </div>
+<div class="line"><a id="l00174" name="l00174"></a><span class="lineno">  174</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TQ, 1, MMAFrag_acc_t&gt;</a> Qtile;</div>
+<div class="line"><a id="l00175" name="l00175"></a><span class="lineno">  175</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, 1, TK, MMAFrag_acc_t&gt;</a> Ktile;</div>
+<div class="line"><a id="l00176" name="l00176"></a><span class="lineno">  176</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TQ, TK, MMAFrag_acc_t&gt;</a> Stile;</div>
+<div class="line"><a id="l00177" name="l00177"></a><span class="lineno">  177</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TK, TD, MMAFrag_acc_t&gt;</a> Vtile;</div>
+<div class="line"><a id="l00178" name="l00178"></a><span class="lineno">  178</span>  <a class="code hl_struct" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile&lt;AccumType, TQ, TD, MMAFrag_acc_t&gt;</a> Otile;</div>
+<div class="line"><a id="l00179" name="l00179"></a><span class="lineno">  179</span> </div>
+<div class="line"><a id="l00180" name="l00180"></a><span class="lineno">  180</span>  Otile.<a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">clear</a>();</div>
+<div class="line"><a id="l00181" name="l00181"></a><span class="lineno">  181</span> </div>
+<div class="line"><a id="l00182" name="l00182"></a><span class="lineno">  182</span>  <span class="comment">// Prepare mma tile offsets</span></div>
+<div class="line"><a id="l00183" name="l00183"></a><span class="lineno">  183</span>  <span class="keyword">const</span> short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);</div>
+<div class="line"><a id="l00184" name="l00184"></a><span class="lineno">  184</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> sm = simd_coord.y;</div>
+<div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> sn = simd_coord.x;</div>
+<div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> tm = kFragSize * TQ * simd_group_id;</div>
+<div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span> </div>
+<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> Qs_offset = (tm + sm) * LDQ_tgp + sn;</div>
+<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> Ks_offset = sm * LDK_tgp + sn;</div>
+<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>  <span class="keyword">const</span> <span class="keywordtype">short</span> Vs_offset = sm * LDV_tgp + sn;</div>
+<div class="line"><a id="l00191" name="l00191"></a><span class="lineno">  191</span> </div>
+<div class="line"><a id="l00192" name="l00192"></a><span class="lineno">  192</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> Qs_tile_stride = kFragSize;</div>
+<div class="line"><a id="l00193" name="l00193"></a><span class="lineno">  193</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> Ks_tile_stride = kFragSize * LDK_tgp;</div>
+<div class="line"><a id="l00194" name="l00194"></a><span class="lineno">  194</span> </div>
+<div class="line"><a id="l00195" name="l00195"></a><span class="lineno">  195</span>  threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00196" name="l00196"></a><span class="lineno">  196</span> </div>
+<div class="line"><a id="l00197" name="l00197"></a><span class="lineno">  197</span>  <span class="comment">// Load Q blocks and apply scale</span></div>
+<div class="line"><a id="l00198" name="l00198"></a><span class="lineno">  198</span>  <span class="keywordflow">if</span> (!<a class="code hl_variable" href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982">align_Q</a> &amp;&amp; <span class="keywordtype">int</span>(tid.x) == (params-&gt;NQ_aligned)) {</div>
+<div class="line"><a id="l00199" name="l00199"></a><span class="lineno">  199</span>    loader_q.load_safe(short2(BD, params-&gt;qL - params-&gt;NQ_aligned * BQ));</div>
+<div class="line"><a id="l00200" name="l00200"></a><span class="lineno">  200</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00201" name="l00201"></a><span class="lineno">  201</span>    loader_q.load_unsafe();</div>
+<div class="line"><a id="l00202" name="l00202"></a><span class="lineno">  202</span>  }</div>
+<div class="line"><a id="l00203" name="l00203"></a><span class="lineno">  203</span>  loader_q.apply_inplace_op(ts);</div>
+<div class="line"><a id="l00204" name="l00204"></a><span class="lineno">  204</span> </div>
+<div class="line"><a id="l00205" name="l00205"></a><span class="lineno">  205</span>  <span class="comment">// Init row reduction variables</span></div>
+<div class="line"><a id="l00206" name="l00206"></a><span class="lineno">  206</span>  <span class="keyword">constexpr</span> <span class="keywordtype">short</span> kRowsPT = <span class="keyword">decltype</span>(Stile)::kRowsPerThread;</div>
+<div class="line"><a id="l00207" name="l00207"></a><span class="lineno">  207</span> </div>
+<div class="line"><a id="l00208" name="l00208"></a><span class="lineno">  208</span>  AccumType max_score[kRowsPT];</div>
+<div class="line"><a id="l00209" name="l00209"></a><span class="lineno">  209</span>  AccumType sum_score[kRowsPT] = {0};</div>
+<div class="line"><a id="l00210" name="l00210"></a><span class="lineno">  210</span> </div>
+<div class="line"><a id="l00211" name="l00211"></a><span class="lineno">  211</span>  <span class="comment">// Init to -Inf</span></div>
+<div class="line"><a id="l00212" name="l00212"></a><span class="lineno">  212</span>  <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00213" name="l00213"></a><span class="lineno">  213</span>  <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kRowsPT; ++i) {</div>
+<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>    max_score[i] = <a class="code hl_struct" href="struct_limits.html">Limits&lt;AccumType&gt;::min</a>;</div>
+<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>  }</div>
+<div class="line"><a id="l00216" name="l00216"></a><span class="lineno">  216</span> </div>
+<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>  <span class="comment">// Loop over KV seq length</span></div>
+<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>  <span class="keywordflow">for</span> (<span class="keywordtype">int</span> kb = 0; kb &lt; params-&gt;NK; kb++) {</div>
+<div class="line"><a id="l00219" name="l00219"></a><span class="lineno">  219</span>    <span class="comment">// Load K block and apply scale</span></div>
+<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00221" name="l00221"></a><span class="lineno">  221</span>    <span class="keywordflow">if</span> (!<a class="code hl_variable" href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">align_K</a> &amp;&amp; kb == (params-&gt;NK_aligned)) {</div>
+<div class="line"><a id="l00222" name="l00222"></a><span class="lineno">  222</span>      loader_k.load_safe(short2(BD, params-&gt;kL - params-&gt;NK_aligned * BK));</div>
+<div class="line"><a id="l00223" name="l00223"></a><span class="lineno">  223</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00224" name="l00224"></a><span class="lineno">  224</span>      loader_k.load_unsafe();</div>
+<div class="line"><a id="l00225" name="l00225"></a><span class="lineno">  225</span>    }</div>
+<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span> </div>
+<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span> </div>
+<div class="line"><a id="l00229" name="l00229"></a><span class="lineno">  229</span>    <span class="comment">// Do S = Q @ K.T</span></div>
+<div class="line"><a id="l00230" name="l00230"></a><span class="lineno">  230</span>    Stile.<a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">clear</a>();</div>
+<div class="line"><a id="l00231" name="l00231"></a><span class="lineno">  231</span> </div>
+<div class="line"><a id="l00232" name="l00232"></a><span class="lineno">  232</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> dd = 0; dd &lt; TD; dd++) {</div>
+<div class="line"><a id="l00233" name="l00233"></a><span class="lineno">  233</span>      simdgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00234" name="l00234"></a><span class="lineno">  234</span> </div>
+<div class="line"><a id="l00235" name="l00235"></a><span class="lineno">  235</span>      Qtile.template load&lt;T, 1, 1, LDQ_tgp, 1&gt;(</div>
+<div class="line"><a id="l00236" name="l00236"></a><span class="lineno">  236</span>          &amp;Qs[Qs_offset + dd * Qs_tile_stride]);</div>
+<div class="line"><a id="l00237" name="l00237"></a><span class="lineno">  237</span>      Ktile.template load&lt;T, 1, 1, LDK_tgp, 1&gt;(</div>
+<div class="line"><a id="l00238" name="l00238"></a><span class="lineno">  238</span>          &amp;Ks[Ks_offset + dd * Ks_tile_stride]);</div>
+<div class="line"><a id="l00239" name="l00239"></a><span class="lineno">  239</span> </div>
+<div class="line"><a id="l00240" name="l00240"></a><span class="lineno">  240</span>      simdgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00241" name="l00241"></a><span class="lineno">  241</span> </div>
+<div class="line"><a id="l00242" name="l00242"></a><span class="lineno">  242</span>      <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(Stile, Qtile, Ktile, Stile);</div>
+<div class="line"><a id="l00243" name="l00243"></a><span class="lineno">  243</span>    }</div>
+<div class="line"><a id="l00244" name="l00244"></a><span class="lineno">  244</span> </div>
+<div class="line"><a id="l00245" name="l00245"></a><span class="lineno">  245</span>    <span class="comment">// Mask out of length sequence</span></div>
+<div class="line"><a id="l00246" name="l00246"></a><span class="lineno">  246</span>    <span class="keywordflow">if</span> (!<a class="code hl_variable" href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">align_K</a> &amp;&amp; kb == (params-&gt;NK_aligned)) {</div>
+<div class="line"><a id="l00247" name="l00247"></a><span class="lineno">  247</span>      <span class="keyword">using </span>stile_t = <span class="keyword">decltype</span>(Stile);</div>
+<div class="line"><a id="l00248" name="l00248"></a><span class="lineno">  248</span>      <span class="keyword">using </span>selem_t = <span class="keyword">typename</span> stile_t::elem_type;</div>
+<div class="line"><a id="l00249" name="l00249"></a><span class="lineno">  249</span>      <span class="keyword">constexpr</span> <span class="keyword">auto</span> neg_inf = -metal::numeric_limits&lt;selem_t&gt;::infinity();</div>
+<div class="line"><a id="l00250" name="l00250"></a><span class="lineno">  250</span>      <span class="keyword">const</span> <span class="keywordtype">short</span> lim = params-&gt;kL - params-&gt;NK_aligned * BK;</div>
+<div class="line"><a id="l00251" name="l00251"></a><span class="lineno">  251</span> </div>
+<div class="line"><a id="l00252" name="l00252"></a><span class="lineno">  252</span>      <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00253" name="l00253"></a><span class="lineno">  253</span>      <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; stile_t::kTileRows; i++) {</div>
+<div class="line"><a id="l00254" name="l00254"></a><span class="lineno">  254</span>        <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00255" name="l00255"></a><span class="lineno">  255</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; stile_t::kTileCols; j++) {</div>
+<div class="line"><a id="l00256" name="l00256"></a><span class="lineno">  256</span>          <span class="keywordtype">short</span> col_pos = sn + (j * stile_t::kFragCols);</div>
+<div class="line"><a id="l00257" name="l00257"></a><span class="lineno">  257</span>          <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00258" name="l00258"></a><span class="lineno">  258</span>          <span class="keywordflow">for</span> (<span class="keywordtype">short</span> jj = 0; jj &lt; stile_t::MMAFrag_t::kElemCols; jj++) {</div>
+<div class="line"><a id="l00259" name="l00259"></a><span class="lineno">  259</span>            <span class="keywordflow">if</span> ((col_pos + jj) &gt;= lim) {</div>
+<div class="line"><a id="l00260" name="l00260"></a><span class="lineno">  260</span>              Stile.<a class="code hl_function" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(i, j)[jj] = neg_inf;</div>
+<div class="line"><a id="l00261" name="l00261"></a><span class="lineno">  261</span>            }</div>
+<div class="line"><a id="l00262" name="l00262"></a><span class="lineno">  262</span>          }</div>
+<div class="line"><a id="l00263" name="l00263"></a><span class="lineno">  263</span>        }</div>
+<div class="line"><a id="l00264" name="l00264"></a><span class="lineno">  264</span>      }</div>
+<div class="line"><a id="l00265" name="l00265"></a><span class="lineno">  265</span>    }</div>
+<div class="line"><a id="l00266" name="l00266"></a><span class="lineno">  266</span> </div>
+<div class="line"><a id="l00267" name="l00267"></a><span class="lineno">  267</span>    simdgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00268" name="l00268"></a><span class="lineno">  268</span> </div>
+<div class="line"><a id="l00269" name="l00269"></a><span class="lineno">  269</span>    <span class="comment">// Load V blocks</span></div>
+<div class="line"><a id="l00270" name="l00270"></a><span class="lineno">  270</span>    <span class="keywordflow">if</span> (!<a class="code hl_variable" href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">align_K</a> &amp;&amp; kb == (params-&gt;NK_aligned)) {</div>
+<div class="line"><a id="l00271" name="l00271"></a><span class="lineno">  271</span>      loader_v.load_safe(short2(BD, params-&gt;kL - params-&gt;NK_aligned * BK));</div>
+<div class="line"><a id="l00272" name="l00272"></a><span class="lineno">  272</span>    } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00273" name="l00273"></a><span class="lineno">  273</span>      loader_v.load_unsafe();</div>
+<div class="line"><a id="l00274" name="l00274"></a><span class="lineno">  274</span>    }</div>
+<div class="line"><a id="l00275" name="l00275"></a><span class="lineno">  275</span> </div>
+<div class="line"><a id="l00276" name="l00276"></a><span class="lineno">  276</span>    <span class="comment">// Do softmax</span></div>
+<div class="line"><a id="l00277" name="l00277"></a><span class="lineno">  277</span> </div>
+<div class="line"><a id="l00278" name="l00278"></a><span class="lineno">  278</span>    <span class="comment">// Temp variables</span></div>
+<div class="line"><a id="l00279" name="l00279"></a><span class="lineno">  279</span>    AccumType new_max[kRowsPT];</div>
+<div class="line"><a id="l00280" name="l00280"></a><span class="lineno">  280</span>    AccumType factor[kRowsPT];</div>
+<div class="line"><a id="l00281" name="l00281"></a><span class="lineno">  281</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00282" name="l00282"></a><span class="lineno">  282</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kRowsPT; ++i) {</div>
+<div class="line"><a id="l00283" name="l00283"></a><span class="lineno">  283</span>      new_max[i] = max_score[i];</div>
+<div class="line"><a id="l00284" name="l00284"></a><span class="lineno">  284</span>    }</div>
+<div class="line"><a id="l00285" name="l00285"></a><span class="lineno">  285</span> </div>
+<div class="line"><a id="l00286" name="l00286"></a><span class="lineno">  286</span>    <span class="comment">// Row max</span></div>
+<div class="line"><a id="l00287" name="l00287"></a><span class="lineno">  287</span>    Stile.template row_reduce&lt;MaxOp&gt;(new_max);</div>
+<div class="line"><a id="l00288" name="l00288"></a><span class="lineno">  288</span> </div>
+<div class="line"><a id="l00289" name="l00289"></a><span class="lineno">  289</span>    <span class="comment">// exp(Si - rowmax(Si))</span></div>
+<div class="line"><a id="l00290" name="l00290"></a><span class="lineno">  290</span>    Stile.template row_bin_op&lt;ExpSubOp&gt;(new_max);</div>
+<div class="line"><a id="l00291" name="l00291"></a><span class="lineno">  291</span> </div>
+<div class="line"><a id="l00292" name="l00292"></a><span class="lineno">  292</span>    <span class="comment">// Factor exp(rowmax(Si) - rowmax(Si-1))</span></div>
+<div class="line"><a id="l00293" name="l00293"></a><span class="lineno">  293</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00294" name="l00294"></a><span class="lineno">  294</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kRowsPT; ++i) {</div>
+<div class="line"><a id="l00295" name="l00295"></a><span class="lineno">  295</span>      factor[i] = fast::exp(max_score[i] - new_max[i]);</div>
+<div class="line"><a id="l00296" name="l00296"></a><span class="lineno">  296</span>    }</div>
+<div class="line"><a id="l00297" name="l00297"></a><span class="lineno">  297</span> </div>
+<div class="line"><a id="l00298" name="l00298"></a><span class="lineno">  298</span>    <span class="comment">// Save max for next iteration</span></div>
+<div class="line"><a id="l00299" name="l00299"></a><span class="lineno">  299</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00300" name="l00300"></a><span class="lineno">  300</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kRowsPT; ++i) {</div>
+<div class="line"><a id="l00301" name="l00301"></a><span class="lineno">  301</span>      max_score[i] = new_max[i];</div>
+<div class="line"><a id="l00302" name="l00302"></a><span class="lineno">  302</span>    }</div>
+<div class="line"><a id="l00303" name="l00303"></a><span class="lineno">  303</span> </div>
+<div class="line"><a id="l00304" name="l00304"></a><span class="lineno">  304</span>    <span class="comment">// Row Sum</span></div>
+<div class="line"><a id="l00305" name="l00305"></a><span class="lineno">  305</span>    AccumType sum_score_tmp[kRowsPT] = {0};</div>
+<div class="line"><a id="l00306" name="l00306"></a><span class="lineno">  306</span>    Stile.template row_reduce&lt;SumOp&gt;(sum_score_tmp);</div>
+<div class="line"><a id="l00307" name="l00307"></a><span class="lineno">  307</span> </div>
+<div class="line"><a id="l00308" name="l00308"></a><span class="lineno">  308</span>    <span class="comment">// Update norm</span></div>
+<div class="line"><a id="l00309" name="l00309"></a><span class="lineno">  309</span>    <a class="code hl_define" href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div>
+<div class="line"><a id="l00310" name="l00310"></a><span class="lineno">  310</span>    <span class="keywordflow">for</span> (<span class="keywordtype">short</span> i = 0; i &lt; kRowsPT; ++i) {</div>
+<div class="line"><a id="l00311" name="l00311"></a><span class="lineno">  311</span>      sum_score[i] = sum_score[i] * factor[i] + sum_score_tmp[i];</div>
+<div class="line"><a id="l00312" name="l00312"></a><span class="lineno">  312</span>    }</div>
+<div class="line"><a id="l00313" name="l00313"></a><span class="lineno">  313</span> </div>
+<div class="line"><a id="l00314" name="l00314"></a><span class="lineno">  314</span>    <span class="comment">// Update O</span></div>
+<div class="line"><a id="l00315" name="l00315"></a><span class="lineno">  315</span>    Otile.template row_bin_op&lt;MulOp&gt;(factor);</div>
+<div class="line"><a id="l00316" name="l00316"></a><span class="lineno">  316</span> </div>
+<div class="line"><a id="l00317" name="l00317"></a><span class="lineno">  317</span>    <span class="comment">// Load V into registers</span></div>
+<div class="line"><a id="l00318" name="l00318"></a><span class="lineno">  318</span>    threadgroup_barrier(mem_flags::mem_threadgroup);</div>
+<div class="line"><a id="l00319" name="l00319"></a><span class="lineno">  319</span>    Vtile.template load&lt;T, 1, 1, LDV_tgp, 1&gt;(&amp;Vs[Vs_offset]);</div>
+<div class="line"><a id="l00320" name="l00320"></a><span class="lineno">  320</span> </div>
+<div class="line"><a id="l00321" name="l00321"></a><span class="lineno">  321</span>    simdgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00322" name="l00322"></a><span class="lineno">  322</span> </div>
+<div class="line"><a id="l00323" name="l00323"></a><span class="lineno">  323</span>    <span class="comment">// Do O = S @ V</span></div>
+<div class="line"><a id="l00324" name="l00324"></a><span class="lineno">  324</span>    <a class="code hl_function" href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">tile_matmad</a>(Otile, Stile, Vtile, Otile);</div>
+<div class="line"><a id="l00325" name="l00325"></a><span class="lineno">  325</span> </div>
+<div class="line"><a id="l00326" name="l00326"></a><span class="lineno">  326</span>    <span class="comment">// Prepare for next iteration</span></div>
+<div class="line"><a id="l00327" name="l00327"></a><span class="lineno">  327</span>    loader_k.next();</div>
+<div class="line"><a id="l00328" name="l00328"></a><span class="lineno">  328</span>    loader_v.next();</div>
+<div class="line"><a id="l00329" name="l00329"></a><span class="lineno">  329</span>  }</div>
+<div class="line"><a id="l00330" name="l00330"></a><span class="lineno">  330</span> </div>
+<div class="line"><a id="l00331" name="l00331"></a><span class="lineno">  331</span>  <span class="comment">// Normalize output</span></div>
+<div class="line"><a id="l00332" name="l00332"></a><span class="lineno">  332</span>  Otile.template row_bin_op&lt;DivOp&gt;(sum_score);</div>
+<div class="line"><a id="l00333" name="l00333"></a><span class="lineno">  333</span>  threadgroup_barrier(mem_flags::mem_none);</div>
+<div class="line"><a id="l00334" name="l00334"></a><span class="lineno">  334</span> </div>
+<div class="line"><a id="l00335" name="l00335"></a><span class="lineno">  335</span>  <span class="comment">// Store results</span></div>
+<div class="line"><a id="l00336" name="l00336"></a><span class="lineno">  336</span>  O += (tm + sm) * params-&gt;O_strides[2] + sn;</div>
+<div class="line"><a id="l00337" name="l00337"></a><span class="lineno">  337</span> </div>
+<div class="line"><a id="l00338" name="l00338"></a><span class="lineno">  338</span>  <span class="keywordflow">if</span> (!<a class="code hl_variable" href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982">align_Q</a> &amp;&amp; <span class="keywordtype">int</span>(tid.x) == (params-&gt;NQ_aligned)) {</div>
+<div class="line"><a id="l00339" name="l00339"></a><span class="lineno">  339</span>    <span class="keyword">auto</span> dst_tile_dims =</div>
+<div class="line"><a id="l00340" name="l00340"></a><span class="lineno">  340</span>        short2(BD - sn, params-&gt;qL - BQ * params-&gt;NQ_aligned - (tm + sm));</div>
+<div class="line"><a id="l00341" name="l00341"></a><span class="lineno">  341</span> </div>
+<div class="line"><a id="l00342" name="l00342"></a><span class="lineno">  342</span>    <span class="keywordflow">if</span> (dst_tile_dims.x &lt;= 0 || dst_tile_dims.y &lt;= 0)</div>
+<div class="line"><a id="l00343" name="l00343"></a><span class="lineno">  343</span>      <span class="keywordflow">return</span>;</div>
+<div class="line"><a id="l00344" name="l00344"></a><span class="lineno">  344</span> </div>
+<div class="line"><a id="l00345" name="l00345"></a><span class="lineno">  345</span>    Otile.template store_safe&lt;T, 1, 1&gt;(O, params-&gt;O_strides[2], dst_tile_dims);</div>
+<div class="line"><a id="l00346" name="l00346"></a><span class="lineno">  346</span>  } <span class="keywordflow">else</span> {</div>
+<div class="line"><a id="l00347" name="l00347"></a><span class="lineno">  347</span>    Otile.template store&lt;T, 1, 1&gt;(O, params-&gt;O_strides[2]);</div>
+<div class="line"><a id="l00348" name="l00348"></a><span class="lineno">  348</span>  }</div>
+<div class="line"><a id="l00349" name="l00349"></a><span class="lineno">  349</span>}</div>
+</div>
+<div class="ttc" id="anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b"><div class="ttname"><a href="namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b">metal::max</a></div><div class="ttdeci">METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> attn.h:19</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html_ad583e6038efc119542410f43b603d4ad"><div class="ttname"><a href="namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad">mlx::steel::tile_matmad</a></div><div class="ttdeci">METAL_FUNC void tile_matmad(thread MMATile&lt; T, M, N &gt; &amp;D, thread MMATile&lt; U, M, K &gt; &amp;A, thread MMATile&lt; U, K, N &gt; &amp;B, thread MMATile&lt; T, M, N &gt; &amp;C)</div><div class="ttdef"><b>Definition</b> mma.h:413</div></div>
+<div class="ttc" id="asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6"><div class="ttname"><a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define STEEL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> defines.h:4</div></div>
+<div class="ttc" id="asteel__attention_8h_html_a171fdea1b23976453f5dc5e6b3161982"><div class="ttname"><a href="steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982">align_Q</a></div><div class="ttdeci">constant bool align_Q</div><div class="ttdef"><b>Definition</b> steel_attention.h:9</div></div>
+<div class="ttc" id="asteel__attention_8h_html_a5423b2a414f5e3c14166d568dedfbd33"><div class="ttname"><a href="steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33">attention</a></div><div class="ttdeci">void attention(const device T *Q, const device T *K, const device T *V, device T *O, const constant AttnParams *params, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</div><div class="ttdef"><b>Definition</b> steel_attention.h:73</div></div>
+<div class="ttc" id="asteel__attention_8h_html_a8bdd2cecf97aa5b033152b1d0f0d2416"><div class="ttname"><a href="steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416">align_K</a></div><div class="ttdeci">constant bool align_K</div><div class="ttdef"><b>Definition</b> steel_attention.h:10</div></div>
+<div class="ttc" id="astruct_div_op_html"><div class="ttname"><a href="struct_div_op.html">DivOp</a></div><div class="ttdef"><b>Definition</b> steel_attention.h:57</div></div>
+<div class="ttc" id="astruct_div_op_html_a1b8df47142dc6ea15315ce3a310f9221"><div class="ttname"><a href="struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221">DivOp::apply</a></div><div class="ttdeci">static METAL_FUNC constexpr T apply(T x, T y)</div><div class="ttdef"><b>Definition</b> steel_attention.h:59</div></div>
+<div class="ttc" id="astruct_exp_sub_op_html"><div class="ttname"><a href="struct_exp_sub_op.html">ExpSubOp</a></div><div class="ttdef"><b>Definition</b> steel_attention.h:50</div></div>
+<div class="ttc" id="astruct_exp_sub_op_html_a00e457a01cb38f959dfd789455e7f334"><div class="ttname"><a href="struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334">ExpSubOp::apply</a></div><div class="ttdeci">static METAL_FUNC constexpr T apply(T x, T y)</div><div class="ttdef"><b>Definition</b> steel_attention.h:52</div></div>
+<div class="ttc" id="astruct_limits_html"><div class="ttname"><a href="struct_limits.html">Limits</a></div><div class="ttdef"><b>Definition</b> utils.h:23</div></div>
+<div class="ttc" id="astruct_max_op_html"><div class="ttname"><a href="struct_max_op.html">MaxOp</a></div><div class="ttdef"><b>Definition</b> steel_attention.h:22</div></div>
+<div class="ttc" id="astruct_max_op_html_ab3d3c3040017a13c170e7bdd1ffac46e"><div class="ttname"><a href="struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e">MaxOp::apply</a></div><div class="ttdeci">static METAL_FUNC constexpr T apply(T x, T y)</div><div class="ttdef"><b>Definition</b> steel_attention.h:24</div></div>
+<div class="ttc" id="astruct_mul_op_html"><div class="ttname"><a href="struct_mul_op.html">MulOp</a></div><div class="ttdef"><b>Definition</b> steel_attention.h:36</div></div>
+<div class="ttc" id="astruct_mul_op_html_a1b93d804653d92fc7e46747de9e9c756"><div class="ttname"><a href="struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756">MulOp::apply</a></div><div class="ttdeci">static METAL_FUNC constexpr T apply(T x, T y)</div><div class="ttdef"><b>Definition</b> steel_attention.h:38</div></div>
+<div class="ttc" id="astruct_sub_op_html"><div class="ttname"><a href="struct_sub_op.html">SubOp</a></div><div class="ttdef"><b>Definition</b> steel_attention.h:43</div></div>
+<div class="ttc" id="astruct_sub_op_html_ad211f879a212ed0e98136217ca8e4143"><div class="ttname"><a href="struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143">SubOp::apply</a></div><div class="ttdeci">static METAL_FUNC constexpr T apply(T x, T y)</div><div class="ttdef"><b>Definition</b> steel_attention.h:45</div></div>
+<div class="ttc" id="astruct_sum_op_html"><div class="ttname"><a href="struct_sum_op.html">SumOp</a></div><div class="ttdef"><b>Definition</b> steel_attention.h:29</div></div>
+<div class="ttc" id="astruct_sum_op_html_aa9563a98cbbe1b1921ade0c63ab38b4d"><div class="ttname"><a href="struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d">SumOp::apply</a></div><div class="ttdeci">static METAL_FUNC constexpr T apply(T x, T y)</div><div class="ttdef"><b>Definition</b> steel_attention.h:31</div></div>
+<div class="ttc" id="astruct_transform_scale_html"><div class="ttname"><a href="struct_transform_scale.html">TransformScale</a></div><div class="ttdef"><b>Definition</b> steel_attention.h:13</div></div>
+<div class="ttc" id="astruct_transform_scale_html_a9dd329422e5b8da43486cdce17132e16"><div class="ttname"><a href="struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16">TransformScale::apply</a></div><div class="ttdeci">METAL_FUNC T apply(T x) const</div><div class="ttdef"><b>Definition</b> steel_attention.h:17</div></div>
+<div class="ttc" id="astruct_transform_scale_html_aa56b8e107acf16fdf77006625c2b8bc6"><div class="ttname"><a href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">TransformScale::scale</a></div><div class="ttdeci">T scale</div><div class="ttdef"><b>Definition</b> steel_attention.h:14</div></div>
+<div class="ttc" id="astruct_transform_scale_html_ae109cf7c963ba13df96977e7563f7b70"><div class="ttname"><a href="struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70">TransformScale::TransformScale</a></div><div class="ttdeci">METAL_FUNC TransformScale(T scale_)</div><div class="ttdef"><b>Definition</b> steel_attention.h:15</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_attn_params_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></div><div class="ttdef"><b>Definition</b> params.h:12</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_base_m_m_a_frag_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a></div><div class="ttdef"><b>Definition</b> mma.h:23</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_block_loader_t_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a></div><div class="ttdef"><b>Definition</b> loader.h:153</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a></div><div class="ttdef"><b>Definition</b> mma.h:178</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_a1a6b1446e8c8da46885bbaa8e8fdc7e4"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">mlx::steel::MMATile::frag_at</a></div><div class="ttdeci">METAL_FUNC constexpr thread frag_type &amp; frag_at(const short i, const short j)</div><div class="ttdef"><b>Definition</b> mma.h:256</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_m_m_a_tile_html_aa97a98e423827a889c13a92217626ec7"><div class="ttname"><a href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">mlx::steel::MMATile::clear</a></div><div class="ttdeci">METAL_FUNC constexpr void clear()</div><div class="ttdef"><b>Definition</b> mma.h:249</div></div>
+</div><!-- fragment --></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/steel__gemm__fused_8h_source.html b/docs/build/html/steel__gemm__fused_8h_source.html
index d758f32b0..685b0c304 100644
--- a/docs/build/html/steel__gemm__fused_8h_source.html
+++ b/docs/build/html/steel__gemm__fused_8h_source.html
@@ -196,7 +196,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00104" name="l00104"></a><span class="lineno">  104</span>      <span class="keywordflow">if</span> (<a class="code hl_variable" href="steel__gemm__fused_8h.html#a3fe4e4382bda8a419557a5e6f77bc084">use_out_source</a>) {</div>
 <div class="line"><a id="l00105" name="l00105"></a><span class="lineno">  105</span>        <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* indx_C_bstrides =</div>
 <div class="line"><a id="l00106" name="l00106"></a><span class="lineno">  106</span>            indx_B_bstrides + params-&gt;batch_ndim;</div>
-<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>        <span class="keyword">auto</span> indx_offset_C = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
+<div class="line"><a id="l00107" name="l00107"></a><span class="lineno">  107</span>        <span class="keyword">auto</span> indx_offset_C = <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(</div>
 <div class="line"><a id="l00108" name="l00108"></a><span class="lineno">  108</span>            tid.z, batch_shape, indx_C_bstrides, params-&gt;batch_ndim);</div>
 <div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span>        indx_C = C_indices[indx_offset_C];</div>
 <div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>      }</div>
@@ -213,18 +213,18 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>    <span class="keywordtype">int</span> batch_ndim_A = operand_batch_ndim.x;</div>
 <div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape_A = operand_shape;</div>
 <div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* batch_strides_A = operand_strides;</div>
-<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    A += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(indx_A, batch_shape_A, batch_strides_A, batch_ndim_A);</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span>    A += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(indx_A, batch_shape_A, batch_strides_A, batch_ndim_A);</div>
 <div class="line"><a id="l00125" name="l00125"></a><span class="lineno">  125</span> </div>
 <div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span>    <span class="keywordtype">int</span> batch_ndim_B = operand_batch_ndim.y;</div>
 <div class="line"><a id="l00127" name="l00127"></a><span class="lineno">  127</span>    <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape_B = batch_shape_A + batch_ndim_A;</div>
 <div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* batch_strides_B = batch_strides_A + batch_ndim_A;</div>
-<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    B += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(indx_B, batch_shape_B, batch_strides_B, batch_ndim_B);</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>    B += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(indx_B, batch_shape_B, batch_strides_B, batch_ndim_B);</div>
 <div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span> </div>
 <div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span>    <span class="keywordflow">if</span> (<a class="code hl_variable" href="steel__gemm__fused_8h.html#a3fe4e4382bda8a419557a5e6f77bc084">use_out_source</a>) {</div>
 <div class="line"><a id="l00132" name="l00132"></a><span class="lineno">  132</span>      <span class="keywordtype">int</span> batch_ndim_C = operand_batch_ndim.z;</div>
 <div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>      <span class="keyword">const</span> constant <span class="keywordtype">int</span>* batch_shape_C = batch_shape_B + batch_ndim_B;</div>
 <div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>      <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* batch_strides_C = batch_strides_B + batch_ndim_B;</div>
-<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>      C += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(indx_C, batch_shape_C, batch_strides_C, batch_ndim_C);</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>      C += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(indx_C, batch_shape_C, batch_strides_C, batch_ndim_C);</div>
 <div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span>    }</div>
 <div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span> </div>
 <div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span>  }</div>
@@ -243,7 +243,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00151" name="l00151"></a><span class="lineno">  151</span> </div>
 <div class="line"><a id="l00152" name="l00152"></a><span class="lineno">  152</span>      <span class="keywordflow">if</span> (<a class="code hl_variable" href="steel__gemm__fused_8h.html#a3fe4e4382bda8a419557a5e6f77bc084">use_out_source</a>) {</div>
 <div class="line"><a id="l00153" name="l00153"></a><span class="lineno">  153</span>        <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* C_bstrides = B_bstrides + params-&gt;batch_ndim;</div>
-<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>        C += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, C_bstrides, params-&gt;batch_ndim);</div>
+<div class="line"><a id="l00154" name="l00154"></a><span class="lineno">  154</span>        C += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, C_bstrides, params-&gt;batch_ndim);</div>
 <div class="line"><a id="l00155" name="l00155"></a><span class="lineno">  155</span>      }</div>
 <div class="line"><a id="l00156" name="l00156"></a><span class="lineno">  156</span>    } <span class="keywordflow">else</span> {</div>
 <div class="line"><a id="l00157" name="l00157"></a><span class="lineno">  157</span>      A += params-&gt;batch_stride_a * tid.z;</div>
@@ -505,8 +505,8 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00415" name="l00415"></a><span class="lineno">  415</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html_aaf4974425147d6f26d031691e321637f"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:7</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
-<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> loader_channel_l.h:14</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> attn.h:19</div></div>
 <div class="ttc" id="asteel__gemm__fused_8h_html_a3fe4e4382bda8a419557a5e6f77bc084"><div class="ttname"><a href="steel__gemm__fused_8h.html#a3fe4e4382bda8a419557a5e6f77bc084">use_out_source</a></div><div class="ttdeci">constant bool use_out_source</div><div class="ttdef"><b>Definition</b> steel_gemm_fused.h:11</div></div>
 <div class="ttc" id="asteel__gemm__fused_8h_html_a55af226dc74b0026b7d4b865142a6d21"><div class="ttname"><a href="steel__gemm__fused_8h.html#a55af226dc74b0026b7d4b865142a6d21">align_M</a></div><div class="ttdeci">constant bool align_M</div><div class="ttdef"><b>Definition</b> steel_gemm_fused.h:14</div></div>
 <div class="ttc" id="asteel__gemm__fused_8h_html_a60efac3ac3b7cd64d096bbae38a3ac69"><div class="ttname"><a href="steel__gemm__fused_8h.html#a60efac3ac3b7cd64d096bbae38a3ac69">do_gather</a></div><div class="ttdeci">constant bool do_gather</div><div class="ttdef"><b>Definition</b> steel_gemm_fused.h:18</div></div>
diff --git a/docs/build/html/steel__gemm__masked_8h_source.html b/docs/build/html/steel__gemm__masked_8h_source.html
index 2b8e60031..a609873d2 100644
--- a/docs/build/html/steel__gemm__masked_8h_source.html
+++ b/docs/build/html/steel__gemm__masked_8h_source.html
@@ -211,7 +211,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00109" name="l00109"></a><span class="lineno">  109</span> </div>
 <div class="line"><a id="l00110" name="l00110"></a><span class="lineno">  110</span>  <span class="keywordflow">if</span> (params-&gt;batch_ndim &gt; 1) {</div>
 <div class="line"><a id="l00111" name="l00111"></a><span class="lineno">  111</span>    <span class="keywordflow">if</span> (has_output_mask) {</div>
-<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>      out_mask += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(</div>
+<div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span>      out_mask += <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(</div>
 <div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span>          tid.z, batch_shape, mask_batch_strides, params-&gt;batch_ndim);</div>
 <div class="line"><a id="l00114" name="l00114"></a><span class="lineno">  114</span> </div>
 <div class="line"><a id="l00115" name="l00115"></a><span class="lineno">  115</span>      mask_batch_strides += params-&gt;batch_ndim;</div>
@@ -580,7 +580,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00479" name="l00479"></a><span class="lineno">  479</span>    <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* mask_batch_strides =</div>
 <div class="line"><a id="l00480" name="l00480"></a><span class="lineno">  480</span>        batch_strides + 2 * params-&gt;batch_ndim;</div>
 <div class="line"><a id="l00481" name="l00481"></a><span class="lineno">  481</span>    out_mask +=</div>
-<div class="line"><a id="l00482" name="l00482"></a><span class="lineno">  482</span>        <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a>(tid.z, batch_shape, mask_batch_strides, params-&gt;batch_ndim);</div>
+<div class="line"><a id="l00482" name="l00482"></a><span class="lineno">  482</span>        <a class="code hl_function" href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a>(tid.z, batch_shape, mask_batch_strides, params-&gt;batch_ndim);</div>
 <div class="line"><a id="l00483" name="l00483"></a><span class="lineno">  483</span> </div>
 <div class="line"><a id="l00484" name="l00484"></a><span class="lineno">  484</span>    <span class="keywordflow">if</span> (has_operand_mask) {</div>
 <div class="line"><a id="l00485" name="l00485"></a><span class="lineno">  485</span>      <span class="keyword">const</span> constant <span class="keywordtype">size_t</span>* mask_strides_lhs =</div>
@@ -818,10 +818,10 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00719" name="l00719"></a><span class="lineno">  719</span>}</div>
 </div>
 <div class="ttc" id="abackend_2metal_2kernels_2steel_2utils_8h_html_aaf4974425147d6f26d031691e321637f"><div class="ttname"><a href="backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f">elem_to_loc_broadcast</a></div><div class="ttdeci">METAL_FUNC ulong2 elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:7</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a8fd0c8fc6058e650fc99bca8b6acd7d1"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a8fd0c8fc6058e650fc99bca8b6acd7d1">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:87</div></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
-<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> loader_channel_l.h:14</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5">elem_to_loc</a></div><div class="ttdeci">METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)</div><div class="ttdef"><b>Definition</b> utils.h:93</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
+<div class="ttc" id="anamespacemetal_html_a6653b28c9473087141eddce39878d4d3"><div class="ttname"><a href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">metal::min</a></div><div class="ttdeci">METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> attn.h:19</div></div>
 <div class="ttc" id="asteel_2defines_8h_html"><div class="ttname"><a href="steel_2defines_8h.html">defines.h</a></div></div>
 <div class="ttc" id="asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6"><div class="ttname"><a href="steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6">STEEL_PRAGMA_UNROLL</a></div><div class="ttdeci">#define STEEL_PRAGMA_UNROLL</div><div class="ttdef"><b>Definition</b> defines.h:4</div></div>
 <div class="ttc" id="asteel__gemm__masked_8h_html_af805e998b2046ee30c2b4be813e3af97"><div class="ttname"><a href="steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97">block_masked_gemm</a></div><div class="ttdeci">void block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device out_mask_t *out_mask, const device op_mask_t *lhs_mask, const device op_mask_t *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</div><div class="ttdef"><b>Definition</b> steel_gemm_masked.h:53</div></div>
diff --git a/docs/build/html/steel__gemm__splitk_8h_source.html b/docs/build/html/steel__gemm__splitk_8h_source.html
index 1a0d5936a..c5844e588 100644
--- a/docs/build/html/steel__gemm__splitk_8h_source.html
+++ b/docs/build/html/steel__gemm__splitk_8h_source.html
@@ -321,7 +321,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>}</div>
 </div>
 <div class="ttc" id="acommon_2binary_8h_html_a70228731d29946574b238d21fb4b360c"><div class="ttname"><a href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a></div><div class="ttdeci">Op op</div><div class="ttdef"><b>Definition</b> binary.h:129</div></div>
-<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> loader_channel_l.h:14</div></div>
+<div class="ttc" id="anamespacemlx_1_1steel_html"><div class="ttname"><a href="namespacemlx_1_1steel.html">mlx::steel</a></div><div class="ttdef"><b>Definition</b> attn.h:19</div></div>
 <div class="ttc" id="asteel__gemm__splitk_8h_html_a3be6e095a0a026d3ecf57a3e67f76188"><div class="ttname"><a href="steel__gemm__splitk_8h.html#a3be6e095a0a026d3ecf57a3e67f76188">gemm_splitk</a></div><div class="ttdeci">void gemm_splitk(const device T *A, const device T *B, device U *C, const constant GEMMSpiltKParams *params, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</div><div class="ttdef"><b>Definition</b> steel_gemm_splitk.h:21</div></div>
 <div class="ttc" id="asteel__gemm__splitk_8h_html_abeb921bf1dc7941125188ddd390b0907"><div class="ttname"><a href="steel__gemm__splitk_8h.html#abeb921bf1dc7941125188ddd390b0907">gemm_splitk_accum</a></div><div class="ttdeci">void gemm_splitk_accum(const device AccT *C_split, device OutT *D, const constant int &amp;k_partitions, const constant int &amp;partition_stride, const constant int &amp;ldd, uint2 gid)</div><div class="ttdef"><b>Definition</b> steel_gemm_splitk.h:172</div></div>
 <div class="ttc" id="asteel__gemm__splitk_8h_html_acc33fdfaaf3eb3a0629b3d52c7043dc1"><div class="ttname"><a href="steel__gemm__splitk_8h.html#acc33fdfaaf3eb3a0629b3d52c7043dc1">gemm_splitk_accum_axpby</a></div><div class="ttdeci">void gemm_splitk_accum_axpby(const device AccT *C_split, device OutT *D, const constant int &amp;k_partitions, const constant int &amp;partition_stride, const constant int &amp;ldd, const device OutT *C, const constant int &amp;ldc, const constant int &amp;fdc, const constant float &amp;alpha, const constant float &amp;beta, uint2 gid)</div><div class="ttdef"><b>Definition</b> steel_gemm_splitk.h:199</div></div>
diff --git a/docs/build/html/struct___m_l_x___b_float16.html b/docs/build/html/struct___m_l_x___b_float16.html
index 3a69ede2a..334409152 100644
--- a/docs/build/html/struct___m_l_x___b_float16.html
+++ b/docs/build/html/struct___m_l_x___b_float16.html
@@ -94,7 +94,7 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">bf16.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">bf16.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
 Classes</h2></td></tr>
@@ -536,7 +536,7 @@ template&lt;typename T , typename  = typename enable_if&lt;can_convert_from_bflo
 </div>
 </div>
 <hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">bf16.h</a></li>
+<li>mlx/backend/metal/kernels/metal_3_0/<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">bf16.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html b/docs/build/html/struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html
index e340d914d..06872a354 100644
--- a/docs/build/html/struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html
+++ b/docs/build/html/struct___m_l_x___b_float16_1_1bits__to__bfloat__struct.html
@@ -92,9 +92,9 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">bf16.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">bf16.h</a>&gt;</code></p>
 <hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">bf16.h</a></li>
+<li>mlx/backend/metal/kernels/metal_3_0/<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">bf16.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structlooped__elem__to__loc_3_010_00_01offset__t_01_4-members.html b/docs/build/html/struct_div_op-members.html
similarity index 70%
rename from docs/build/html/structlooped__elem__to__loc_3_010_00_01offset__t_01_4-members.html
rename to docs/build/html/struct_div_op-members.html
index f77f8a500..d1a8235f3 100644
--- a/docs/build/html/structlooped__elem__to__loc_3_010_00_01offset__t_01_4-members.html
+++ b/docs/build/html/struct_div_op-members.html
@@ -84,15 +84,13 @@ $(function(){ initResizable(false); });
 </div><!-- top -->
 <div id="doc-content">
 <div class="header">
-  <div class="headertitle"><div class="title">looped_elem_to_loc&lt; 0, offset_t &gt; Member List</div></div>
+  <div class="headertitle"><div class="title">DivOp Member List</div></div>
 </div><!--header-->
 <div class="contents">
 
-<p>This is the complete list of members for <a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 0, offset_t &gt;</a>, including all inherited members.</p>
+<p>This is the complete list of members for <a class="el" href="struct_div_op.html">DivOp</a>, including all inherited members.</p>
 <table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a8c7aaffda0ca500d9f9566e5e74217a2">location</a>(offset_t idx, const constant int *shape, const constant size_t *strides, int ndim)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 0, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#aa1e9e1009c16befb9a730835836436e0">next</a>(const constant int *, const constant size_t *)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 0, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html#a1064cdfdcef779b5628ce5357a6fe4f0">next</a>(int, const constant int *, const constant size_t *)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 0, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221">apply</a>(T x, T y)</td><td class="entry"><a class="el" href="struct_div_op.html">DivOp</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/struct_div_op.html b/docs/build/html/struct_div_op.html
new file mode 100644
index 000000000..3dfdaeb19
--- /dev/null
+++ b/docs/build/html/struct_div_op.html
@@ -0,0 +1,144 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: DivOp Struct Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-static-methods">Static Public Member Functions</a> &#124;
+<a href="struct_div_op-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">DivOp Struct Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
+Static Public Member Functions</h2></td></tr>
+<tr class="memitem:a1b8df47142dc6ea15315ce3a310f9221" id="r_a1b8df47142dc6ea15315ce3a310f9221"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a1b8df47142dc6ea15315ce3a310f9221"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr T&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1b8df47142dc6ea15315ce3a310f9221">apply</a> (T x, T y)</td></tr>
+<tr class="separator:a1b8df47142dc6ea15315ce3a310f9221"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="a1b8df47142dc6ea15315ce3a310f9221" name="a1b8df47142dc6ea15315ce3a310f9221"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1b8df47142dc6ea15315ce3a310f9221">&#9670;&#160;</a></span>apply()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr T DivOp::apply </td>
+          <td>(</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/kernels/<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structlooped__elem__to__loc_3_011_00_01offset__t_01_4-members.html b/docs/build/html/struct_exp_sub_op-members.html
similarity index 65%
rename from docs/build/html/structlooped__elem__to__loc_3_011_00_01offset__t_01_4-members.html
rename to docs/build/html/struct_exp_sub_op-members.html
index 159b40e1b..9ef4a277c 100644
--- a/docs/build/html/structlooped__elem__to__loc_3_011_00_01offset__t_01_4-members.html
+++ b/docs/build/html/struct_exp_sub_op-members.html
@@ -84,16 +84,13 @@ $(function(){ initResizable(false); });
 </div><!-- top -->
 <div id="doc-content">
 <div class="header">
-  <div class="headertitle"><div class="title">looped_elem_to_loc&lt; 1, offset_t &gt; Member List</div></div>
+  <div class="headertitle"><div class="title">ExpSubOp Member List</div></div>
 </div><!--header-->
 <div class="contents">
 
-<p>This is the complete list of members for <a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 1, offset_t &gt;</a>, including all inherited members.</p>
+<p>This is the complete list of members for <a class="el" href="struct_exp_sub_op.html">ExpSubOp</a>, including all inherited members.</p>
 <table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a368d2a2204cee5055386954acd5ccb90">location</a>(offset_t, const constant int *, const constant size_t *, int)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 1, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a96cf2987c04210c9197e5237e425c4b4">next</a>(const constant int *, const constant size_t *strides)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 1, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#af2984b35f7d7300d4812e7872b3c8851">next</a>(int n, const constant int *, const constant size_t *strides)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 1, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html#a7aebc0b0656e3a55d0dbca27a57d600e">offset</a></td><td class="entry"><a class="el" href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html">looped_elem_to_loc&lt; 1, offset_t &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334">apply</a>(T x, T y)</td><td class="entry"><a class="el" href="struct_exp_sub_op.html">ExpSubOp</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/struct_exp_sub_op.html b/docs/build/html/struct_exp_sub_op.html
new file mode 100644
index 000000000..03089675a
--- /dev/null
+++ b/docs/build/html/struct_exp_sub_op.html
@@ -0,0 +1,144 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: ExpSubOp Struct Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-static-methods">Static Public Member Functions</a> &#124;
+<a href="struct_exp_sub_op-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">ExpSubOp Struct Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
+Static Public Member Functions</h2></td></tr>
+<tr class="memitem:a00e457a01cb38f959dfd789455e7f334" id="r_a00e457a01cb38f959dfd789455e7f334"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a00e457a01cb38f959dfd789455e7f334"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr T&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a00e457a01cb38f959dfd789455e7f334">apply</a> (T x, T y)</td></tr>
+<tr class="separator:a00e457a01cb38f959dfd789455e7f334"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="a00e457a01cb38f959dfd789455e7f334" name="a00e457a01cb38f959dfd789455e7f334"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a00e457a01cb38f959dfd789455e7f334">&#9670;&#160;</a></span>apply()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr T ExpSubOp::apply </td>
+          <td>(</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/kernels/<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_floor_divide.html b/docs/build/html/struct_floor_divide.html
index 3f033a78e..4ac05b199 100644
--- a/docs/build/html/struct_floor_divide.html
+++ b/docs/build/html/struct_floor_divide.html
@@ -105,7 +105,7 @@ Public Member Functions</h2></td></tr>
 <tr class="memitem:ae91719a15f7e643d552129f476089c6a"><td class="memTemplItemLeft" align="right" valign="top">half&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ae91719a15f7e643d552129f476089c6a">operator()</a> (half x, half y)</td></tr>
 <tr class="separator:ae91719a15f7e643d552129f476089c6a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a4aa9f858626583e02bd79f747229bbca" id="r_a4aa9f858626583e02bd79f747229bbca"><td class="memItemLeft" align="right" valign="top">template&lt;&gt; </td></tr>
-<tr class="memitem:a4aa9f858626583e02bd79f747229bbca"><td class="memTemplItemLeft" align="right" valign="top"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a4aa9f858626583e02bd79f747229bbca">operator()</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a4aa9f858626583e02bd79f747229bbca"><td class="memTemplItemLeft" align="right" valign="top"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a4aa9f858626583e02bd79f747229bbca">operator()</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a4aa9f858626583e02bd79f747229bbca"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Function Documentation</h2>
@@ -121,14 +121,14 @@ template&lt;&gt; </div>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> FloorDivide::operator() </td>
+          <td class="memname"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> FloorDivide::operator() </td>
           <td>(</td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
diff --git a/docs/build/html/struct_limits_3_01bfloat16__t_01_4.html b/docs/build/html/struct_limits_3_01bfloat16__t_01_4.html
index 3969642d4..671ed35f7 100644
--- a/docs/build/html/struct_limits_3_01bfloat16__t_01_4.html
+++ b/docs/build/html/struct_limits_3_01bfloat16__t_01_4.html
@@ -95,13 +95,13 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-attribs" name="pub-static-attribs"></a>
 Static Public Attributes</h2></td></tr>
-<tr class="memitem:a0ead3618da6718629ea9fa4670b5005f" id="r_a0ead3618da6718629ea9fa4670b5005f"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">max</a> = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td></tr>
+<tr class="memitem:a0ead3618da6718629ea9fa4670b5005f" id="r_a0ead3618da6718629ea9fa4670b5005f"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">max</a> = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td></tr>
 <tr class="separator:a0ead3618da6718629ea9fa4670b5005f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2fd1811b9f615b2b897904bc27d1cb49" id="r_a2fd1811b9f615b2b897904bc27d1cb49"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2fd1811b9f615b2b897904bc27d1cb49">min</a> = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td></tr>
+<tr class="memitem:a2fd1811b9f615b2b897904bc27d1cb49" id="r_a2fd1811b9f615b2b897904bc27d1cb49"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2fd1811b9f615b2b897904bc27d1cb49">min</a> = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td></tr>
 <tr class="separator:a2fd1811b9f615b2b897904bc27d1cb49"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6337dc35207b3f6f7185cd73eabac211" id="r_a6337dc35207b3f6f7185cd73eabac211"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6337dc35207b3f6f7185cd73eabac211">finite_max</a> = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td></tr>
+<tr class="memitem:a6337dc35207b3f6f7185cd73eabac211" id="r_a6337dc35207b3f6f7185cd73eabac211"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6337dc35207b3f6f7185cd73eabac211">finite_max</a> = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td></tr>
 <tr class="separator:a6337dc35207b3f6f7185cd73eabac211"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae4132a37154707cc31bbc1734636cf36" id="r_ae4132a37154707cc31bbc1734636cf36"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae4132a37154707cc31bbc1734636cf36">finite_min</a> = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td></tr>
+<tr class="memitem:ae4132a37154707cc31bbc1734636cf36" id="r_ae4132a37154707cc31bbc1734636cf36"><td class="memItemLeft" align="right" valign="top">static constexpr constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae4132a37154707cc31bbc1734636cf36">finite_min</a> = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td></tr>
 <tr class="separator:ae4132a37154707cc31bbc1734636cf36"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Data Documentation</h2>
@@ -115,7 +115,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::finite_max = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td>
+          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::finite_max = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td>
         </tr>
       </table>
   </td>
@@ -137,7 +137,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::finite_min = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td>
+          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::finite_min = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;<a class="el" href="#a0ead3618da6718629ea9fa4670b5005f">::max</a>()</td>
         </tr>
       </table>
   </td>
@@ -159,7 +159,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td>
+          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max = metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td>
         </tr>
       </table>
   </td>
@@ -181,7 +181,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td>
+          <td class="memname">constant <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> <a class="el" href="struct_limits.html">Limits</a>&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min = -metal::numeric_limits&lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&gt;::infinity()</td>
         </tr>
       </table>
   </td>
diff --git a/docs/build/html/struct_looped_elem_to_loc-members.html b/docs/build/html/struct_looped_elem_to_loc-members.html
new file mode 100644
index 000000000..eb7e04817
--- /dev/null
+++ b/docs/build/html/struct_looped_elem_to_loc-members.html
@@ -0,0 +1,108 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">LoopedElemToLoc&lt; DIM, OffsetT, General &gt; Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#af8285112846769aba2c0d8615f6f1364">dim</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#acbd070b3193d9e87fb2c2db8db571333">index</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#aba051a428ad0934a9c6d04d4d3ee6e0e">location</a>()</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#a5653be1c990722a4a215be27efe5648b">LoopedElemToLoc</a>(int dim)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#a54c743940bf96350f3be42bba5d28205">next</a>(const constant int *shape, const constant size_t *strides)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#a7da7bd04e79ba86f71c535b5a6ec1a2d">next</a>(int n, const constant int *shape, const constant size_t *strides)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html#acdffe540c383a67417604b6080704791">offset</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc&lt; DIM, OffsetT, General &gt;</a></td><td class="entry"></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_looped_elem_to_loc.html b/docs/build/html/struct_looped_elem_to_loc.html
new file mode 100644
index 000000000..b8e76b15a
--- /dev/null
+++ b/docs/build/html/struct_looped_elem_to_loc.html
@@ -0,0 +1,317 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: LoopedElemToLoc&lt; DIM, OffsetT, General &gt; Struct Template Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-methods">Public Member Functions</a> &#124;
+<a href="#pub-attribs">Public Attributes</a> &#124;
+<a href="struct_looped_elem_to_loc-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">LoopedElemToLoc&lt; DIM, OffsetT, General &gt; Struct Template Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2utils_8h_source.html">utils.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
+Public Member Functions</h2></td></tr>
+<tr class="memitem:a5653be1c990722a4a215be27efe5648b" id="r_a5653be1c990722a4a215be27efe5648b"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5653be1c990722a4a215be27efe5648b">LoopedElemToLoc</a> (int <a class="el" href="#af8285112846769aba2c0d8615f6f1364">dim</a>)</td></tr>
+<tr class="separator:a5653be1c990722a4a215be27efe5648b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a54c743940bf96350f3be42bba5d28205" id="r_a54c743940bf96350f3be42bba5d28205"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a54c743940bf96350f3be42bba5d28205">next</a> (const constant int *shape, const constant size_t *strides)</td></tr>
+<tr class="separator:a54c743940bf96350f3be42bba5d28205"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7da7bd04e79ba86f71c535b5a6ec1a2d" id="r_a7da7bd04e79ba86f71c535b5a6ec1a2d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7da7bd04e79ba86f71c535b5a6ec1a2d">next</a> (int n, const constant int *shape, const constant size_t *strides)</td></tr>
+<tr class="separator:a7da7bd04e79ba86f71c535b5a6ec1a2d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aba051a428ad0934a9c6d04d4d3ee6e0e" id="r_aba051a428ad0934a9c6d04d4d3ee6e0e"><td class="memItemLeft" align="right" valign="top">OffsetT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aba051a428ad0934a9c6d04d4d3ee6e0e">location</a> ()</td></tr>
+<tr class="separator:aba051a428ad0934a9c6d04d4d3ee6e0e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
+Public Attributes</h2></td></tr>
+<tr class="memitem:af8285112846769aba2c0d8615f6f1364" id="r_af8285112846769aba2c0d8615f6f1364"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af8285112846769aba2c0d8615f6f1364">dim</a></td></tr>
+<tr class="separator:af8285112846769aba2c0d8615f6f1364"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8fbe77b4a774a30af5734dd9c5bd1f40" id="r_a8fbe77b4a774a30af5734dd9c5bd1f40"><td class="memItemLeft" align="right" valign="top"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM - 1, OffsetT, General &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8fbe77b4a774a30af5734dd9c5bd1f40">inner_looper</a></td></tr>
+<tr class="separator:a8fbe77b4a774a30af5734dd9c5bd1f40"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:acdffe540c383a67417604b6080704791" id="r_acdffe540c383a67417604b6080704791"><td class="memItemLeft" align="right" valign="top">OffsetT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acdffe540c383a67417604b6080704791">offset</a> {0}</td></tr>
+<tr class="separator:acdffe540c383a67417604b6080704791"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:acbd070b3193d9e87fb2c2db8db571333" id="r_acbd070b3193d9e87fb2c2db8db571333"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acbd070b3193d9e87fb2c2db8db571333">index</a> {0}</td></tr>
+<tr class="separator:acbd070b3193d9e87fb2c2db8db571333"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
+<a id="a5653be1c990722a4a215be27efe5648b" name="a5653be1c990722a4a215be27efe5648b"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a5653be1c990722a4a215be27efe5648b">&#9670;&#160;</a></span>LoopedElemToLoc()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;<a class="el" href="struct_looped_elem_to_loc.html">::LoopedElemToLoc</a> </td>
+          <td>(</td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>dim</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="aba051a428ad0934a9c6d04d4d3ee6e0e" name="aba051a428ad0934a9c6d04d4d3ee6e0e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aba051a428ad0934a9c6d04d4d3ee6e0e">&#9670;&#160;</a></span>location()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">OffsetT <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;::location </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a54c743940bf96350f3be42bba5d28205" name="a54c743940bf96350f3be42bba5d28205"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a54c743940bf96350f3be42bba5d28205">&#9670;&#160;</a></span>next() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;::next </td>
+          <td>(</td>
+          <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em>shape</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant size_t *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a7da7bd04e79ba86f71c535b5a6ec1a2d" name="a7da7bd04e79ba86f71c535b5a6ec1a2d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7da7bd04e79ba86f71c535b5a6ec1a2d">&#9670;&#160;</a></span>next() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;::next </td>
+          <td>(</td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em>shape</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant size_t *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<h2 class="groupheader">Member Data Documentation</h2>
+<a id="af8285112846769aba2c0d8615f6f1364" name="af8285112846769aba2c0d8615f6f1364"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af8285112846769aba2c0d8615f6f1364">&#9670;&#160;</a></span>dim</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">int <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;::dim</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="acbd070b3193d9e87fb2c2db8db571333" name="acbd070b3193d9e87fb2c2db8db571333"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#acbd070b3193d9e87fb2c2db8db571333">&#9670;&#160;</a></span>index</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">int <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;::index {0}</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a8fbe77b4a774a30af5734dd9c5bd1f40" name="a8fbe77b4a774a30af5734dd9c5bd1f40"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a8fbe77b4a774a30af5734dd9c5bd1f40">&#9670;&#160;</a></span>inner_looper</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt;DIM - 1, OffsetT, General&gt; <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;::inner_looper</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="acdffe540c383a67417604b6080704791" name="acdffe540c383a67417604b6080704791"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#acdffe540c383a67417604b6080704791">&#9670;&#160;</a></span>offset</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int DIM, typename OffsetT  = size_t, bool General = true&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">OffsetT <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; DIM, OffsetT, General &gt;::offset {0}</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/<a class="el" href="backend_2metal_2kernels_2utils_8h_source.html">utils.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structlooped__elem__to__loc-members.html b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4-members.html
similarity index 60%
rename from docs/build/html/structlooped__elem__to__loc-members.html
rename to docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4-members.html
index bf1d293f5..a4d1c7be5 100644
--- a/docs/build/html/structlooped__elem__to__loc-members.html
+++ b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4-members.html
@@ -84,18 +84,17 @@ $(function(){ initResizable(false); });
 </div><!-- top -->
 <div id="doc-content">
 <div class="header">
-  <div class="headertitle"><div class="title">looped_elem_to_loc&lt; dim, offset_t &gt; Member List</div></div>
+  <div class="headertitle"><div class="title">LoopedElemToLoc&lt; 1, OffsetT, false &gt; Member List</div></div>
 </div><!--header-->
 <div class="contents">
 
-<p>This is the complete list of members for <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a>, including all inherited members.</p>
+<p>This is the complete list of members for <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a>, including all inherited members.</p>
 <table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="structlooped__elem__to__loc.html#a29b154409551fea0a4ef50bf320ebc0a">index</a></td><td class="entry"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structlooped__elem__to__loc.html#a42c76764640618d721c48ef6b4f59189">inner_looper</a></td><td class="entry"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structlooped__elem__to__loc.html#accc6d4957a8aeb38f5062754793b74d2">location</a>(offset_t, const constant int *, const constant size_t *, int)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structlooped__elem__to__loc.html#a05558dabba889ee0d80ed4b567d901ca">next</a>(const constant int *shape, const constant size_t *strides)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structlooped__elem__to__loc.html#add610f331ef8d7d2d1917050890f82b2">next</a>(int n, const constant int *shape, const constant size_t *strides)</td><td class="entry"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structlooped__elem__to__loc.html#a11ef1389c9224e9117fd6374d740e0e0">offset</a></td><td class="entry"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc&lt; dim, offset_t &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a89d9ec4dc2f2f0d77e27aa0c05f261ef">location</a>()</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a0e21977d9f23b6994773e8e4f3ee70de">LoopedElemToLoc</a>(int)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#a03f3ca7a60bb85e36d7eba75e0e08b15">next</a>(const constant int *, const constant size_t *strides)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af8f2b29946324756c09951b69e170dd8">next</a>(int n, const constant int *, const constant size_t *strides)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html#af792b1fd4e8286f97b9b863c127a2d9a">offset</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, false &gt;</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html
similarity index 61%
rename from docs/build/html/structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html
rename to docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html
index 31a67fa63..68b15a3ab 100644
--- a/docs/build/html/structlooped__elem__to__loc_3_011_00_01offset__t_01_4.html
+++ b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=11"/>
 <meta name="generator" content="Doxygen 1.12.0"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: looped_elem_to_loc&lt; 1, offset_t &gt; Struct Template Reference</title>
+<title>MLX: LoopedElemToLoc&lt; 1, OffsetT, false &gt; Struct Template Reference</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -87,8 +87,8 @@ $(function(){ initResizable(false); });
   <div class="summary">
 <a href="#pub-methods">Public Member Functions</a> &#124;
 <a href="#pub-attribs">Public Attributes</a> &#124;
-<a href="structlooped__elem__to__loc_3_011_00_01offset__t_01_4-members.html">List of all members</a>  </div>
-  <div class="headertitle"><div class="title">looped_elem_to_loc&lt; 1, offset_t &gt; Struct Template Reference</div></div>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01false_01_4-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">LoopedElemToLoc&lt; 1, OffsetT, false &gt; Struct Template Reference</div></div>
 </div><!--header-->
 <div class="contents">
 
@@ -96,49 +96,37 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
-<tr class="memitem:a96cf2987c04210c9197e5237e425c4b4" id="r_a96cf2987c04210c9197e5237e425c4b4"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a96cf2987c04210c9197e5237e425c4b4">next</a> (const constant int *, const constant size_t *strides)</td></tr>
-<tr class="separator:a96cf2987c04210c9197e5237e425c4b4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af2984b35f7d7300d4812e7872b3c8851" id="r_af2984b35f7d7300d4812e7872b3c8851"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af2984b35f7d7300d4812e7872b3c8851">next</a> (int n, const constant int *, const constant size_t *strides)</td></tr>
-<tr class="separator:af2984b35f7d7300d4812e7872b3c8851"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a368d2a2204cee5055386954acd5ccb90" id="r_a368d2a2204cee5055386954acd5ccb90"><td class="memItemLeft" align="right" valign="top">offset_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a368d2a2204cee5055386954acd5ccb90">location</a> (offset_t, const constant int *, const constant size_t *, int)</td></tr>
-<tr class="separator:a368d2a2204cee5055386954acd5ccb90"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0e21977d9f23b6994773e8e4f3ee70de" id="r_a0e21977d9f23b6994773e8e4f3ee70de"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0e21977d9f23b6994773e8e4f3ee70de">LoopedElemToLoc</a> (int)</td></tr>
+<tr class="separator:a0e21977d9f23b6994773e8e4f3ee70de"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a03f3ca7a60bb85e36d7eba75e0e08b15" id="r_a03f3ca7a60bb85e36d7eba75e0e08b15"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a03f3ca7a60bb85e36d7eba75e0e08b15">next</a> (const constant int *, const constant size_t *strides)</td></tr>
+<tr class="separator:a03f3ca7a60bb85e36d7eba75e0e08b15"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af8f2b29946324756c09951b69e170dd8" id="r_af8f2b29946324756c09951b69e170dd8"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af8f2b29946324756c09951b69e170dd8">next</a> (int n, const constant int *, const constant size_t *strides)</td></tr>
+<tr class="separator:af8f2b29946324756c09951b69e170dd8"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a89d9ec4dc2f2f0d77e27aa0c05f261ef" id="r_a89d9ec4dc2f2f0d77e27aa0c05f261ef"><td class="memItemLeft" align="right" valign="top">OffsetT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a89d9ec4dc2f2f0d77e27aa0c05f261ef">location</a> ()</td></tr>
+<tr class="separator:a89d9ec4dc2f2f0d77e27aa0c05f261ef"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
 Public Attributes</h2></td></tr>
-<tr class="memitem:a7aebc0b0656e3a55d0dbca27a57d600e" id="r_a7aebc0b0656e3a55d0dbca27a57d600e"><td class="memItemLeft" align="right" valign="top">offset_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7aebc0b0656e3a55d0dbca27a57d600e">offset</a> {0}</td></tr>
-<tr class="separator:a7aebc0b0656e3a55d0dbca27a57d600e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af792b1fd4e8286f97b9b863c127a2d9a" id="r_af792b1fd4e8286f97b9b863c127a2d9a"><td class="memItemLeft" align="right" valign="top">OffsetT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af792b1fd4e8286f97b9b863c127a2d9a">offset</a> {0}</td></tr>
+<tr class="separator:af792b1fd4e8286f97b9b863c127a2d9a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
-<h2 class="groupheader">Member Function Documentation</h2>
-<a id="a368d2a2204cee5055386954acd5ccb90" name="a368d2a2204cee5055386954acd5ccb90"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a368d2a2204cee5055386954acd5ccb90">&#9670;&#160;</a></span>location()</h2>
+<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
+<a id="a0e21977d9f23b6994773e8e4f3ee70de" name="a0e21977d9f23b6994773e8e4f3ee70de"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a0e21977d9f23b6994773e8e4f3ee70de">&#9670;&#160;</a></span>LoopedElemToLoc()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename offset_t &gt; </div>
+template&lt;typename OffsetT &gt; </div>
 <table class="mlabels">
   <tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">offset_t <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; 1, offset_t &gt;::location </td>
+          <td class="memname"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, false &gt;<a class="el" href="struct_looped_elem_to_loc.html">::LoopedElemToLoc</a> </td>
           <td>(</td>
-          <td class="paramtype">offset_t</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
-          <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const constant size_t *</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em></em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -150,19 +138,47 @@ template&lt;typename offset_t &gt; </div>
 
 </div>
 </div>
-<a id="a96cf2987c04210c9197e5237e425c4b4" name="a96cf2987c04210c9197e5237e425c4b4"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a96cf2987c04210c9197e5237e425c4b4">&#9670;&#160;</a></span>next() <span class="overload">[1/2]</span></h2>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="a89d9ec4dc2f2f0d77e27aa0c05f261ef" name="a89d9ec4dc2f2f0d77e27aa0c05f261ef"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a89d9ec4dc2f2f0d77e27aa0c05f261ef">&#9670;&#160;</a></span>location()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename offset_t &gt; </div>
+template&lt;typename OffsetT &gt; </div>
 <table class="mlabels">
   <tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">void <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; 1, offset_t &gt;::next </td>
+          <td class="memname">OffsetT <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, false &gt;::location </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a03f3ca7a60bb85e36d7eba75e0e08b15" name="a03f3ca7a60bb85e36d7eba75e0e08b15"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a03f3ca7a60bb85e36d7eba75e0e08b15">&#9670;&#160;</a></span>next() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OffsetT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, false &gt;::next </td>
           <td>(</td>
           <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
         </tr>
@@ -181,19 +197,19 @@ template&lt;typename offset_t &gt; </div>
 
 </div>
 </div>
-<a id="af2984b35f7d7300d4812e7872b3c8851" name="af2984b35f7d7300d4812e7872b3c8851"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af2984b35f7d7300d4812e7872b3c8851">&#9670;&#160;</a></span>next() <span class="overload">[2/2]</span></h2>
+<a id="af8f2b29946324756c09951b69e170dd8" name="af8f2b29946324756c09951b69e170dd8"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af8f2b29946324756c09951b69e170dd8">&#9670;&#160;</a></span>next() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename offset_t &gt; </div>
+template&lt;typename OffsetT &gt; </div>
 <table class="mlabels">
   <tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">void <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; 1, offset_t &gt;::next </td>
+          <td class="memname">void <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, false &gt;::next </td>
           <td>(</td>
           <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
         </tr>
@@ -218,16 +234,16 @@ template&lt;typename offset_t &gt; </div>
 </div>
 </div>
 <h2 class="groupheader">Member Data Documentation</h2>
-<a id="a7aebc0b0656e3a55d0dbca27a57d600e" name="a7aebc0b0656e3a55d0dbca27a57d600e"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7aebc0b0656e3a55d0dbca27a57d600e">&#9670;&#160;</a></span>offset</h2>
+<a id="af792b1fd4e8286f97b9b863c127a2d9a" name="af792b1fd4e8286f97b9b863c127a2d9a"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af792b1fd4e8286f97b9b863c127a2d9a">&#9670;&#160;</a></span>offset</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename offset_t &gt; </div>
+template&lt;typename OffsetT &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">offset_t <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; 1, offset_t &gt;::offset {0}</td>
+          <td class="memname">OffsetT <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, false &gt;::offset {0}</td>
         </tr>
       </table>
 </div><div class="memdoc">
diff --git a/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4-members.html b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4-members.html
new file mode 100644
index 000000000..f122e5240
--- /dev/null
+++ b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4-members.html
@@ -0,0 +1,107 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">LoopedElemToLoc&lt; 1, OffsetT, true &gt; Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a7be6bf560080472d61e74b522979ef1e">dim</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a2cd3b616739b3d5b41e5b46ae335957d">index</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a66b84b12f6c1494e5908989ed2849a9f">location</a>()</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#abf536c7162d36af7367e390789944c86">LoopedElemToLoc</a>(int dim)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#af5a7c0cddeb52da88fa1140f44aec45c">next</a>(const constant int *shape, const constant size_t *strides)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a8fe55b3a2fa8cd35af568085faed785d">next</a>(int n, const constant int *shape, const constant size_t *strides)</td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html#a3a18944c158e2747a6ddebb420299a3b">offset</a></td><td class="entry"><a class="el" href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html">LoopedElemToLoc&lt; 1, OffsetT, true &gt;</a></td><td class="entry"></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structlooped__elem__to__loc.html b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html
similarity index 56%
rename from docs/build/html/structlooped__elem__to__loc.html
rename to docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html
index f377b1c82..aa8215fbd 100644
--- a/docs/build/html/structlooped__elem__to__loc.html
+++ b/docs/build/html/struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=11"/>
 <meta name="generator" content="Doxygen 1.12.0"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: looped_elem_to_loc&lt; dim, offset_t &gt; Struct Template Reference</title>
+<title>MLX: LoopedElemToLoc&lt; 1, OffsetT, true &gt; Struct Template Reference</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -87,8 +87,8 @@ $(function(){ initResizable(false); });
   <div class="summary">
 <a href="#pub-methods">Public Member Functions</a> &#124;
 <a href="#pub-attribs">Public Attributes</a> &#124;
-<a href="structlooped__elem__to__loc-members.html">List of all members</a>  </div>
-  <div class="headertitle"><div class="title">looped_elem_to_loc&lt; dim, offset_t &gt; Struct Template Reference</div></div>
+<a href="struct_looped_elem_to_loc_3_011_00_01_offset_t_00_01true_01_4-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">LoopedElemToLoc&lt; 1, OffsetT, true &gt; Struct Template Reference</div></div>
 </div><!--header-->
 <div class="contents">
 
@@ -96,53 +96,41 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
-<tr class="memitem:a05558dabba889ee0d80ed4b567d901ca" id="r_a05558dabba889ee0d80ed4b567d901ca"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a05558dabba889ee0d80ed4b567d901ca">next</a> (const constant int *shape, const constant size_t *strides)</td></tr>
-<tr class="separator:a05558dabba889ee0d80ed4b567d901ca"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:add610f331ef8d7d2d1917050890f82b2" id="r_add610f331ef8d7d2d1917050890f82b2"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#add610f331ef8d7d2d1917050890f82b2">next</a> (int n, const constant int *shape, const constant size_t *strides)</td></tr>
-<tr class="separator:add610f331ef8d7d2d1917050890f82b2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:accc6d4957a8aeb38f5062754793b74d2" id="r_accc6d4957a8aeb38f5062754793b74d2"><td class="memItemLeft" align="right" valign="top">offset_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#accc6d4957a8aeb38f5062754793b74d2">location</a> (offset_t, const constant int *, const constant size_t *, int)</td></tr>
-<tr class="separator:accc6d4957a8aeb38f5062754793b74d2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abf536c7162d36af7367e390789944c86" id="r_abf536c7162d36af7367e390789944c86"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abf536c7162d36af7367e390789944c86">LoopedElemToLoc</a> (int <a class="el" href="#a7be6bf560080472d61e74b522979ef1e">dim</a>)</td></tr>
+<tr class="separator:abf536c7162d36af7367e390789944c86"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af5a7c0cddeb52da88fa1140f44aec45c" id="r_af5a7c0cddeb52da88fa1140f44aec45c"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af5a7c0cddeb52da88fa1140f44aec45c">next</a> (const constant int *shape, const constant size_t *strides)</td></tr>
+<tr class="separator:af5a7c0cddeb52da88fa1140f44aec45c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8fe55b3a2fa8cd35af568085faed785d" id="r_a8fe55b3a2fa8cd35af568085faed785d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8fe55b3a2fa8cd35af568085faed785d">next</a> (int n, const constant int *shape, const constant size_t *strides)</td></tr>
+<tr class="separator:a8fe55b3a2fa8cd35af568085faed785d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a66b84b12f6c1494e5908989ed2849a9f" id="r_a66b84b12f6c1494e5908989ed2849a9f"><td class="memItemLeft" align="right" valign="top">OffsetT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a66b84b12f6c1494e5908989ed2849a9f">location</a> ()</td></tr>
+<tr class="separator:a66b84b12f6c1494e5908989ed2849a9f"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
 Public Attributes</h2></td></tr>
-<tr class="memitem:a42c76764640618d721c48ef6b4f59189" id="r_a42c76764640618d721c48ef6b4f59189"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; dim - 1, offset_t &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a42c76764640618d721c48ef6b4f59189">inner_looper</a></td></tr>
-<tr class="separator:a42c76764640618d721c48ef6b4f59189"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a11ef1389c9224e9117fd6374d740e0e0" id="r_a11ef1389c9224e9117fd6374d740e0e0"><td class="memItemLeft" align="right" valign="top">offset_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a11ef1389c9224e9117fd6374d740e0e0">offset</a> {0}</td></tr>
-<tr class="separator:a11ef1389c9224e9117fd6374d740e0e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a29b154409551fea0a4ef50bf320ebc0a" id="r_a29b154409551fea0a4ef50bf320ebc0a"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a29b154409551fea0a4ef50bf320ebc0a">index</a> {0}</td></tr>
-<tr class="separator:a29b154409551fea0a4ef50bf320ebc0a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7be6bf560080472d61e74b522979ef1e" id="r_a7be6bf560080472d61e74b522979ef1e"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7be6bf560080472d61e74b522979ef1e">dim</a></td></tr>
+<tr class="separator:a7be6bf560080472d61e74b522979ef1e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3a18944c158e2747a6ddebb420299a3b" id="r_a3a18944c158e2747a6ddebb420299a3b"><td class="memItemLeft" align="right" valign="top">OffsetT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3a18944c158e2747a6ddebb420299a3b">offset</a> {0}</td></tr>
+<tr class="separator:a3a18944c158e2747a6ddebb420299a3b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a2cd3b616739b3d5b41e5b46ae335957d" id="r_a2cd3b616739b3d5b41e5b46ae335957d"><td class="memItemLeft" align="right" valign="top">uint&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2cd3b616739b3d5b41e5b46ae335957d">index</a> {0}</td></tr>
+<tr class="separator:a2cd3b616739b3d5b41e5b46ae335957d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
-<h2 class="groupheader">Member Function Documentation</h2>
-<a id="accc6d4957a8aeb38f5062754793b74d2" name="accc6d4957a8aeb38f5062754793b74d2"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#accc6d4957a8aeb38f5062754793b74d2">&#9670;&#160;</a></span>location()</h2>
+<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
+<a id="abf536c7162d36af7367e390789944c86" name="abf536c7162d36af7367e390789944c86"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abf536c7162d36af7367e390789944c86">&#9670;&#160;</a></span>LoopedElemToLoc()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;int dim, typename offset_t  = size_t&gt; </div>
+template&lt;typename OffsetT &gt; </div>
 <table class="mlabels">
   <tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">offset_t <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; dim, offset_t &gt;::location </td>
+          <td class="memname"><a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, true &gt;<a class="el" href="struct_looped_elem_to_loc.html">::LoopedElemToLoc</a> </td>
           <td>(</td>
-          <td class="paramtype">offset_t</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>dim</em></span></td><td>)</td>
           <td></td>
-          <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const constant size_t *</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em></em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -154,19 +142,47 @@ template&lt;int dim, typename offset_t  = size_t&gt; </div>
 
 </div>
 </div>
-<a id="a05558dabba889ee0d80ed4b567d901ca" name="a05558dabba889ee0d80ed4b567d901ca"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a05558dabba889ee0d80ed4b567d901ca">&#9670;&#160;</a></span>next() <span class="overload">[1/2]</span></h2>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="a66b84b12f6c1494e5908989ed2849a9f" name="a66b84b12f6c1494e5908989ed2849a9f"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a66b84b12f6c1494e5908989ed2849a9f">&#9670;&#160;</a></span>location()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;int dim, typename offset_t  = size_t&gt; </div>
+template&lt;typename OffsetT &gt; </div>
 <table class="mlabels">
   <tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">void <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; dim, offset_t &gt;::next </td>
+          <td class="memname">OffsetT <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, true &gt;::location </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="af5a7c0cddeb52da88fa1140f44aec45c" name="af5a7c0cddeb52da88fa1140f44aec45c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af5a7c0cddeb52da88fa1140f44aec45c">&#9670;&#160;</a></span>next() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OffsetT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, true &gt;::next </td>
           <td>(</td>
           <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em>shape</em></span>, </td>
         </tr>
@@ -185,19 +201,19 @@ template&lt;int dim, typename offset_t  = size_t&gt; </div>
 
 </div>
 </div>
-<a id="add610f331ef8d7d2d1917050890f82b2" name="add610f331ef8d7d2d1917050890f82b2"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#add610f331ef8d7d2d1917050890f82b2">&#9670;&#160;</a></span>next() <span class="overload">[2/2]</span></h2>
+<a id="a8fe55b3a2fa8cd35af568085faed785d" name="a8fe55b3a2fa8cd35af568085faed785d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a8fe55b3a2fa8cd35af568085faed785d">&#9670;&#160;</a></span>next() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;int dim, typename offset_t  = size_t&gt; </div>
+template&lt;typename OffsetT &gt; </div>
 <table class="mlabels">
   <tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">void <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; dim, offset_t &gt;::next </td>
+          <td class="memname">void <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, true &gt;::next </td>
           <td>(</td>
           <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
         </tr>
@@ -222,48 +238,48 @@ template&lt;int dim, typename offset_t  = size_t&gt; </div>
 </div>
 </div>
 <h2 class="groupheader">Member Data Documentation</h2>
-<a id="a29b154409551fea0a4ef50bf320ebc0a" name="a29b154409551fea0a4ef50bf320ebc0a"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a29b154409551fea0a4ef50bf320ebc0a">&#9670;&#160;</a></span>index</h2>
+<a id="a7be6bf560080472d61e74b522979ef1e" name="a7be6bf560080472d61e74b522979ef1e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7be6bf560080472d61e74b522979ef1e">&#9670;&#160;</a></span>dim</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;int dim, typename offset_t  = size_t&gt; </div>
+template&lt;typename OffsetT &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">int <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; dim, offset_t &gt;::index {0}</td>
+          <td class="memname">int <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, true &gt;::dim</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a42c76764640618d721c48ef6b4f59189" name="a42c76764640618d721c48ef6b4f59189"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a42c76764640618d721c48ef6b4f59189">&#9670;&#160;</a></span>inner_looper</h2>
+<a id="a2cd3b616739b3d5b41e5b46ae335957d" name="a2cd3b616739b3d5b41e5b46ae335957d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a2cd3b616739b3d5b41e5b46ae335957d">&#9670;&#160;</a></span>index</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;int dim, typename offset_t  = size_t&gt; </div>
+template&lt;typename OffsetT &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname"><a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt;dim - 1, offset_t&gt; <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; dim, offset_t &gt;::inner_looper</td>
+          <td class="memname">uint <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, true &gt;::index {0}</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a11ef1389c9224e9117fd6374d740e0e0" name="a11ef1389c9224e9117fd6374d740e0e0"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a11ef1389c9224e9117fd6374d740e0e0">&#9670;&#160;</a></span>offset</h2>
+<a id="a3a18944c158e2747a6ddebb420299a3b" name="a3a18944c158e2747a6ddebb420299a3b"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a3a18944c158e2747a6ddebb420299a3b">&#9670;&#160;</a></span>offset</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;int dim, typename offset_t  = size_t&gt; </div>
+template&lt;typename OffsetT &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">offset_t <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; dim, offset_t &gt;::offset {0}</td>
+          <td class="memname">OffsetT <a class="el" href="struct_looped_elem_to_loc.html">LoopedElemToLoc</a>&lt; 1, OffsetT, true &gt;::offset {0}</td>
         </tr>
       </table>
 </div><div class="memdoc">
diff --git a/docs/build/html/struct_m_l_x_fast_attention_params-members.html b/docs/build/html/struct_m_l_x_fast_attention_params-members.html
deleted file mode 100644
index af8536a62..000000000
--- a/docs/build/html/struct_m_l_x_fast_attention_params-members.html
+++ /dev/null
@@ -1,120 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=11"/>
-<meta name="generator" content="Doxygen 1.12.0"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: Member List</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<script type="text/javascript" src="clipboard.js"></script>
-<link href="navtree.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="resize.js"></script>
-<script type="text/javascript" src="cookie.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr id="projectrow">
-  <td id="projectalign">
-   <div id="projectname">MLX
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.12.0 -->
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-var searchBox = new SearchBox("searchBox", "search/",'.html');
-/* @license-end */
-</script>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function() { codefold.init(0); });
-/* @license-end */
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function() {
-  initMenu('',true,false,'search.php','Search',false);
-  $(function() { init_search(); });
-});
-/* @license-end */
-</script>
-<div id="main-nav"></div>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function(){ initResizable(false); });
-/* @license-end */
-</script>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<div id="MSearchResults">
-<div class="SRPage">
-<div id="SRIndex">
-<div id="SRResults"></div>
-<div class="SRStatus" id="Loading">Loading...</div>
-<div class="SRStatus" id="Searching">Searching...</div>
-<div class="SRStatus" id="NoMatches">No Matches</div>
-</div>
-</div>
-</div>
-</div>
-
-</div><!-- top -->
-<div id="doc-content">
-<div class="header">
-  <div class="headertitle"><div class="title">MLXFastAttentionParams Member List</div></div>
-</div><!--header-->
-<div class="contents">
-
-<p>This is the complete list of members for <a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a>, including all inherited members.</p>
-<table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a932266d04fa7d6e27d4a4a2c175f1477">alpha</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a6f3d94dbe44b32e675558768710bf0a3">batch_ndim</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a162826d3f288f64c0aea88a36b34859b">batch_stride_k</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a3c5b1170999087f3f3a03830193b55c7">batch_stride_o</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a98766fc89f75d5eef65b345f16a782d1">batch_stride_q</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a1180e311b95cd4b6d4a336d21b873c21">batch_stride_v</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#adbc0a13076da5f704498e57239cb2bf2">gemm_k_iterations_aligned</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#ab56b3db8fc6a938ce9c739ee78a7b803">gemm_n_iterations_aligned</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a2799a2f219441fef7f351374f4cbc67c">gemm_sv_m_block_iterations</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#ada454f5ad22ec36a22d0ff596751af23">K</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">ldk</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a9e73dc1971b5ab913bd85a7afa7cf46c">ldo</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#af2dadba2a28f5db2ca52472d00937e58">ldq</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a274eeb8591c02511014dce50c4240c8a">lds</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#aebada0bf0789e8706dce564752208e8b">ldv</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a5cd3ede5f41d5fdf8177cab3f059f4d8">M</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#ab42c792a80388002e34992cbd837a167">N</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a68a338d522ffeb6761b7b168869361e2">swizzle_log</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a0df159c839fc27b9426b8ac4336cc0ad">tiles_m</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html#a608aa256216ac6d80af00209303d2029">tiles_n</a></td><td class="entry"><a class="el" href="struct_m_l_x_fast_attention_params.html">MLXFastAttentionParams</a></td><td class="entry"></td></tr>
-</table></div><!-- contents -->
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
-</small></address>
-</div><!-- doc-content -->
-</body>
-</html>
diff --git a/docs/build/html/struct_m_l_x_fast_attention_params.html b/docs/build/html/struct_m_l_x_fast_attention_params.html
deleted file mode 100644
index cfadec594..000000000
--- a/docs/build/html/struct_m_l_x_fast_attention_params.html
+++ /dev/null
@@ -1,430 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=11"/>
-<meta name="generator" content="Doxygen 1.12.0"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: MLXFastAttentionParams Struct Reference</title>
-<link href="tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="jquery.js"></script>
-<script type="text/javascript" src="dynsections.js"></script>
-<script type="text/javascript" src="clipboard.js"></script>
-<link href="navtree.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="resize.js"></script>
-<script type="text/javascript" src="cookie.js"></script>
-<link href="search/search.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="search/searchdata.js"></script>
-<script type="text/javascript" src="search/search.js"></script>
-<link href="doxygen.css" rel="stylesheet" type="text/css" />
-</head>
-<body>
-<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
-<div id="titlearea">
-<table cellspacing="0" cellpadding="0">
- <tbody>
- <tr id="projectrow">
-  <td id="projectalign">
-   <div id="projectname">MLX
-   </div>
-  </td>
- </tr>
- </tbody>
-</table>
-</div>
-<!-- end header part -->
-<!-- Generated by Doxygen 1.12.0 -->
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-var searchBox = new SearchBox("searchBox", "search/",'.html');
-/* @license-end */
-</script>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function() { codefold.init(0); });
-/* @license-end */
-</script>
-<script type="text/javascript" src="menudata.js"></script>
-<script type="text/javascript" src="menu.js"></script>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function() {
-  initMenu('',true,false,'search.php','Search',false);
-  $(function() { init_search(); });
-});
-/* @license-end */
-</script>
-<div id="main-nav"></div>
-<script type="text/javascript">
-/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
-$(function(){ initResizable(false); });
-/* @license-end */
-</script>
-<!-- window showing the filter options -->
-<div id="MSearchSelectWindow"
-     onmouseover="return searchBox.OnSearchSelectShow()"
-     onmouseout="return searchBox.OnSearchSelectHide()"
-     onkeydown="return searchBox.OnSearchSelectKey(event)">
-</div>
-
-<!-- iframe showing the search results (closed by default) -->
-<div id="MSearchResultsWindow">
-<div id="MSearchResults">
-<div class="SRPage">
-<div id="SRIndex">
-<div id="SRResults"></div>
-<div class="SRStatus" id="Loading">Loading...</div>
-<div class="SRStatus" id="Searching">Searching...</div>
-<div class="SRStatus" id="NoMatches">No Matches</div>
-</div>
-</div>
-</div>
-</div>
-
-</div><!-- top -->
-<div id="doc-content">
-<div class="header">
-  <div class="summary">
-<a href="#pub-attribs">Public Attributes</a> &#124;
-<a href="struct_m_l_x_fast_attention_params-members.html">List of all members</a>  </div>
-  <div class="headertitle"><div class="title">MLXFastAttentionParams Struct Reference</div></div>
-</div><!--header-->
-<div class="contents">
-
-<p><code>#include &lt;<a class="el" href="scaled__dot__product__attention__params_8h_source.html">scaled_dot_product_attention_params.h</a>&gt;</code></p>
-<table class="memberdecls">
-<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
-Public Attributes</h2></td></tr>
-<tr class="memitem:a5cd3ede5f41d5fdf8177cab3f059f4d8" id="r_a5cd3ede5f41d5fdf8177cab3f059f4d8"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5cd3ede5f41d5fdf8177cab3f059f4d8">M</a></td></tr>
-<tr class="separator:a5cd3ede5f41d5fdf8177cab3f059f4d8"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab42c792a80388002e34992cbd837a167" id="r_ab42c792a80388002e34992cbd837a167"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab42c792a80388002e34992cbd837a167">N</a></td></tr>
-<tr class="separator:ab42c792a80388002e34992cbd837a167"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ada454f5ad22ec36a22d0ff596751af23" id="r_ada454f5ad22ec36a22d0ff596751af23"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ada454f5ad22ec36a22d0ff596751af23">K</a></td></tr>
-<tr class="separator:ada454f5ad22ec36a22d0ff596751af23"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af2dadba2a28f5db2ca52472d00937e58" id="r_af2dadba2a28f5db2ca52472d00937e58"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af2dadba2a28f5db2ca52472d00937e58">ldq</a></td></tr>
-<tr class="separator:af2dadba2a28f5db2ca52472d00937e58"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1f8c89bd55d89ad7b9fe27c60e3cb8d5" id="r_a1f8c89bd55d89ad7b9fe27c60e3cb8d5"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">ldk</a></td></tr>
-<tr class="separator:a1f8c89bd55d89ad7b9fe27c60e3cb8d5"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aebada0bf0789e8706dce564752208e8b" id="r_aebada0bf0789e8706dce564752208e8b"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aebada0bf0789e8706dce564752208e8b">ldv</a></td></tr>
-<tr class="separator:aebada0bf0789e8706dce564752208e8b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a274eeb8591c02511014dce50c4240c8a" id="r_a274eeb8591c02511014dce50c4240c8a"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a274eeb8591c02511014dce50c4240c8a">lds</a></td></tr>
-<tr class="separator:a274eeb8591c02511014dce50c4240c8a"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a9e73dc1971b5ab913bd85a7afa7cf46c" id="r_a9e73dc1971b5ab913bd85a7afa7cf46c"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9e73dc1971b5ab913bd85a7afa7cf46c">ldo</a></td></tr>
-<tr class="separator:a9e73dc1971b5ab913bd85a7afa7cf46c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a608aa256216ac6d80af00209303d2029" id="r_a608aa256216ac6d80af00209303d2029"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a608aa256216ac6d80af00209303d2029">tiles_n</a></td></tr>
-<tr class="separator:a608aa256216ac6d80af00209303d2029"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a0df159c839fc27b9426b8ac4336cc0ad" id="r_a0df159c839fc27b9426b8ac4336cc0ad"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0df159c839fc27b9426b8ac4336cc0ad">tiles_m</a></td></tr>
-<tr class="separator:a0df159c839fc27b9426b8ac4336cc0ad"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a98766fc89f75d5eef65b345f16a782d1" id="r_a98766fc89f75d5eef65b345f16a782d1"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a98766fc89f75d5eef65b345f16a782d1">batch_stride_q</a></td></tr>
-<tr class="separator:a98766fc89f75d5eef65b345f16a782d1"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a162826d3f288f64c0aea88a36b34859b" id="r_a162826d3f288f64c0aea88a36b34859b"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a162826d3f288f64c0aea88a36b34859b">batch_stride_k</a></td></tr>
-<tr class="separator:a162826d3f288f64c0aea88a36b34859b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1180e311b95cd4b6d4a336d21b873c21" id="r_a1180e311b95cd4b6d4a336d21b873c21"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1180e311b95cd4b6d4a336d21b873c21">batch_stride_v</a></td></tr>
-<tr class="separator:a1180e311b95cd4b6d4a336d21b873c21"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a3c5b1170999087f3f3a03830193b55c7" id="r_a3c5b1170999087f3f3a03830193b55c7"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3c5b1170999087f3f3a03830193b55c7">batch_stride_o</a></td></tr>
-<tr class="separator:a3c5b1170999087f3f3a03830193b55c7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a68a338d522ffeb6761b7b168869361e2" id="r_a68a338d522ffeb6761b7b168869361e2"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a68a338d522ffeb6761b7b168869361e2">swizzle_log</a></td></tr>
-<tr class="separator:a68a338d522ffeb6761b7b168869361e2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ab56b3db8fc6a938ce9c739ee78a7b803" id="r_ab56b3db8fc6a938ce9c739ee78a7b803"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab56b3db8fc6a938ce9c739ee78a7b803">gemm_n_iterations_aligned</a></td></tr>
-<tr class="separator:ab56b3db8fc6a938ce9c739ee78a7b803"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:adbc0a13076da5f704498e57239cb2bf2" id="r_adbc0a13076da5f704498e57239cb2bf2"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adbc0a13076da5f704498e57239cb2bf2">gemm_k_iterations_aligned</a></td></tr>
-<tr class="separator:adbc0a13076da5f704498e57239cb2bf2"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a2799a2f219441fef7f351374f4cbc67c" id="r_a2799a2f219441fef7f351374f4cbc67c"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2799a2f219441fef7f351374f4cbc67c">gemm_sv_m_block_iterations</a></td></tr>
-<tr class="separator:a2799a2f219441fef7f351374f4cbc67c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6f3d94dbe44b32e675558768710bf0a3" id="r_a6f3d94dbe44b32e675558768710bf0a3"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6f3d94dbe44b32e675558768710bf0a3">batch_ndim</a></td></tr>
-<tr class="separator:a6f3d94dbe44b32e675558768710bf0a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a932266d04fa7d6e27d4a4a2c175f1477" id="r_a932266d04fa7d6e27d4a4a2c175f1477"><td class="memItemLeft" align="right" valign="top">const float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a932266d04fa7d6e27d4a4a2c175f1477">alpha</a></td></tr>
-<tr class="separator:a932266d04fa7d6e27d4a4a2c175f1477"><td class="memSeparator" colspan="2">&#160;</td></tr>
-</table>
-<h2 class="groupheader">Member Data Documentation</h2>
-<a id="a932266d04fa7d6e27d4a4a2c175f1477" name="a932266d04fa7d6e27d4a4a2c175f1477"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a932266d04fa7d6e27d4a4a2c175f1477">&#9670;&#160;</a></span>alpha</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const float MLXFastAttentionParams::alpha</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a6f3d94dbe44b32e675558768710bf0a3" name="a6f3d94dbe44b32e675558768710bf0a3"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a6f3d94dbe44b32e675558768710bf0a3">&#9670;&#160;</a></span>batch_ndim</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::batch_ndim</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a162826d3f288f64c0aea88a36b34859b" name="a162826d3f288f64c0aea88a36b34859b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a162826d3f288f64c0aea88a36b34859b">&#9670;&#160;</a></span>batch_stride_k</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::batch_stride_k</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a3c5b1170999087f3f3a03830193b55c7" name="a3c5b1170999087f3f3a03830193b55c7"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a3c5b1170999087f3f3a03830193b55c7">&#9670;&#160;</a></span>batch_stride_o</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::batch_stride_o</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a98766fc89f75d5eef65b345f16a782d1" name="a98766fc89f75d5eef65b345f16a782d1"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a98766fc89f75d5eef65b345f16a782d1">&#9670;&#160;</a></span>batch_stride_q</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::batch_stride_q</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a1180e311b95cd4b6d4a336d21b873c21" name="a1180e311b95cd4b6d4a336d21b873c21"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1180e311b95cd4b6d4a336d21b873c21">&#9670;&#160;</a></span>batch_stride_v</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::batch_stride_v</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="adbc0a13076da5f704498e57239cb2bf2" name="adbc0a13076da5f704498e57239cb2bf2"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#adbc0a13076da5f704498e57239cb2bf2">&#9670;&#160;</a></span>gemm_k_iterations_aligned</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::gemm_k_iterations_aligned</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="ab56b3db8fc6a938ce9c739ee78a7b803" name="ab56b3db8fc6a938ce9c739ee78a7b803"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ab56b3db8fc6a938ce9c739ee78a7b803">&#9670;&#160;</a></span>gemm_n_iterations_aligned</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::gemm_n_iterations_aligned</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a2799a2f219441fef7f351374f4cbc67c" name="a2799a2f219441fef7f351374f4cbc67c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a2799a2f219441fef7f351374f4cbc67c">&#9670;&#160;</a></span>gemm_sv_m_block_iterations</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::gemm_sv_m_block_iterations</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="ada454f5ad22ec36a22d0ff596751af23" name="ada454f5ad22ec36a22d0ff596751af23"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ada454f5ad22ec36a22d0ff596751af23">&#9670;&#160;</a></span>K</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::K</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a1f8c89bd55d89ad7b9fe27c60e3cb8d5" name="a1f8c89bd55d89ad7b9fe27c60e3cb8d5"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1f8c89bd55d89ad7b9fe27c60e3cb8d5">&#9670;&#160;</a></span>ldk</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::ldk</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a9e73dc1971b5ab913bd85a7afa7cf46c" name="a9e73dc1971b5ab913bd85a7afa7cf46c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a9e73dc1971b5ab913bd85a7afa7cf46c">&#9670;&#160;</a></span>ldo</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::ldo</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="af2dadba2a28f5db2ca52472d00937e58" name="af2dadba2a28f5db2ca52472d00937e58"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af2dadba2a28f5db2ca52472d00937e58">&#9670;&#160;</a></span>ldq</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::ldq</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a274eeb8591c02511014dce50c4240c8a" name="a274eeb8591c02511014dce50c4240c8a"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a274eeb8591c02511014dce50c4240c8a">&#9670;&#160;</a></span>lds</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::lds</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="aebada0bf0789e8706dce564752208e8b" name="aebada0bf0789e8706dce564752208e8b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aebada0bf0789e8706dce564752208e8b">&#9670;&#160;</a></span>ldv</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::ldv</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a5cd3ede5f41d5fdf8177cab3f059f4d8" name="a5cd3ede5f41d5fdf8177cab3f059f4d8"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a5cd3ede5f41d5fdf8177cab3f059f4d8">&#9670;&#160;</a></span>M</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::M</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="ab42c792a80388002e34992cbd837a167" name="ab42c792a80388002e34992cbd837a167"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ab42c792a80388002e34992cbd837a167">&#9670;&#160;</a></span>N</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::N</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a68a338d522ffeb6761b7b168869361e2" name="a68a338d522ffeb6761b7b168869361e2"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a68a338d522ffeb6761b7b168869361e2">&#9670;&#160;</a></span>swizzle_log</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::swizzle_log</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a0df159c839fc27b9426b8ac4336cc0ad" name="a0df159c839fc27b9426b8ac4336cc0ad"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a0df159c839fc27b9426b8ac4336cc0ad">&#9670;&#160;</a></span>tiles_m</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::tiles_m</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a608aa256216ac6d80af00209303d2029" name="a608aa256216ac6d80af00209303d2029"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a608aa256216ac6d80af00209303d2029">&#9670;&#160;</a></span>tiles_n</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const int MLXFastAttentionParams::tiles_n</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/<a class="el" href="scaled__dot__product__attention__params_8h_source.html">scaled_dot_product_attention_params.h</a></li>
-</ul>
-</div><!-- contents -->
-<!-- start footer part -->
-<hr class="footer"/><address class="footer"><small>
-Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
-</small></address>
-</div><!-- doc-content -->
-</body>
-</html>
diff --git a/docs/build/html/struct_max_op-members.html b/docs/build/html/struct_max_op-members.html
new file mode 100644
index 000000000..9d49410a6
--- /dev/null
+++ b/docs/build/html/struct_max_op-members.html
@@ -0,0 +1,101 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">MaxOp Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="struct_max_op.html">MaxOp</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e">apply</a>(T x, T y)</td><td class="entry"><a class="el" href="struct_max_op.html">MaxOp</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_max_op.html b/docs/build/html/struct_max_op.html
new file mode 100644
index 000000000..5acc35878
--- /dev/null
+++ b/docs/build/html/struct_max_op.html
@@ -0,0 +1,144 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: MaxOp Struct Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-static-methods">Static Public Member Functions</a> &#124;
+<a href="struct_max_op-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">MaxOp Struct Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
+Static Public Member Functions</h2></td></tr>
+<tr class="memitem:ab3d3c3040017a13c170e7bdd1ffac46e" id="r_ab3d3c3040017a13c170e7bdd1ffac46e"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:ab3d3c3040017a13c170e7bdd1ffac46e"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr T&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ab3d3c3040017a13c170e7bdd1ffac46e">apply</a> (T x, T y)</td></tr>
+<tr class="separator:ab3d3c3040017a13c170e7bdd1ffac46e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="ab3d3c3040017a13c170e7bdd1ffac46e" name="ab3d3c3040017a13c170e7bdd1ffac46e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ab3d3c3040017a13c170e7bdd1ffac46e">&#9670;&#160;</a></span>apply()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr T MaxOp::apply </td>
+          <td>(</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/kernels/<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_mul_op-members.html b/docs/build/html/struct_mul_op-members.html
new file mode 100644
index 000000000..3c2e3a88f
--- /dev/null
+++ b/docs/build/html/struct_mul_op-members.html
@@ -0,0 +1,101 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">MulOp Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="struct_mul_op.html">MulOp</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756">apply</a>(T x, T y)</td><td class="entry"><a class="el" href="struct_mul_op.html">MulOp</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_mul_op.html b/docs/build/html/struct_mul_op.html
new file mode 100644
index 000000000..8d9574508
--- /dev/null
+++ b/docs/build/html/struct_mul_op.html
@@ -0,0 +1,144 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: MulOp Struct Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-static-methods">Static Public Member Functions</a> &#124;
+<a href="struct_mul_op-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">MulOp Struct Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
+Static Public Member Functions</h2></td></tr>
+<tr class="memitem:a1b93d804653d92fc7e46747de9e9c756" id="r_a1b93d804653d92fc7e46747de9e9c756"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a1b93d804653d92fc7e46747de9e9c756"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr T&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1b93d804653d92fc7e46747de9e9c756">apply</a> (T x, T y)</td></tr>
+<tr class="separator:a1b93d804653d92fc7e46747de9e9c756"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="a1b93d804653d92fc7e46747de9e9c756" name="a1b93d804653d92fc7e46747de9e9c756"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1b93d804653d92fc7e46747de9e9c756">&#9670;&#160;</a></span>apply()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr T MulOp::apply </td>
+          <td>(</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/kernels/<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_quantized_block_loader-members.html b/docs/build/html/struct_quantized_block_loader-members.html
index e0c400f3f..638c4464a 100644
--- a/docs/build/html/struct_quantized_block_loader-members.html
+++ b/docs/build/html/struct_quantized_block_loader-members.html
@@ -94,21 +94,22 @@ $(function(){ initResizable(false); });
   <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906">bi</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd">biases</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00">bj</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">dst</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">load_safe</a>(short2 src_tile_dim) const</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">load_unsafe</a>() const</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">next</a>()</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#af59b054750a65e7e79c1cd05c4acac93">QuantizedBlockLoader</a>(const device uint32_t *src_, const device T *scales_, const device T *biases_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#ad85c6b7e07c81307b3b91eb4dd7be30b">src</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83">dst</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6">group_step_cnt</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba">group_steps</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab">group_stride</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b">load_safe</a>(short2 src_tile_dim) const</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc">load_unsafe</a>() const</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9">next</a>()</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589">QuantizedBlockLoader</a>(const device uint8_t *src_, const device T *scales_, const device T *biases_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76">src</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e">src_ld</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475">thread_idx</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320">tile_stride</a></td><td class="entry"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/struct_quantized_block_loader.html b/docs/build/html/struct_quantized_block_loader.html
index 8b1f99eae..8d94b9dfd 100644
--- a/docs/build/html/struct_quantized_block_loader.html
+++ b/docs/build/html/struct_quantized_block_loader.html
@@ -97,8 +97,8 @@ $(function(){ initResizable(false); });
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
-<tr class="memitem:af59b054750a65e7e79c1cd05c4acac93" id="r_af59b054750a65e7e79c1cd05c4acac93"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af59b054750a65e7e79c1cd05c4acac93">QuantizedBlockLoader</a> (const device uint32_t *src_, const device T *scales_, const device T *biases_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td></tr>
-<tr class="separator:af59b054750a65e7e79c1cd05c4acac93"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a60713ce7498aa683cbb2a0f19ab16589" id="r_a60713ce7498aa683cbb2a0f19ab16589"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a60713ce7498aa683cbb2a0f19ab16589">QuantizedBlockLoader</a> (const device uint8_t *src_, const device T *scales_, const device T *biases_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td></tr>
+<tr class="separator:a60713ce7498aa683cbb2a0f19ab16589"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a86009527cb4b53e4c21fd6b1f78cfefc" id="r_a86009527cb4b53e4c21fd6b1f78cfefc"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a86009527cb4b53e4c21fd6b1f78cfefc">load_unsafe</a> () const</td></tr>
 <tr class="separator:a86009527cb4b53e4c21fd6b1f78cfefc"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a699dc9aa284b8fbf870310bbb224465b" id="r_a699dc9aa284b8fbf870310bbb224465b"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a699dc9aa284b8fbf870310bbb224465b">load_safe</a> (short2 src_tile_dim) const</td></tr>
@@ -124,8 +124,8 @@ Public Attributes</h2></td></tr>
 <tr class="separator:ae2add92b2aaf3414e91f0470b9b0cc00"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a9857214690fe6abad0e19d1045152f83" id="r_a9857214690fe6abad0e19d1045152f83"><td class="memItemLeft" align="right" valign="top">threadgroup T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9857214690fe6abad0e19d1045152f83">dst</a></td></tr>
 <tr class="separator:a9857214690fe6abad0e19d1045152f83"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad85c6b7e07c81307b3b91eb4dd7be30b" id="r_ad85c6b7e07c81307b3b91eb4dd7be30b"><td class="memItemLeft" align="right" valign="top">const device uint32_t *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad85c6b7e07c81307b3b91eb4dd7be30b">src</a></td></tr>
-<tr class="separator:ad85c6b7e07c81307b3b91eb4dd7be30b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abbf8249ca99e3e87b296ddd60a984b76" id="r_abbf8249ca99e3e87b296ddd60a984b76"><td class="memItemLeft" align="right" valign="top">const device uint8_t *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abbf8249ca99e3e87b296ddd60a984b76">src</a></td></tr>
+<tr class="separator:abbf8249ca99e3e87b296ddd60a984b76"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a6123e4a9209d6eacb58b2c2344ed1ecf" id="r_a6123e4a9209d6eacb58b2c2344ed1ecf"><td class="memItemLeft" align="right" valign="top">const device T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6123e4a9209d6eacb58b2c2344ed1ecf">scales</a></td></tr>
 <tr class="separator:a6123e4a9209d6eacb58b2c2344ed1ecf"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a17d01a6aba0833b073586ef2c09d0fbd" id="r_a17d01a6aba0833b073586ef2c09d0fbd"><td class="memItemLeft" align="right" valign="top">const device T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a17d01a6aba0833b073586ef2c09d0fbd">biases</a></td></tr>
@@ -133,8 +133,10 @@ Public Attributes</h2></td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-attribs" name="pub-static-attribs"></a>
 Static Public Attributes</h2></td></tr>
-<tr class="memitem:a8eae73a0c04bf1e41fb96131f6aa500d" id="r_a8eae73a0c04bf1e41fb96131f6aa500d"><td class="memItemLeft" align="right" valign="top">static constant constexpr const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a> = 32 / bits</td></tr>
+<tr class="memitem:a8eae73a0c04bf1e41fb96131f6aa500d" id="r_a8eae73a0c04bf1e41fb96131f6aa500d"><td class="memItemLeft" align="right" valign="top">static constant constexpr const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a> = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits</td></tr>
 <tr class="separator:a8eae73a0c04bf1e41fb96131f6aa500d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad00fe6d8bd395206a41693a8ed65d4db" id="r_ad00fe6d8bd395206a41693a8ed65d4db"><td class="memItemLeft" align="right" valign="top">static constant constexpr const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad00fe6d8bd395206a41693a8ed65d4db">bytes_per_pack</a> = (bits == 3 || bits == 6) ? 3 : 1</td></tr>
+<tr class="separator:ad00fe6d8bd395206a41693a8ed65d4db"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a1392a5278cf6e090ea80ebe7c4ac5fbb" id="r_a1392a5278cf6e090ea80ebe7c4ac5fbb"><td class="memItemLeft" align="right" valign="top">static constant constexpr const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> = BCOLS / <a class="el" href="#a8eae73a0c04bf1e41fb96131f6aa500d">pack_factor</a></td></tr>
 <tr class="separator:a1392a5278cf6e090ea80ebe7c4ac5fbb"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a6213479f7a6d9314d8879f8856b0b6fb" id="r_a6213479f7a6d9314d8879f8856b0b6fb"><td class="memItemLeft" align="right" valign="top">static constant constexpr const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6213479f7a6d9314d8879f8856b0b6fb">n_reads</a></td></tr>
@@ -143,8 +145,8 @@ Static Public Attributes</h2></td></tr>
 <tr class="separator:a31e14175f3d4902d9fe5ab5a219f61ba"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
-<a id="af59b054750a65e7e79c1cd05c4acac93" name="af59b054750a65e7e79c1cd05c4acac93"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af59b054750a65e7e79c1cd05c4acac93">&#9670;&#160;</a></span>QuantizedBlockLoader()</h2>
+<a id="a60713ce7498aa683cbb2a0f19ab16589" name="a60713ce7498aa683cbb2a0f19ab16589"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a60713ce7498aa683cbb2a0f19ab16589">&#9670;&#160;</a></span>QuantizedBlockLoader()</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -157,7 +159,7 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
         <tr>
           <td class="memname"><a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;<a class="el" href="struct_quantized_block_loader.html">::QuantizedBlockLoader</a> </td>
           <td>(</td>
-          <td class="paramtype">const device uint32_t *</td>          <td class="paramname"><span class="paramname"><em>src_</em></span>, </td>
+          <td class="paramtype">const device uint8_t *</td>          <td class="paramname"><span class="paramname"><em>src_</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -352,6 +354,30 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="ad00fe6d8bd395206a41693a8ed65d4db" name="ad00fe6d8bd395206a41693a8ed65d4db"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ad00fe6d8bd395206a41693a8ed65d4db">&#9670;&#160;</a></span>bytes_per_pack</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short group_size, short bits&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">constant constexpr const short <a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;::bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="a9857214690fe6abad0e19d1045152f83" name="a9857214690fe6abad0e19d1045152f83"></a>
@@ -449,7 +475,7 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 </div><div class="memdoc">
 <b>Initial value:</b><div class="fragment"><div class="line">=</div>
 <div class="line">      (<a class="code hl_variable" href="#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS &lt; tgp_size) ? 1 : (<a class="code hl_variable" href="#a1392a5278cf6e090ea80ebe7c4ac5fbb">BCOLS_PACKED</a> * BROWS) / tgp_size</div>
-<div class="ttc" id="astruct_quantized_block_loader_html_a1392a5278cf6e090ea80ebe7c4ac5fbb"><div class="ttname"><a href="#a1392a5278cf6e090ea80ebe7c4ac5fbb">QuantizedBlockLoader::BCOLS_PACKED</a></div><div class="ttdeci">static constant constexpr const short BCOLS_PACKED</div><div class="ttdef"><b>Definition</b> quantized.h:274</div></div>
+<div class="ttc" id="astruct_quantized_block_loader_html_a1392a5278cf6e090ea80ebe7c4ac5fbb"><div class="ttname"><a href="#a1392a5278cf6e090ea80ebe7c4ac5fbb">QuantizedBlockLoader::BCOLS_PACKED</a></div><div class="ttdeci">static constant constexpr const short BCOLS_PACKED</div><div class="ttdef"><b>Definition</b> quantized.h:456</div></div>
 </div><!-- fragment -->
 </div>
 </div>
@@ -465,7 +491,7 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant constexpr const short <a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;::pack_factor = 32 / bits</td>
+          <td class="memname">constant constexpr const short <a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;::pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits</td>
         </tr>
       </table>
   </td>
@@ -493,8 +519,8 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 
 </div>
 </div>
-<a id="ad85c6b7e07c81307b3b91eb4dd7be30b" name="ad85c6b7e07c81307b3b91eb4dd7be30b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ad85c6b7e07c81307b3b91eb4dd7be30b">&#9670;&#160;</a></span>src</h2>
+<a id="abbf8249ca99e3e87b296ddd60a984b76" name="abbf8249ca99e3e87b296ddd60a984b76"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abbf8249ca99e3e87b296ddd60a984b76">&#9670;&#160;</a></span>src</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -502,7 +528,7 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short group_size, short bits&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">const device uint32_t* <a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;::src</td>
+          <td class="memname">const device uint8_t* <a class="el" href="struct_quantized_block_loader.html">QuantizedBlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, group_size, bits &gt;::src</td>
         </tr>
       </table>
 </div><div class="memdoc">
diff --git a/docs/build/html/struct_sub_op-members.html b/docs/build/html/struct_sub_op-members.html
new file mode 100644
index 000000000..4e6db215f
--- /dev/null
+++ b/docs/build/html/struct_sub_op-members.html
@@ -0,0 +1,101 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">SubOp Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="struct_sub_op.html">SubOp</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143">apply</a>(T x, T y)</td><td class="entry"><a class="el" href="struct_sub_op.html">SubOp</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_sub_op.html b/docs/build/html/struct_sub_op.html
new file mode 100644
index 000000000..e89010e24
--- /dev/null
+++ b/docs/build/html/struct_sub_op.html
@@ -0,0 +1,144 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: SubOp Struct Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-static-methods">Static Public Member Functions</a> &#124;
+<a href="struct_sub_op-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">SubOp Struct Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
+Static Public Member Functions</h2></td></tr>
+<tr class="memitem:ad211f879a212ed0e98136217ca8e4143" id="r_ad211f879a212ed0e98136217ca8e4143"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:ad211f879a212ed0e98136217ca8e4143"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr T&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad211f879a212ed0e98136217ca8e4143">apply</a> (T x, T y)</td></tr>
+<tr class="separator:ad211f879a212ed0e98136217ca8e4143"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="ad211f879a212ed0e98136217ca8e4143" name="ad211f879a212ed0e98136217ca8e4143"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ad211f879a212ed0e98136217ca8e4143">&#9670;&#160;</a></span>apply()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr T SubOp::apply </td>
+          <td>(</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/kernels/<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_sum_op-members.html b/docs/build/html/struct_sum_op-members.html
new file mode 100644
index 000000000..27df88b0f
--- /dev/null
+++ b/docs/build/html/struct_sum_op-members.html
@@ -0,0 +1,101 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">SumOp Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="struct_sum_op.html">SumOp</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d">apply</a>(T x, T y)</td><td class="entry"><a class="el" href="struct_sum_op.html">SumOp</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_sum_op.html b/docs/build/html/struct_sum_op.html
new file mode 100644
index 000000000..2fcb18d96
--- /dev/null
+++ b/docs/build/html/struct_sum_op.html
@@ -0,0 +1,144 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: SumOp Struct Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-static-methods">Static Public Member Functions</a> &#124;
+<a href="struct_sum_op-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">SumOp Struct Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
+Static Public Member Functions</h2></td></tr>
+<tr class="memitem:aa9563a98cbbe1b1921ade0c63ab38b4d" id="r_aa9563a98cbbe1b1921ade0c63ab38b4d"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:aa9563a98cbbe1b1921ade0c63ab38b4d"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr T&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa9563a98cbbe1b1921ade0c63ab38b4d">apply</a> (T x, T y)</td></tr>
+<tr class="separator:aa9563a98cbbe1b1921ade0c63ab38b4d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="aa9563a98cbbe1b1921ade0c63ab38b4d" name="aa9563a98cbbe1b1921ade0c63ab38b4d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa9563a98cbbe1b1921ade0c63ab38b4d">&#9670;&#160;</a></span>apply()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr T SumOp::apply </td>
+          <td>(</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/kernels/<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_m_l_x_scaled_dot_product_attention_params-members.html b/docs/build/html/struct_transform_scale-members.html
similarity index 68%
rename from docs/build/html/struct_m_l_x_scaled_dot_product_attention_params-members.html
rename to docs/build/html/struct_transform_scale-members.html
index ea40d7a98..a42a65ec8 100644
--- a/docs/build/html/struct_m_l_x_scaled_dot_product_attention_params-members.html
+++ b/docs/build/html/struct_transform_scale-members.html
@@ -84,17 +84,15 @@ $(function(){ initResizable(false); });
 </div><!-- top -->
 <div id="doc-content">
 <div class="header">
-  <div class="headertitle"><div class="title">MLXScaledDotProductAttentionParams Member List</div></div>
+  <div class="headertitle"><div class="title">TransformScale&lt; T &gt; Member List</div></div>
 </div><!--header-->
 <div class="contents">
 
-<p>This is the complete list of members for <a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a>, including all inherited members.</p>
+<p>This is the complete list of members for <a class="el" href="struct_transform_scale.html">TransformScale&lt; T &gt;</a>, including all inherited members.</p>
 <table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a7461e0e17cdc7d3fed80bb00d58d8644">INV_ALPHA</a></td><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a58ef2765fd681e6b35b2ba72030610e0">KV_TILES</a></td><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a68a292b9986c20560aca88394f82e9f7">N_KV_HEADS</a></td><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a1a63d2e7ad712b4ba26219c784c95177">N_Q_HEADS</a></td><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html#a46cc2da6a069d822f36983ee18467e5c">QUERY_SEQUENCE_LENGTH</a></td><td class="entry"><a class="el" href="struct_m_l_x_scaled_dot_product_attention_params.html">MLXScaledDotProductAttentionParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16">apply</a>(T x) const</td><td class="entry"><a class="el" href="struct_transform_scale.html">TransformScale&lt; T &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6">scale</a></td><td class="entry"><a class="el" href="struct_transform_scale.html">TransformScale&lt; T &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70">TransformScale</a>(T scale_)</td><td class="entry"><a class="el" href="struct_transform_scale.html">TransformScale&lt; T &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html b/docs/build/html/struct_transform_scale.html
similarity index 50%
rename from docs/build/html/structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html
rename to docs/build/html/struct_transform_scale.html
index d1bf2a22d..9432afa3d 100644
--- a/docs/build/html/structlooped__elem__to__loc_3_010_00_01offset__t_01_4.html
+++ b/docs/build/html/struct_transform_scale.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=11"/>
 <meta name="generator" content="Doxygen 1.12.0"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: looped_elem_to_loc&lt; 0, offset_t &gt; Struct Template Reference</title>
+<title>MLX: TransformScale&lt; T &gt; Struct Template Reference</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -86,53 +86,71 @@ $(function(){ initResizable(false); });
 <div class="header">
   <div class="summary">
 <a href="#pub-methods">Public Member Functions</a> &#124;
-<a href="structlooped__elem__to__loc_3_010_00_01offset__t_01_4-members.html">List of all members</a>  </div>
-  <div class="headertitle"><div class="title">looped_elem_to_loc&lt; 0, offset_t &gt; Struct Template Reference</div></div>
+<a href="#pub-attribs">Public Attributes</a> &#124;
+<a href="struct_transform_scale-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">TransformScale&lt; T &gt; Struct Template Reference</div></div>
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2utils_8h_source.html">utils.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
-<tr class="memitem:aa1e9e1009c16befb9a730835836436e0" id="r_aa1e9e1009c16befb9a730835836436e0"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa1e9e1009c16befb9a730835836436e0">next</a> (const constant int *, const constant size_t *)</td></tr>
-<tr class="separator:aa1e9e1009c16befb9a730835836436e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1064cdfdcef779b5628ce5357a6fe4f0" id="r_a1064cdfdcef779b5628ce5357a6fe4f0"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1064cdfdcef779b5628ce5357a6fe4f0">next</a> (int, const constant int *, const constant size_t *)</td></tr>
-<tr class="separator:a1064cdfdcef779b5628ce5357a6fe4f0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8c7aaffda0ca500d9f9566e5e74217a2" id="r_a8c7aaffda0ca500d9f9566e5e74217a2"><td class="memItemLeft" align="right" valign="top">offset_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8c7aaffda0ca500d9f9566e5e74217a2">location</a> (offset_t idx, const constant int *shape, const constant size_t *strides, int ndim)</td></tr>
-<tr class="separator:a8c7aaffda0ca500d9f9566e5e74217a2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae109cf7c963ba13df96977e7563f7b70" id="r_ae109cf7c963ba13df96977e7563f7b70"><td class="memItemLeft" align="right" valign="top">METAL_FUNC&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae109cf7c963ba13df96977e7563f7b70">TransformScale</a> (T scale_)</td></tr>
+<tr class="separator:ae109cf7c963ba13df96977e7563f7b70"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9dd329422e5b8da43486cdce17132e16" id="r_a9dd329422e5b8da43486cdce17132e16"><td class="memItemLeft" align="right" valign="top">METAL_FUNC T&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9dd329422e5b8da43486cdce17132e16">apply</a> (T x) const</td></tr>
+<tr class="separator:a9dd329422e5b8da43486cdce17132e16"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
+Public Attributes</h2></td></tr>
+<tr class="memitem:aa56b8e107acf16fdf77006625c2b8bc6" id="r_aa56b8e107acf16fdf77006625c2b8bc6"><td class="memItemLeft" align="right" valign="top">T&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa56b8e107acf16fdf77006625c2b8bc6">scale</a></td></tr>
+<tr class="separator:aa56b8e107acf16fdf77006625c2b8bc6"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
+<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
+<a id="ae109cf7c963ba13df96977e7563f7b70" name="ae109cf7c963ba13df96977e7563f7b70"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae109cf7c963ba13df96977e7563f7b70">&#9670;&#160;</a></span>TransformScale()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC <a class="el" href="struct_transform_scale.html">TransformScale</a>&lt; T &gt;<a class="el" href="struct_transform_scale.html">::TransformScale</a> </td>
+          <td>(</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>scale_</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
 <h2 class="groupheader">Member Function Documentation</h2>
-<a id="a8c7aaffda0ca500d9f9566e5e74217a2" name="a8c7aaffda0ca500d9f9566e5e74217a2"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a8c7aaffda0ca500d9f9566e5e74217a2">&#9670;&#160;</a></span>location()</h2>
+<a id="a9dd329422e5b8da43486cdce17132e16" name="a9dd329422e5b8da43486cdce17132e16"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a9dd329422e5b8da43486cdce17132e16">&#9670;&#160;</a></span>apply()</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename offset_t &gt; </div>
+template&lt;typename T &gt; </div>
 <table class="mlabels">
   <tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">offset_t <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; 0, offset_t &gt;::location </td>
+          <td class="memname">METAL_FUNC T <a class="el" href="struct_transform_scale.html">TransformScale</a>&lt; T &gt;::apply </td>
           <td>(</td>
-          <td class="paramtype">offset_t</td>          <td class="paramname"><span class="paramname"><em>idx</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em>shape</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const constant size_t *</td>          <td class="paramname"><span class="paramname"><em>strides</em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>ndim</em></span>&#160;)</td>
+          <td class="paramtype">T</td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td> const</td>
         </tr>
       </table>
   </td>
@@ -144,75 +162,25 @@ template&lt;typename offset_t &gt; </div>
 
 </div>
 </div>
-<a id="aa1e9e1009c16befb9a730835836436e0" name="aa1e9e1009c16befb9a730835836436e0"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa1e9e1009c16befb9a730835836436e0">&#9670;&#160;</a></span>next() <span class="overload">[1/2]</span></h2>
+<h2 class="groupheader">Member Data Documentation</h2>
+<a id="aa56b8e107acf16fdf77006625c2b8bc6" name="aa56b8e107acf16fdf77006625c2b8bc6"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa56b8e107acf16fdf77006625c2b8bc6">&#9670;&#160;</a></span>scale</h2>
 
 <div class="memitem">
 <div class="memproto">
 <div class="memtemplate">
-template&lt;typename offset_t &gt; </div>
-<table class="mlabels">
-  <tr>
-  <td class="mlabels-left">
+template&lt;typename T &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">void <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; 0, offset_t &gt;::next </td>
-          <td>(</td>
-          <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const constant size_t *</td>          <td class="paramname"><span class="paramname"><em></em></span>&#160;)</td>
+          <td class="memname">T <a class="el" href="struct_transform_scale.html">TransformScale</a>&lt; T &gt;::scale</td>
         </tr>
       </table>
-  </td>
-  <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
-  </tr>
-</table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a1064cdfdcef779b5628ce5357a6fe4f0" name="a1064cdfdcef779b5628ce5357a6fe4f0"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1064cdfdcef779b5628ce5357a6fe4f0">&#9670;&#160;</a></span>next() <span class="overload">[2/2]</span></h2>
-
-<div class="memitem">
-<div class="memproto">
-<div class="memtemplate">
-template&lt;typename offset_t &gt; </div>
-<table class="mlabels">
-  <tr>
-  <td class="mlabels-left">
-      <table class="memname">
-        <tr>
-          <td class="memname">void <a class="el" href="structlooped__elem__to__loc.html">looped_elem_to_loc</a>&lt; 0, offset_t &gt;::next </td>
-          <td>(</td>
-          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const constant int *</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
-        </tr>
-        <tr>
-          <td class="paramkey"></td>
-          <td></td>
-          <td class="paramtype">const constant size_t *</td>          <td class="paramname"><span class="paramname"><em></em></span>&#160;)</td>
-        </tr>
-      </table>
-  </td>
-  <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
-  </tr>
-</table>
 </div><div class="memdoc">
 
 </div>
 </div>
 <hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/<a class="el" href="backend_2metal_2kernels_2utils_8h_source.html">utils.h</a></li>
+<li>mlx/backend/metal/kernels/steel/attn/kernels/<a class="el" href="steel__attention_8h_source.html">steel_attention.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html b/docs/build/html/structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html
index 7ab337e49..e5093b188 100644
--- a/docs/build/html/structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html
+++ b/docs/build/html/structmetal_1_1__numeric__limits__impl_3_01bfloat16__t_01_4.html
@@ -96,7 +96,7 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">bf16.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">bf16.h</a>&gt;</code></p>
 <div class="dynheader">
 Inheritance diagram for metal::_numeric_limits_impl&lt; bfloat16_t &gt;:</div>
 <div class="dyncontent">
@@ -106,23 +106,23 @@ Inheritance diagram for metal::_numeric_limits_impl&lt; bfloat16_t &gt;:</div>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
-<tr class="memitem:adaed80031f5ca0ff69d30ec4c5d0c98f" id="r_adaed80031f5ca0ff69d30ec4c5d0c98f"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adaed80031f5ca0ff69d30ec4c5d0c98f">min</a> ()</td></tr>
+<tr class="memitem:adaed80031f5ca0ff69d30ec4c5d0c98f" id="r_adaed80031f5ca0ff69d30ec4c5d0c98f"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adaed80031f5ca0ff69d30ec4c5d0c98f">min</a> ()</td></tr>
 <tr class="separator:adaed80031f5ca0ff69d30ec4c5d0c98f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae81c58b8223e504965183c99d19a2116" id="r_ae81c58b8223e504965183c99d19a2116"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae81c58b8223e504965183c99d19a2116">lowest</a> ()</td></tr>
+<tr class="memitem:ae81c58b8223e504965183c99d19a2116" id="r_ae81c58b8223e504965183c99d19a2116"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae81c58b8223e504965183c99d19a2116">lowest</a> ()</td></tr>
 <tr class="separator:ae81c58b8223e504965183c99d19a2116"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a92320d40a58218e40cc414986ac95c50" id="r_a92320d40a58218e40cc414986ac95c50"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a92320d40a58218e40cc414986ac95c50">max</a> ()</td></tr>
+<tr class="memitem:a92320d40a58218e40cc414986ac95c50" id="r_a92320d40a58218e40cc414986ac95c50"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a92320d40a58218e40cc414986ac95c50">max</a> ()</td></tr>
 <tr class="separator:a92320d40a58218e40cc414986ac95c50"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a96c4197e3076f0aa9065370b8ece49ca" id="r_a96c4197e3076f0aa9065370b8ece49ca"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a96c4197e3076f0aa9065370b8ece49ca">epsilon</a> ()</td></tr>
+<tr class="memitem:a96c4197e3076f0aa9065370b8ece49ca" id="r_a96c4197e3076f0aa9065370b8ece49ca"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a96c4197e3076f0aa9065370b8ece49ca">epsilon</a> ()</td></tr>
 <tr class="separator:a96c4197e3076f0aa9065370b8ece49ca"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:afa223448fa4f04c1113a85345dd720c3" id="r_afa223448fa4f04c1113a85345dd720c3"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afa223448fa4f04c1113a85345dd720c3">round_error</a> ()</td></tr>
+<tr class="memitem:afa223448fa4f04c1113a85345dd720c3" id="r_afa223448fa4f04c1113a85345dd720c3"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afa223448fa4f04c1113a85345dd720c3">round_error</a> ()</td></tr>
 <tr class="separator:afa223448fa4f04c1113a85345dd720c3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a61eb741e7af49046beb863abf023b206" id="r_a61eb741e7af49046beb863abf023b206"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a61eb741e7af49046beb863abf023b206">infinity</a> ()</td></tr>
+<tr class="memitem:a61eb741e7af49046beb863abf023b206" id="r_a61eb741e7af49046beb863abf023b206"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a61eb741e7af49046beb863abf023b206">infinity</a> ()</td></tr>
 <tr class="separator:a61eb741e7af49046beb863abf023b206"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aebeb07c01984be246bc2d1b8f8e4ac7b" id="r_aebeb07c01984be246bc2d1b8f8e4ac7b"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aebeb07c01984be246bc2d1b8f8e4ac7b">quiet_NaN</a> ()</td></tr>
+<tr class="memitem:aebeb07c01984be246bc2d1b8f8e4ac7b" id="r_aebeb07c01984be246bc2d1b8f8e4ac7b"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aebeb07c01984be246bc2d1b8f8e4ac7b">quiet_NaN</a> ()</td></tr>
 <tr class="separator:aebeb07c01984be246bc2d1b8f8e4ac7b"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad1f76a43c7d51a3765174aa6e0dd9f80" id="r_ad1f76a43c7d51a3765174aa6e0dd9f80"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad1f76a43c7d51a3765174aa6e0dd9f80">signaling_NaN</a> ()</td></tr>
+<tr class="memitem:ad1f76a43c7d51a3765174aa6e0dd9f80" id="r_ad1f76a43c7d51a3765174aa6e0dd9f80"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad1f76a43c7d51a3765174aa6e0dd9f80">signaling_NaN</a> ()</td></tr>
 <tr class="separator:ad1f76a43c7d51a3765174aa6e0dd9f80"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a6a9dbcba4dd79cad50876dda506b9eed" id="r_a6a9dbcba4dd79cad50876dda506b9eed"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6a9dbcba4dd79cad50876dda506b9eed">denorm_min</a> ()</td></tr>
+<tr class="memitem:a6a9dbcba4dd79cad50876dda506b9eed" id="r_a6a9dbcba4dd79cad50876dda506b9eed"><td class="memItemLeft" align="right" valign="top">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6a9dbcba4dd79cad50876dda506b9eed">denorm_min</a> ()</td></tr>
 <tr class="separator:a6a9dbcba4dd79cad50876dda506b9eed"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-attribs" name="pub-static-attribs"></a>
@@ -155,7 +155,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::denorm_min </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::denorm_min </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -180,7 +180,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::epsilon </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::epsilon </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -205,7 +205,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::infinity </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::infinity </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -230,7 +230,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::lowest </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::lowest </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -255,7 +255,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -280,7 +280,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -305,7 +305,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::quiet_NaN </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::quiet_NaN </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -330,7 +330,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::round_error </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::round_error </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -355,7 +355,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::signaling_NaN </td>
+          <td class="memname">static constexpr <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::signaling_NaN </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -381,7 +381,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::digits = 8</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::digits = 8</td>
         </tr>
       </table>
   </td>
@@ -403,7 +403,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::digits10 = 2</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::digits10 = 2</td>
         </tr>
       </table>
   </td>
@@ -425,7 +425,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max_digits10 = 4</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max_digits10 = 4</td>
         </tr>
       </table>
   </td>
@@ -447,7 +447,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max_exponent = 128</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max_exponent = 128</td>
         </tr>
       </table>
   </td>
@@ -469,7 +469,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max_exponent10 = 38</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::max_exponent10 = 38</td>
         </tr>
       </table>
   </td>
@@ -491,7 +491,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min_exponent = -125</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min_exponent = -125</td>
         </tr>
       </table>
   </td>
@@ -513,7 +513,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min_exponent10 = -37</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::min_exponent10 = -37</td>
         </tr>
       </table>
   </td>
@@ -535,7 +535,7 @@ Static Public Attributes</h2></td></tr>
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::radix = 2</td>
+          <td class="memname">constant int metal::_numeric_limits_impl&lt; <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &gt;::radix = 2</td>
         </tr>
       </table>
   </td>
@@ -548,7 +548,7 @@ Static Public Attributes</h2></td></tr>
 </div>
 </div>
 <hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/<a class="el" href="backend_2metal_2kernels_2bf16_8h_source.html">bf16.h</a></li>
+<li>mlx/backend/metal/kernels/metal_3_0/<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h_source.html">bf16.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmlx_1_1core_1_1_print_formatter.html b/docs/build/html/structmlx_1_1core_1_1_print_formatter.html
index 03a8c1e0d..448f6cf61 100644
--- a/docs/build/html/structmlx_1_1core_1_1_print_formatter.html
+++ b/docs/build/html/structmlx_1_1core_1_1_print_formatter.html
@@ -116,7 +116,7 @@ Public Member Functions</h2></td></tr>
 <tr class="separator:ac59a5137ddd8b32aae057bb9826ee80d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ac4b7895d1168cfc1a3d1186d8a414d2f" id="r_ac4b7895d1168cfc1a3d1186d8a414d2f"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac4b7895d1168cfc1a3d1186d8a414d2f">print</a> (std::ostream &amp;os, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> val)</td></tr>
 <tr class="separator:ac4b7895d1168cfc1a3d1186d8a414d2f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae21005f92bc641f2d657096f5d176a6d" id="r_ae21005f92bc641f2d657096f5d176a6d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae21005f92bc641f2d657096f5d176a6d">print</a> (std::ostream &amp;os, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> val)</td></tr>
+<tr class="memitem:ae21005f92bc641f2d657096f5d176a6d" id="r_ae21005f92bc641f2d657096f5d176a6d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae21005f92bc641f2d657096f5d176a6d">print</a> (std::ostream &amp;os, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> val)</td></tr>
 <tr class="separator:ae21005f92bc641f2d657096f5d176a6d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a57af5c32561b95d6ac2a3a1dc4f5d43e" id="r_a57af5c32561b95d6ac2a3a1dc4f5d43e"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a57af5c32561b95d6ac2a3a1dc4f5d43e">print</a> (std::ostream &amp;os, float val)</td></tr>
 <tr class="separator:a57af5c32561b95d6ac2a3a1dc4f5d43e"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -146,7 +146,7 @@ Public Attributes</h2></td></tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>val</em></span>&#160;)</td>
+          <td class="paramtype"><a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a></td>          <td class="paramname"><span class="paramname"><em>val</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
diff --git a/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder-members.html b/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder-members.html
index 981601231..b6705a302 100644
--- a/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder-members.html
+++ b/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder-members.html
@@ -96,16 +96,22 @@ $(function(){ initResizable(false); });
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a2334774486f447213ee997e55c2e52a3">CommandEncoder</a>(MTL::CommandBuffer *cbuf)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ac68ca977b5bde5434284ce7979647f14">CommandEncoder</a>(const CommandEncoder &amp;)=delete</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a74bcd8e35f80f5a62db48c4a2bb0173e">dispatchThreadgroups</a>(MTL::Size grid_dims, MTL::Size group_dims)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a1e41477f2f489e38499f7830a91c9810">dispatchThreads</a>(MTL::Size grid_dims, MTL::Size group_dims)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a85796b2bf41dbf347ae0978d4660600d">dispatch_threadgroups</a>(MTL::Size grid_dims, MTL::Size group_dims)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a0a8501b940e5a347475fa4bc38fb4c05">dispatch_threads</a>(MTL::Size grid_dims, MTL::Size group_dims)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a27ded7e54bc1712063c874646b445509">inputs</a>()</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ad538ae88f90560063f9ba502e2795991">maybeInsertBarrier</a>()</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aac45ab0630ea32cf7d15c7ba3e229966">operator-&gt;</a>()</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e">operator=</a>(const CommandEncoder &amp;)=delete</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">outputs</a>()</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a3f42a1362b4a513fa89e7b3dcc570a8e">operator=</a>(const CommandEncoder &amp;)=delete</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefa48740fdee884f02e2d379bca4e78f">outputs</a>()</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9c343f791812a45c6c03a5c9f27f74d5">set_bytes</a>(const T *v, int n, int idx)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#abc52d18ea87d213c47fd26062c829849">set_bytes</a>(const T &amp;v, int idx)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6d4c03a6585deedb5ccd1a1057d0c6ef">set_compute_pipeline_state</a>(MTL::ComputePipelineState *kernel)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#ab69ff0d7f14b9b59db4df0608193dce4">set_input_array</a>(const array &amp;a, int idx, int64_t offset=0)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a6a2e28e542eaa2886041bddd51ff6522">set_output_array</a>(array &amp;a, int idx, int64_t offset=0)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a68c3c6a036e11ec40211c09811bbed1b">set_vector_bytes</a>(const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a7375adf9ee5355bcf4b7f5f210efd115">set_vector_bytes</a>(const std::vector&lt; T &gt; &amp;vec, int idx)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a48b548a0b15f9d1279c938a1c6167034">start_concurrent</a>()</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aeef08f5f3c015578d40de756a6465aa2">update_fence</a>(MTL::Fence *fence)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#aefdadbff4e003dc6f77506840babc088">wait_for_fence</a>(MTL::Fence *fence)</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html#a9b6dd221ccd2d939d544004cb6279198">~CommandEncoder</a>()</td><td class="entry"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">mlx::core::metal::CommandEncoder</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder.html b/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder.html
index ee359fde9..b26bf1f81 100644
--- a/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder.html
+++ b/docs/build/html/structmlx_1_1core_1_1metal_1_1_command_encoder.html
@@ -111,18 +111,34 @@ Public Member Functions</h2></td></tr>
 <tr class="separator:ac68ca977b5bde5434284ce7979647f14"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a3f42a1362b4a513fa89e7b3dcc570a8e" id="r_a3f42a1362b4a513fa89e7b3dcc570a8e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3f42a1362b4a513fa89e7b3dcc570a8e">operator=</a> (const <a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder.html">CommandEncoder</a> &amp;)=delete</td></tr>
 <tr class="separator:a3f42a1362b4a513fa89e7b3dcc570a8e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aac45ab0630ea32cf7d15c7ba3e229966" id="r_aac45ab0630ea32cf7d15c7ba3e229966"><td class="memItemLeft" align="right" valign="top">MTL::ComputeCommandEncoder *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aac45ab0630ea32cf7d15c7ba3e229966">operator-&gt;</a> ()</td></tr>
-<tr class="separator:aac45ab0630ea32cf7d15c7ba3e229966"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ab69ff0d7f14b9b59db4df0608193dce4" id="r_ab69ff0d7f14b9b59db4df0608193dce4"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab69ff0d7f14b9b59db4df0608193dce4">set_input_array</a> (const <a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, int idx, int64_t offset=0)</td></tr>
 <tr class="separator:ab69ff0d7f14b9b59db4df0608193dce4"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a6a2e28e542eaa2886041bddd51ff6522" id="r_a6a2e28e542eaa2886041bddd51ff6522"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6a2e28e542eaa2886041bddd51ff6522">set_output_array</a> (<a class="el" href="classmlx_1_1core_1_1array.html">array</a> &amp;a, int idx, int64_t offset=0)</td></tr>
 <tr class="separator:a6a2e28e542eaa2886041bddd51ff6522"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a74bcd8e35f80f5a62db48c4a2bb0173e" id="r_a74bcd8e35f80f5a62db48c4a2bb0173e"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a74bcd8e35f80f5a62db48c4a2bb0173e">dispatchThreadgroups</a> (MTL::Size grid_dims, MTL::Size group_dims)</td></tr>
-<tr class="separator:a74bcd8e35f80f5a62db48c4a2bb0173e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1e41477f2f489e38499f7830a91c9810" id="r_a1e41477f2f489e38499f7830a91c9810"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1e41477f2f489e38499f7830a91c9810">dispatchThreads</a> (MTL::Size grid_dims, MTL::Size group_dims)</td></tr>
-<tr class="separator:a1e41477f2f489e38499f7830a91c9810"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a85796b2bf41dbf347ae0978d4660600d" id="r_a85796b2bf41dbf347ae0978d4660600d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a85796b2bf41dbf347ae0978d4660600d">dispatch_threadgroups</a> (MTL::Size grid_dims, MTL::Size group_dims)</td></tr>
+<tr class="separator:a85796b2bf41dbf347ae0978d4660600d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0a8501b940e5a347475fa4bc38fb4c05" id="r_a0a8501b940e5a347475fa4bc38fb4c05"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0a8501b940e5a347475fa4bc38fb4c05">dispatch_threads</a> (MTL::Size grid_dims, MTL::Size group_dims)</td></tr>
+<tr class="separator:a0a8501b940e5a347475fa4bc38fb4c05"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ad538ae88f90560063f9ba502e2795991" id="r_ad538ae88f90560063f9ba502e2795991"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad538ae88f90560063f9ba502e2795991">maybeInsertBarrier</a> ()</td></tr>
 <tr class="separator:ad538ae88f90560063f9ba502e2795991"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6d4c03a6585deedb5ccd1a1057d0c6ef" id="r_a6d4c03a6585deedb5ccd1a1057d0c6ef"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6d4c03a6585deedb5ccd1a1057d0c6ef">set_compute_pipeline_state</a> (MTL::ComputePipelineState *kernel)</td></tr>
+<tr class="separator:a6d4c03a6585deedb5ccd1a1057d0c6ef"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aefdadbff4e003dc6f77506840babc088" id="r_aefdadbff4e003dc6f77506840babc088"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aefdadbff4e003dc6f77506840babc088">wait_for_fence</a> (MTL::Fence *fence)</td></tr>
+<tr class="separator:aefdadbff4e003dc6f77506840babc088"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aeef08f5f3c015578d40de756a6465aa2" id="r_aeef08f5f3c015578d40de756a6465aa2"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aeef08f5f3c015578d40de756a6465aa2">update_fence</a> (MTL::Fence *fence)</td></tr>
+<tr class="separator:aeef08f5f3c015578d40de756a6465aa2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a68c3c6a036e11ec40211c09811bbed1b" id="r_a68c3c6a036e11ec40211c09811bbed1b"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a68c3c6a036e11ec40211c09811bbed1b"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a68c3c6a036e11ec40211c09811bbed1b">set_vector_bytes</a> (const std::vector&lt; T &gt; &amp;vec, size_t nelems, int idx)</td></tr>
+<tr class="separator:a68c3c6a036e11ec40211c09811bbed1b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7375adf9ee5355bcf4b7f5f210efd115" id="r_a7375adf9ee5355bcf4b7f5f210efd115"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a7375adf9ee5355bcf4b7f5f210efd115"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a7375adf9ee5355bcf4b7f5f210efd115">set_vector_bytes</a> (const std::vector&lt; T &gt; &amp;vec, int idx)</td></tr>
+<tr class="separator:a7375adf9ee5355bcf4b7f5f210efd115"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9c343f791812a45c6c03a5c9f27f74d5" id="r_a9c343f791812a45c6c03a5c9f27f74d5"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:a9c343f791812a45c6c03a5c9f27f74d5"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a9c343f791812a45c6c03a5c9f27f74d5">set_bytes</a> (const T *v, int n, int idx)</td></tr>
+<tr class="separator:a9c343f791812a45c6c03a5c9f27f74d5"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abc52d18ea87d213c47fd26062c829849" id="r_abc52d18ea87d213c47fd26062c829849"><td class="memTemplParams" colspan="2">template&lt;typename T &gt; </td></tr>
+<tr class="memitem:abc52d18ea87d213c47fd26062c829849"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#abc52d18ea87d213c47fd26062c829849">set_bytes</a> (const T &amp;v, int idx)</td></tr>
+<tr class="separator:abc52d18ea87d213c47fd26062c829849"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a48b548a0b15f9d1279c938a1c6167034" id="r_a48b548a0b15f9d1279c938a1c6167034"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1metal_1_1_command_encoder_1_1_concurrent_context.html">ConcurrentContext</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a48b548a0b15f9d1279c938a1c6167034">start_concurrent</a> ()</td></tr>
 <tr class="separator:a48b548a0b15f9d1279c938a1c6167034"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a9b6dd221ccd2d939d544004cb6279198" id="r_a9b6dd221ccd2d939d544004cb6279198"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9b6dd221ccd2d939d544004cb6279198">~CommandEncoder</a> ()</td></tr>
@@ -193,14 +209,14 @@ Public Member Functions</h2></td></tr>
 </div>
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
-<a id="a74bcd8e35f80f5a62db48c4a2bb0173e" name="a74bcd8e35f80f5a62db48c4a2bb0173e"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a74bcd8e35f80f5a62db48c4a2bb0173e">&#9670;&#160;</a></span>dispatchThreadgroups()</h2>
+<a id="a85796b2bf41dbf347ae0978d4660600d" name="a85796b2bf41dbf347ae0978d4660600d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a85796b2bf41dbf347ae0978d4660600d">&#9670;&#160;</a></span>dispatch_threadgroups()</h2>
 
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">void mlx::core::metal::CommandEncoder::dispatchThreadgroups </td>
+          <td class="memname">void mlx::core::metal::CommandEncoder::dispatch_threadgroups </td>
           <td>(</td>
           <td class="paramtype">MTL::Size</td>          <td class="paramname"><span class="paramname"><em>grid_dims</em></span>, </td>
         </tr>
@@ -214,14 +230,14 @@ Public Member Functions</h2></td></tr>
 
 </div>
 </div>
-<a id="a1e41477f2f489e38499f7830a91c9810" name="a1e41477f2f489e38499f7830a91c9810"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1e41477f2f489e38499f7830a91c9810">&#9670;&#160;</a></span>dispatchThreads()</h2>
+<a id="a0a8501b940e5a347475fa4bc38fb4c05" name="a0a8501b940e5a347475fa4bc38fb4c05"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a0a8501b940e5a347475fa4bc38fb4c05">&#9670;&#160;</a></span>dispatch_threads()</h2>
 
 <div class="memitem">
 <div class="memproto">
       <table class="memname">
         <tr>
-          <td class="memname">void mlx::core::metal::CommandEncoder::dispatchThreads </td>
+          <td class="memname">void mlx::core::metal::CommandEncoder::dispatch_threads </td>
           <td>(</td>
           <td class="paramtype">MTL::Size</td>          <td class="paramname"><span class="paramname"><em>grid_dims</em></span>, </td>
         </tr>
@@ -275,31 +291,6 @@ Public Member Functions</h2></td></tr>
       </table>
 </div><div class="memdoc">
 
-</div>
-</div>
-<a id="aac45ab0630ea32cf7d15c7ba3e229966" name="aac45ab0630ea32cf7d15c7ba3e229966"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aac45ab0630ea32cf7d15c7ba3e229966">&#9670;&#160;</a></span>operator-&gt;()</h2>
-
-<div class="memitem">
-<div class="memproto">
-<table class="mlabels">
-  <tr>
-  <td class="mlabels-left">
-      <table class="memname">
-        <tr>
-          <td class="memname">MTL::ComputeCommandEncoder * mlx::core::metal::CommandEncoder::operator-&gt; </td>
-          <td>(</td>
-          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
-          <td></td>
-        </tr>
-      </table>
-  </td>
-  <td class="mlabels-right">
-<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
-  </tr>
-</table>
-</div><div class="memdoc">
-
 </div>
 </div>
 <a id="a3f42a1362b4a513fa89e7b3dcc570a8e" name="a3f42a1362b4a513fa89e7b3dcc570a8e"></a>
@@ -350,6 +341,98 @@ Public Member Functions</h2></td></tr>
 </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="abc52d18ea87d213c47fd26062c829849" name="abc52d18ea87d213c47fd26062c829849"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abc52d18ea87d213c47fd26062c829849">&#9670;&#160;</a></span>set_bytes() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::metal::CommandEncoder::set_bytes </td>
+          <td>(</td>
+          <td class="paramtype">const T &amp;</td>          <td class="paramname"><span class="paramname"><em>v</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>idx</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a9c343f791812a45c6c03a5c9f27f74d5" name="a9c343f791812a45c6c03a5c9f27f74d5"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a9c343f791812a45c6c03a5c9f27f74d5">&#9670;&#160;</a></span>set_bytes() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::metal::CommandEncoder::set_bytes </td>
+          <td>(</td>
+          <td class="paramtype">const T *</td>          <td class="paramname"><span class="paramname"><em>v</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>idx</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a6d4c03a6585deedb5ccd1a1057d0c6ef" name="a6d4c03a6585deedb5ccd1a1057d0c6ef"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6d4c03a6585deedb5ccd1a1057d0c6ef">&#9670;&#160;</a></span>set_compute_pipeline_state()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::metal::CommandEncoder::set_compute_pipeline_state </td>
+          <td>(</td>
+          <td class="paramtype">MTL::ComputePipelineState *</td>          <td class="paramname"><span class="paramname"><em>kernel</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="ab69ff0d7f14b9b59db4df0608193dce4" name="ab69ff0d7f14b9b59db4df0608193dce4"></a>
@@ -402,6 +485,73 @@ Public Member Functions</h2></td></tr>
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a7375adf9ee5355bcf4b7f5f210efd115" name="a7375adf9ee5355bcf4b7f5f210efd115"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7375adf9ee5355bcf4b7f5f210efd115">&#9670;&#160;</a></span>set_vector_bytes() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::metal::CommandEncoder::set_vector_bytes </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; T &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>vec</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>idx</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a68c3c6a036e11ec40211c09811bbed1b" name="a68c3c6a036e11ec40211c09811bbed1b"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a68c3c6a036e11ec40211c09811bbed1b">&#9670;&#160;</a></span>set_vector_bytes() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::metal::CommandEncoder::set_vector_bytes </td>
+          <td>(</td>
+          <td class="paramtype">const std::vector&lt; T &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>vec</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>nelems</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>idx</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="a48b548a0b15f9d1279c938a1c6167034" name="a48b548a0b15f9d1279c938a1c6167034"></a>
@@ -427,6 +577,56 @@ Public Member Functions</h2></td></tr>
 </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="aeef08f5f3c015578d40de756a6465aa2" name="aeef08f5f3c015578d40de756a6465aa2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aeef08f5f3c015578d40de756a6465aa2">&#9670;&#160;</a></span>update_fence()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::metal::CommandEncoder::update_fence </td>
+          <td>(</td>
+          <td class="paramtype">MTL::Fence *</td>          <td class="paramname"><span class="paramname"><em>fence</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aefdadbff4e003dc6f77506840babc088" name="aefdadbff4e003dc6f77506840babc088"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aefdadbff4e003dc6f77506840babc088">&#9670;&#160;</a></span>wait_for_fence()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">void mlx::core::metal::CommandEncoder::wait_for_fence </td>
+          <td>(</td>
+          <td class="paramtype">MTL::Fence *</td>          <td class="paramname"><span class="paramname"><em>fence</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <hr/>The documentation for this struct was generated from the following file:<ul>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_accum_helper-members.html b/docs/build/html/structmlx_1_1steel_1_1_accum_helper-members.html
index 5e66f523e..140212297 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_accum_helper-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_accum_helper-members.html
@@ -94,7 +94,7 @@ $(function(){ initResizable(false); });
 
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">mlx::steel::AccumHelper&lt; T &gt;</a>, including all inherited members.</p>
 <table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html#ab594958b88746f759aa7ca573f1903da">accum_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">mlx::steel::AccumHelper&lt; T &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html#ae52abf69e7ba6af1a73d65d57182ed26">accum_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">mlx::steel::AccumHelper&lt; T &gt;</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_accum_helper.html b/docs/build/html/structmlx_1_1steel_1_1_accum_helper.html
index a3e3e2583..bd7cc3cda 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_accum_helper.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_accum_helper.html
@@ -95,16 +95,16 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-types" name="pub-types"></a>
 Public Types</h2></td></tr>
-<tr class="memitem:ab594958b88746f759aa7ca573f1903da" id="r_ab594958b88746f759aa7ca573f1903da"><td class="memItemLeft" align="right" valign="top">typedef float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ab594958b88746f759aa7ca573f1903da">accum_type</a></td></tr>
-<tr class="separator:ab594958b88746f759aa7ca573f1903da"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae52abf69e7ba6af1a73d65d57182ed26" id="r_ae52abf69e7ba6af1a73d65d57182ed26"><td class="memItemLeft" align="right" valign="top">typedef float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae52abf69e7ba6af1a73d65d57182ed26">accum_type</a></td></tr>
+<tr class="separator:ae52abf69e7ba6af1a73d65d57182ed26"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Typedef Documentation</h2>
-<a id="ab594958b88746f759aa7ca573f1903da" name="ab594958b88746f759aa7ca573f1903da"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ab594958b88746f759aa7ca573f1903da">&#9670;&#160;</a></span>accum_type</h2>
+<a id="ae52abf69e7ba6af1a73d65d57182ed26" name="ae52abf69e7ba6af1a73d65d57182ed26"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae52abf69e7ba6af1a73d65d57182ed26">&#9670;&#160;</a></span>accum_type</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -112,14 +112,15 @@ Public Types</h2></td></tr>
 template&lt;typename T &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">float <a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">mlx::steel::AccumHelper</a>&lt; T &gt;::accum_type</td>
+          <td class="memname">typedef float <a class="el" href="structmlx_1_1steel_1_1_accum_helper.html">mlx::steel::AccumHelper</a>&lt; T &gt;::accum_type</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_attn_params-members.html b/docs/build/html/structmlx_1_1steel_1_1_attn_params-members.html
new file mode 100644
index 000000000..4074723b4
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_attn_params-members.html
@@ -0,0 +1,119 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">mlx::steel::AttnParams Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a1cba7fedbd02e157922619195997cf4f">B</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a07ae31628e43e09bce533c7682c8dae3">D</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a3b3e18cb993ab24819c852bc64288841">gqa_factor</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a3d286a0c27bace6016ed7a87f43291b7">H</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a03e5480d1cca6af541be54a8720e9974">K_strides</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a497b7404bcd25b535c3589c61f269f63">kL</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a68a66e3fafa922dcfd1ab1f6bdc2375e">NK</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#aaf953954274794cfcb4e35e82d681b58">NK_aligned</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a48575afc94ab9ff74deaba61464e57a1">NQ</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">NQ_aligned</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a33dc7fc22d2604a73af9f94eeea45bb4">O_strides</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a90bba215328201a37eb1c430ce9f8563">Q_strides</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#a59255882cbd78bb6f15e704e3a356a7f">qL</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#ad81bcd32e6ff8fec0000eca505fb6826">scale</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html#acc4860c3ce09c7230b470182ed002d3c">V_strides</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">mlx::steel::AttnParams</a></td><td class="entry"></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_attn_params.html b/docs/build/html/structmlx_1_1steel_1_1_attn_params.html
new file mode 100644
index 000000000..bbfa45e0c
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_attn_params.html
@@ -0,0 +1,399 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx::steel::AttnParams Struct Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_attn_params.html">AttnParams</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-attribs">Public Attributes</a> &#124;
+<a href="structmlx_1_1steel_1_1_attn_params-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">mlx::steel::AttnParams Struct Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="attn_2params_8h_source.html">params.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
+Public Attributes</h2></td></tr>
+<tr class="memitem:a1cba7fedbd02e157922619195997cf4f" id="r_a1cba7fedbd02e157922619195997cf4f"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1cba7fedbd02e157922619195997cf4f">B</a></td></tr>
+<tr class="memdesc:a1cba7fedbd02e157922619195997cf4f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Batch Size.  <br /></td></tr>
+<tr class="separator:a1cba7fedbd02e157922619195997cf4f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3d286a0c27bace6016ed7a87f43291b7" id="r_a3d286a0c27bace6016ed7a87f43291b7"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3d286a0c27bace6016ed7a87f43291b7">H</a></td></tr>
+<tr class="memdesc:a3d286a0c27bace6016ed7a87f43291b7"><td class="mdescLeft">&#160;</td><td class="mdescRight">Heads.  <br /></td></tr>
+<tr class="separator:a3d286a0c27bace6016ed7a87f43291b7"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a07ae31628e43e09bce533c7682c8dae3" id="r_a07ae31628e43e09bce533c7682c8dae3"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a07ae31628e43e09bce533c7682c8dae3">D</a></td></tr>
+<tr class="memdesc:a07ae31628e43e09bce533c7682c8dae3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Head Dim.  <br /></td></tr>
+<tr class="separator:a07ae31628e43e09bce533c7682c8dae3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a59255882cbd78bb6f15e704e3a356a7f" id="r_a59255882cbd78bb6f15e704e3a356a7f"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a59255882cbd78bb6f15e704e3a356a7f">qL</a></td></tr>
+<tr class="memdesc:a59255882cbd78bb6f15e704e3a356a7f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Query Sequence Length.  <br /></td></tr>
+<tr class="separator:a59255882cbd78bb6f15e704e3a356a7f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a497b7404bcd25b535c3589c61f269f63" id="r_a497b7404bcd25b535c3589c61f269f63"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a497b7404bcd25b535c3589c61f269f63">kL</a></td></tr>
+<tr class="memdesc:a497b7404bcd25b535c3589c61f269f63"><td class="mdescLeft">&#160;</td><td class="mdescRight">Key Sequence Length.  <br /></td></tr>
+<tr class="separator:a497b7404bcd25b535c3589c61f269f63"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3b3e18cb993ab24819c852bc64288841" id="r_a3b3e18cb993ab24819c852bc64288841"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3b3e18cb993ab24819c852bc64288841">gqa_factor</a></td></tr>
+<tr class="memdesc:a3b3e18cb993ab24819c852bc64288841"><td class="mdescLeft">&#160;</td><td class="mdescRight">Group Query factor.  <br /></td></tr>
+<tr class="separator:a3b3e18cb993ab24819c852bc64288841"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad81bcd32e6ff8fec0000eca505fb6826" id="r_ad81bcd32e6ff8fec0000eca505fb6826"><td class="memItemLeft" align="right" valign="top">float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad81bcd32e6ff8fec0000eca505fb6826">scale</a></td></tr>
+<tr class="memdesc:ad81bcd32e6ff8fec0000eca505fb6826"><td class="mdescLeft">&#160;</td><td class="mdescRight">Attention scale.  <br /></td></tr>
+<tr class="separator:ad81bcd32e6ff8fec0000eca505fb6826"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a48575afc94ab9ff74deaba61464e57a1" id="r_a48575afc94ab9ff74deaba61464e57a1"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a48575afc94ab9ff74deaba61464e57a1">NQ</a></td></tr>
+<tr class="memdesc:a48575afc94ab9ff74deaba61464e57a1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Number of query blocks.  <br /></td></tr>
+<tr class="separator:a48575afc94ab9ff74deaba61464e57a1"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a68a66e3fafa922dcfd1ab1f6bdc2375e" id="r_a68a66e3fafa922dcfd1ab1f6bdc2375e"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a68a66e3fafa922dcfd1ab1f6bdc2375e">NK</a></td></tr>
+<tr class="memdesc:a68a66e3fafa922dcfd1ab1f6bdc2375e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Number of key/value blocks.  <br /></td></tr>
+<tr class="separator:a68a66e3fafa922dcfd1ab1f6bdc2375e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a4cfd2ccb0fd7eb81c2a781a0614fdcbe" id="r_a4cfd2ccb0fd7eb81c2a781a0614fdcbe"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">NQ_aligned</a></td></tr>
+<tr class="memdesc:a4cfd2ccb0fd7eb81c2a781a0614fdcbe"><td class="mdescLeft">&#160;</td><td class="mdescRight">Number of full query blocks.  <br /></td></tr>
+<tr class="separator:a4cfd2ccb0fd7eb81c2a781a0614fdcbe"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aaf953954274794cfcb4e35e82d681b58" id="r_aaf953954274794cfcb4e35e82d681b58"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaf953954274794cfcb4e35e82d681b58">NK_aligned</a></td></tr>
+<tr class="memdesc:aaf953954274794cfcb4e35e82d681b58"><td class="mdescLeft">&#160;</td><td class="mdescRight">Number of full key/value blocks.  <br /></td></tr>
+<tr class="separator:aaf953954274794cfcb4e35e82d681b58"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a90bba215328201a37eb1c430ce9f8563" id="r_a90bba215328201a37eb1c430ce9f8563"><td class="memItemLeft" align="right" valign="top">size_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a90bba215328201a37eb1c430ce9f8563">Q_strides</a> [3]</td></tr>
+<tr class="memdesc:a90bba215328201a37eb1c430ce9f8563"><td class="mdescLeft">&#160;</td><td class="mdescRight">Query strides (B, H, L, D = 1)  <br /></td></tr>
+<tr class="separator:a90bba215328201a37eb1c430ce9f8563"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a03e5480d1cca6af541be54a8720e9974" id="r_a03e5480d1cca6af541be54a8720e9974"><td class="memItemLeft" align="right" valign="top">size_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a03e5480d1cca6af541be54a8720e9974">K_strides</a> [3]</td></tr>
+<tr class="memdesc:a03e5480d1cca6af541be54a8720e9974"><td class="mdescLeft">&#160;</td><td class="mdescRight">Key strides (B, H, L, D = 1)  <br /></td></tr>
+<tr class="separator:a03e5480d1cca6af541be54a8720e9974"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:acc4860c3ce09c7230b470182ed002d3c" id="r_acc4860c3ce09c7230b470182ed002d3c"><td class="memItemLeft" align="right" valign="top">size_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acc4860c3ce09c7230b470182ed002d3c">V_strides</a> [3]</td></tr>
+<tr class="memdesc:acc4860c3ce09c7230b470182ed002d3c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Value strides (B, H, L, D = 1)  <br /></td></tr>
+<tr class="separator:acc4860c3ce09c7230b470182ed002d3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a33dc7fc22d2604a73af9f94eeea45bb4" id="r_a33dc7fc22d2604a73af9f94eeea45bb4"><td class="memItemLeft" align="right" valign="top">size_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a33dc7fc22d2604a73af9f94eeea45bb4">O_strides</a> [3]</td></tr>
+<tr class="memdesc:a33dc7fc22d2604a73af9f94eeea45bb4"><td class="mdescLeft">&#160;</td><td class="mdescRight">Output strides (B, H, L, D = 1)  <br /></td></tr>
+<tr class="separator:a33dc7fc22d2604a73af9f94eeea45bb4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Data Documentation</h2>
+<a id="a1cba7fedbd02e157922619195997cf4f" name="a1cba7fedbd02e157922619195997cf4f"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1cba7fedbd02e157922619195997cf4f">&#9670;&#160;</a></span>B</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::B</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Batch Size. </p>
+
+</div>
+</div>
+<a id="a07ae31628e43e09bce533c7682c8dae3" name="a07ae31628e43e09bce533c7682c8dae3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a07ae31628e43e09bce533c7682c8dae3">&#9670;&#160;</a></span>D</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::D</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Head Dim. </p>
+
+</div>
+</div>
+<a id="a3b3e18cb993ab24819c852bc64288841" name="a3b3e18cb993ab24819c852bc64288841"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a3b3e18cb993ab24819c852bc64288841">&#9670;&#160;</a></span>gqa_factor</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::gqa_factor</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Group Query factor. </p>
+
+</div>
+</div>
+<a id="a3d286a0c27bace6016ed7a87f43291b7" name="a3d286a0c27bace6016ed7a87f43291b7"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a3d286a0c27bace6016ed7a87f43291b7">&#9670;&#160;</a></span>H</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::H</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Heads. </p>
+
+</div>
+</div>
+<a id="a03e5480d1cca6af541be54a8720e9974" name="a03e5480d1cca6af541be54a8720e9974"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a03e5480d1cca6af541be54a8720e9974">&#9670;&#160;</a></span>K_strides</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">size_t mlx::steel::AttnParams::K_strides[3]</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Key strides (B, H, L, D = 1) </p>
+
+</div>
+</div>
+<a id="a497b7404bcd25b535c3589c61f269f63" name="a497b7404bcd25b535c3589c61f269f63"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a497b7404bcd25b535c3589c61f269f63">&#9670;&#160;</a></span>kL</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::kL</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Key Sequence Length. </p>
+
+</div>
+</div>
+<a id="a68a66e3fafa922dcfd1ab1f6bdc2375e" name="a68a66e3fafa922dcfd1ab1f6bdc2375e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a68a66e3fafa922dcfd1ab1f6bdc2375e">&#9670;&#160;</a></span>NK</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::NK</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Number of key/value blocks. </p>
+
+</div>
+</div>
+<a id="aaf953954274794cfcb4e35e82d681b58" name="aaf953954274794cfcb4e35e82d681b58"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aaf953954274794cfcb4e35e82d681b58">&#9670;&#160;</a></span>NK_aligned</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::NK_aligned</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Number of full key/value blocks. </p>
+
+</div>
+</div>
+<a id="a48575afc94ab9ff74deaba61464e57a1" name="a48575afc94ab9ff74deaba61464e57a1"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a48575afc94ab9ff74deaba61464e57a1">&#9670;&#160;</a></span>NQ</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::NQ</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Number of query blocks. </p>
+
+</div>
+</div>
+<a id="a4cfd2ccb0fd7eb81c2a781a0614fdcbe" name="a4cfd2ccb0fd7eb81c2a781a0614fdcbe"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4cfd2ccb0fd7eb81c2a781a0614fdcbe">&#9670;&#160;</a></span>NQ_aligned</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::NQ_aligned</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Number of full query blocks. </p>
+
+</div>
+</div>
+<a id="a33dc7fc22d2604a73af9f94eeea45bb4" name="a33dc7fc22d2604a73af9f94eeea45bb4"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a33dc7fc22d2604a73af9f94eeea45bb4">&#9670;&#160;</a></span>O_strides</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">size_t mlx::steel::AttnParams::O_strides[3]</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Output strides (B, H, L, D = 1) </p>
+
+</div>
+</div>
+<a id="a90bba215328201a37eb1c430ce9f8563" name="a90bba215328201a37eb1c430ce9f8563"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a90bba215328201a37eb1c430ce9f8563">&#9670;&#160;</a></span>Q_strides</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">size_t mlx::steel::AttnParams::Q_strides[3]</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Query strides (B, H, L, D = 1) </p>
+
+</div>
+</div>
+<a id="a59255882cbd78bb6f15e704e3a356a7f" name="a59255882cbd78bb6f15e704e3a356a7f"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a59255882cbd78bb6f15e704e3a356a7f">&#9670;&#160;</a></span>qL</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">int mlx::steel::AttnParams::qL</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Query Sequence Length. </p>
+
+</div>
+</div>
+<a id="ad81bcd32e6ff8fec0000eca505fb6826" name="ad81bcd32e6ff8fec0000eca505fb6826"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ad81bcd32e6ff8fec0000eca505fb6826">&#9670;&#160;</a></span>scale</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">float mlx::steel::AttnParams::scale</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Attention scale. </p>
+
+</div>
+</div>
+<a id="acc4860c3ce09c7230b470182ed002d3c" name="acc4860c3ce09c7230b470182ed002d3c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#acc4860c3ce09c7230b470182ed002d3c">&#9670;&#160;</a></span>V_strides</h2>
+
+<div class="memitem">
+<div class="memproto">
+      <table class="memname">
+        <tr>
+          <td class="memname">size_t mlx::steel::AttnParams::V_strides[3]</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+<p>Value strides (B, H, L, D = 1) </p>
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2params_8h_source.html">params.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag.html b/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag.html
index 1258f1714..d66ae2c48 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag.html
@@ -92,9 +92,10 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="mma_8h_source.html">mma.h</a>&gt;</code></p>
-<hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="mma_8h_source.html">mma.h</a></li>
+<p><code>#include &lt;<a class="el" href="attn_2mma_8h_source.html">mma.h</a>&gt;</code></p>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2mma_8h_source.html">mma.h</a></li>
+<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_2mma_8h_source.html">mma.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4-members.html b/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4-members.html
index 7b0639562..7b840ba99 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4-members.html
@@ -94,7 +94,9 @@ $(function(){ initResizable(false); });
 
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a>, including all inherited members.</p>
 <table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aab8dd1c6917247da41dd3a31139a665f">col_frag_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">get_coord</a>(ushort simd_lane_id)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7331fff1d12f2f8b72b0006a3ad0dd83">get_coord</a>(ushort simd_lane_id)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a7c212200d86b4e93f274d99addf668bd">kElemCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a76aa5aa690dbcc954e957d767fad661f">kElemRows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
@@ -102,12 +104,21 @@ $(function(){ initResizable(false); });
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a211102315e2afbcfcd2e2c201b638e9f">kFragCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a2fe53db449c692226f23f6b99fb2c0d4">kFragRows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">load</a>(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ac73006b36fc710feda3a7c796e21415c">load</a>(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">load_safe</a>(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#ad22aaee4a2938cbdd315b39eda84e07d">load_safe</a>(thread frag_type &amp;dst, SrcPtrType src, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">mma</a>(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">mma</a>(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a8028512f5a3d2b6acaf966be529627a3">mma</a>(thread frag_type &amp;D, thread frag_type &amp;A, thread frag_type &amp;B, thread frag_type &amp;C)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1868f57d57c8adedab2c58492ec76946">mma</a>(thread mat_type &amp;D, thread mat_type &amp;A, thread mat_type &amp;B, thread mat_type &amp;C)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a318c4279bdc7b39b7919f108b1cd8010">row_bin_op</a>(thread frag_type &amp;inp_vals, thread T *row_vals)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a5ec2e40a8f5ad98c71b825544cdd878b">row_frag_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a51d662e4cff88b5ad17d7c44bb6b6970">row_reduce</a>(thread const frag_type &amp;inp_vals, thread T *reduced_vals)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a>(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a>(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">store_safe</a>(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html#a1f0b00daad8eba2f855bb306e70d2328">store_safe</a>(const thread frag_type &amp;src, DstPtrType dst, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=Int&lt; 0 &gt;{}, OffY off_y=Int&lt; 0 &gt;{})</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html">mlx::steel::BaseMMAFrag&lt; T, 8, 8 &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html b/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html
index 5c8cc5d22..9d08e5dba 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_base_m_m_a_frag_3_01_t_00_018_00_018_01_4.html
@@ -97,34 +97,62 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="mma_8h_source.html">mma.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="attn_2mma_8h_source.html">mma.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-types" name="pub-types"></a>
 Public Types</h2></td></tr>
-<tr class="memitem:a72054f003c695b90a4fe5101e19cbaa9" id="r_a72054f003c695b90a4fe5101e19cbaa9"><td class="memItemLeft" align="right" valign="top">typedef metal::simdgroup_matrix&lt; T, <a class="el" href="#a2fe53db449c692226f23f6b99fb2c0d4">kFragRows</a>, <a class="el" href="#a211102315e2afbcfcd2e2c201b638e9f">kFragCols</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a></td></tr>
-<tr class="separator:a72054f003c695b90a4fe5101e19cbaa9"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af67c1b1aea594468e9426e1be0e31d0b" id="r_af67c1b1aea594468e9426e1be0e31d0b"><td class="memItemLeft" align="right" valign="top">typedef metal::vec&lt; T, <a class="el" href="#a3c34dfdc944db110f4735f1b25307cf0">kElemsPerFrag</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a></td></tr>
-<tr class="separator:af67c1b1aea594468e9426e1be0e31d0b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a958b6952cbd9462d7ae9f6e029631887" id="r_a958b6952cbd9462d7ae9f6e029631887"><td class="memItemLeft" align="right" valign="top">typedef metal::simdgroup_matrix&lt; T, <a class="el" href="#a2fe53db449c692226f23f6b99fb2c0d4">kFragRows</a>, <a class="el" href="#a211102315e2afbcfcd2e2c201b638e9f">kFragCols</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a></td></tr>
+<tr class="separator:a958b6952cbd9462d7ae9f6e029631887"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9f53a5e9b046b4f217e782b733941b0c" id="r_a9f53a5e9b046b4f217e782b733941b0c"><td class="memItemLeft" align="right" valign="top">typedef metal::vec&lt; T, <a class="el" href="#a3c34dfdc944db110f4735f1b25307cf0">kElemsPerFrag</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a></td></tr>
+<tr class="separator:a9f53a5e9b046b4f217e782b733941b0c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a5ec2e40a8f5ad98c71b825544cdd878b" id="r_a5ec2e40a8f5ad98c71b825544cdd878b"><td class="memItemLeft" align="right" valign="top">typedef metal::vec&lt; T, <a class="el" href="#a76aa5aa690dbcc954e957d767fad661f">kElemRows</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5ec2e40a8f5ad98c71b825544cdd878b">row_frag_type</a></td></tr>
+<tr class="separator:a5ec2e40a8f5ad98c71b825544cdd878b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aab8dd1c6917247da41dd3a31139a665f" id="r_aab8dd1c6917247da41dd3a31139a665f"><td class="memItemLeft" align="right" valign="top">typedef metal::vec&lt; T, <a class="el" href="#a7c212200d86b4e93f274d99addf668bd">kElemCols</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aab8dd1c6917247da41dd3a31139a665f">col_frag_type</a></td></tr>
+<tr class="separator:aab8dd1c6917247da41dd3a31139a665f"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
 <tr class="memitem:a7331fff1d12f2f8b72b0006a3ad0dd83" id="r_a7331fff1d12f2f8b72b0006a3ad0dd83"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr short2&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7331fff1d12f2f8b72b0006a3ad0dd83">get_coord</a> (ushort simd_lane_id)</td></tr>
 <tr class="separator:a7331fff1d12f2f8b72b0006a3ad0dd83"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ac73006b36fc710feda3a7c796e21415c" id="r_ac73006b36fc710feda3a7c796e21415c"><td class="memTemplParams" colspan="2">template&lt;typename SrcPtrType , typename StrX , typename StrY &gt; </td></tr>
-<tr class="memitem:ac73006b36fc710feda3a7c796e21415c"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac73006b36fc710feda3a7c796e21415c">load</a> (thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;dst, SrcPtrType src, StrX str_x, StrY str_y)</td></tr>
+<tr class="memitem:ac73006b36fc710feda3a7c796e21415c"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac73006b36fc710feda3a7c796e21415c">load</a> (thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;dst, SrcPtrType src, StrX str_x, StrY str_y)</td></tr>
 <tr class="separator:ac73006b36fc710feda3a7c796e21415c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ad22aaee4a2938cbdd315b39eda84e07d" id="r_ad22aaee4a2938cbdd315b39eda84e07d"><td class="memTemplParams" colspan="2">template&lt;typename SrcPtrType , typename StrX , typename StrY , typename LimX , typename LimY , typename OffX , typename OffY &gt; </td></tr>
-<tr class="memitem:ad22aaee4a2938cbdd315b39eda84e07d"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad22aaee4a2938cbdd315b39eda84e07d">load_safe</a> (thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;dst, SrcPtrType src, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{}, OffY off_y=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{})</td></tr>
+<tr class="memitem:ad22aaee4a2938cbdd315b39eda84e07d"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad22aaee4a2938cbdd315b39eda84e07d">load_safe</a> (thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;dst, SrcPtrType src, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{}, OffY off_y=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{})</td></tr>
 <tr class="separator:ad22aaee4a2938cbdd315b39eda84e07d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aa8f50ea8961ec5b35c1b81366d64f2cb" id="r_aa8f50ea8961ec5b35c1b81366d64f2cb"><td class="memTemplParams" colspan="2">template&lt;typename DstPtrType , typename StrX , typename StrY &gt; </td></tr>
-<tr class="memitem:aa8f50ea8961ec5b35c1b81366d64f2cb"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a> (const thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;src, DstPtrType dst, StrX str_x, StrY str_y)</td></tr>
+<tr class="memitem:aa8f50ea8961ec5b35c1b81366d64f2cb"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a> (const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;src, DstPtrType dst, StrX str_x, StrY str_y)</td></tr>
 <tr class="separator:aa8f50ea8961ec5b35c1b81366d64f2cb"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a1f0b00daad8eba2f855bb306e70d2328" id="r_a1f0b00daad8eba2f855bb306e70d2328"><td class="memTemplParams" colspan="2">template&lt;typename DstPtrType , typename StrX , typename StrY , typename LimX , typename LimY , typename OffX , typename OffY &gt; </td></tr>
-<tr class="memitem:a1f0b00daad8eba2f855bb306e70d2328"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1f0b00daad8eba2f855bb306e70d2328">store_safe</a> (const thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;src, DstPtrType dst, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{}, OffY off_y=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{})</td></tr>
+<tr class="memitem:a1f0b00daad8eba2f855bb306e70d2328"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1f0b00daad8eba2f855bb306e70d2328">store_safe</a> (const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;src, DstPtrType dst, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{}, OffY off_y=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{})</td></tr>
 <tr class="separator:a1f0b00daad8eba2f855bb306e70d2328"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a8028512f5a3d2b6acaf966be529627a3" id="r_a8028512f5a3d2b6acaf966be529627a3"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8028512f5a3d2b6acaf966be529627a3">mma</a> (thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;D, thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;A, thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;B, thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;C)</td></tr>
+<tr class="memitem:a8028512f5a3d2b6acaf966be529627a3" id="r_a8028512f5a3d2b6acaf966be529627a3"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8028512f5a3d2b6acaf966be529627a3">mma</a> (thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;D, thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;A, thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;B, thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;C)</td></tr>
 <tr class="separator:a8028512f5a3d2b6acaf966be529627a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1868f57d57c8adedab2c58492ec76946" id="r_a1868f57d57c8adedab2c58492ec76946"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1868f57d57c8adedab2c58492ec76946">mma</a> (thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;D, thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;A, thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;B, thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;C)</td></tr>
+<tr class="memitem:a1868f57d57c8adedab2c58492ec76946" id="r_a1868f57d57c8adedab2c58492ec76946"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1868f57d57c8adedab2c58492ec76946">mma</a> (thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;D, thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;A, thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;B, thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;C)</td></tr>
+<tr class="separator:a1868f57d57c8adedab2c58492ec76946"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a51d662e4cff88b5ad17d7c44bb6b6970" id="r_a51d662e4cff88b5ad17d7c44bb6b6970"><td class="memTemplParams" colspan="2">template&lt;typename Op &gt; </td></tr>
+<tr class="memitem:a51d662e4cff88b5ad17d7c44bb6b6970"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a51d662e4cff88b5ad17d7c44bb6b6970">row_reduce</a> (thread const <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;inp_vals, thread T *reduced_vals)</td></tr>
+<tr class="separator:a51d662e4cff88b5ad17d7c44bb6b6970"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a318c4279bdc7b39b7919f108b1cd8010" id="r_a318c4279bdc7b39b7919f108b1cd8010"><td class="memTemplParams" colspan="2">template&lt;typename Op &gt; </td></tr>
+<tr class="memitem:a318c4279bdc7b39b7919f108b1cd8010"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a318c4279bdc7b39b7919f108b1cd8010">row_bin_op</a> (thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;inp_vals, thread T *row_vals)</td></tr>
+<tr class="separator:a318c4279bdc7b39b7919f108b1cd8010"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7331fff1d12f2f8b72b0006a3ad0dd83" id="r_a7331fff1d12f2f8b72b0006a3ad0dd83"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr short2&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7331fff1d12f2f8b72b0006a3ad0dd83">get_coord</a> (ushort simd_lane_id)</td></tr>
+<tr class="separator:a7331fff1d12f2f8b72b0006a3ad0dd83"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac73006b36fc710feda3a7c796e21415c" id="r_ac73006b36fc710feda3a7c796e21415c"><td class="memTemplParams" colspan="2">template&lt;typename SrcPtrType , typename StrX , typename StrY &gt; </td></tr>
+<tr class="memitem:ac73006b36fc710feda3a7c796e21415c"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ac73006b36fc710feda3a7c796e21415c">load</a> (thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;dst, SrcPtrType src, StrX str_x, StrY str_y)</td></tr>
+<tr class="separator:ac73006b36fc710feda3a7c796e21415c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad22aaee4a2938cbdd315b39eda84e07d" id="r_ad22aaee4a2938cbdd315b39eda84e07d"><td class="memTemplParams" colspan="2">template&lt;typename SrcPtrType , typename StrX , typename StrY , typename LimX , typename LimY , typename OffX , typename OffY &gt; </td></tr>
+<tr class="memitem:ad22aaee4a2938cbdd315b39eda84e07d"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#ad22aaee4a2938cbdd315b39eda84e07d">load_safe</a> (thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;dst, SrcPtrType src, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{}, OffY off_y=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{})</td></tr>
+<tr class="separator:ad22aaee4a2938cbdd315b39eda84e07d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa8f50ea8961ec5b35c1b81366d64f2cb" id="r_aa8f50ea8961ec5b35c1b81366d64f2cb"><td class="memTemplParams" colspan="2">template&lt;typename DstPtrType , typename StrX , typename StrY &gt; </td></tr>
+<tr class="memitem:aa8f50ea8961ec5b35c1b81366d64f2cb"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa8f50ea8961ec5b35c1b81366d64f2cb">store</a> (const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;src, DstPtrType dst, StrX str_x, StrY str_y)</td></tr>
+<tr class="separator:aa8f50ea8961ec5b35c1b81366d64f2cb"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1f0b00daad8eba2f855bb306e70d2328" id="r_a1f0b00daad8eba2f855bb306e70d2328"><td class="memTemplParams" colspan="2">template&lt;typename DstPtrType , typename StrX , typename StrY , typename LimX , typename LimY , typename OffX , typename OffY &gt; </td></tr>
+<tr class="memitem:a1f0b00daad8eba2f855bb306e70d2328"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a1f0b00daad8eba2f855bb306e70d2328">store_safe</a> (const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;src, DstPtrType dst, StrX str_x, StrY str_y, LimX lim_x, LimY lim_y, OffX off_x=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{}, OffY off_y=<a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt; 0 &gt;{})</td></tr>
+<tr class="separator:a1f0b00daad8eba2f855bb306e70d2328"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8028512f5a3d2b6acaf966be529627a3" id="r_a8028512f5a3d2b6acaf966be529627a3"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8028512f5a3d2b6acaf966be529627a3">mma</a> (thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;D, thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;A, thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;B, thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;C)</td></tr>
+<tr class="separator:a8028512f5a3d2b6acaf966be529627a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1868f57d57c8adedab2c58492ec76946" id="r_a1868f57d57c8adedab2c58492ec76946"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1868f57d57c8adedab2c58492ec76946">mma</a> (thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;D, thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;A, thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;B, thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;C)</td></tr>
 <tr class="separator:a1868f57d57c8adedab2c58492ec76946"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
@@ -141,8 +169,8 @@ Public Attributes</h2></td></tr>
 <tr class="separator:a7c212200d86b4e93f274d99addf668bd"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Typedef Documentation</h2>
-<a id="af67c1b1aea594468e9426e1be0e31d0b" name="af67c1b1aea594468e9426e1be0e31d0b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af67c1b1aea594468e9426e1be0e31d0b">&#9670;&#160;</a></span>frag_type</h2>
+<a id="aab8dd1c6917247da41dd3a31139a665f" name="aab8dd1c6917247da41dd3a31139a665f"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aab8dd1c6917247da41dd3a31139a665f">&#9670;&#160;</a></span>col_frag_type</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -150,15 +178,15 @@ Public Attributes</h2></td></tr>
 template&lt;typename T &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">metal::vec&lt;T, <a class="el" href="#a3c34dfdc944db110f4735f1b25307cf0">kElemsPerFrag</a>&gt; <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::frag_type</td>
+          <td class="memname">metal::vec&lt;T, <a class="el" href="#a7c212200d86b4e93f274d99addf668bd">kElemCols</a>&gt; <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::col_frag_type</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a72054f003c695b90a4fe5101e19cbaa9" name="a72054f003c695b90a4fe5101e19cbaa9"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a72054f003c695b90a4fe5101e19cbaa9">&#9670;&#160;</a></span>mat_type</h2>
+<a id="a9f53a5e9b046b4f217e782b733941b0c" name="a9f53a5e9b046b4f217e782b733941b0c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a9f53a5e9b046b4f217e782b733941b0c">&#9670;&#160;</a></span>frag_type</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -166,7 +194,39 @@ template&lt;typename T &gt; </div>
 template&lt;typename T &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">metal::simdgroup_matrix&lt;T, <a class="el" href="#a2fe53db449c692226f23f6b99fb2c0d4">kFragRows</a>, <a class="el" href="#a211102315e2afbcfcd2e2c201b638e9f">kFragCols</a>&gt; <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::mat_type</td>
+          <td class="memname">typedef metal::vec&lt; T, <a class="el" href="#a3c34dfdc944db110f4735f1b25307cf0">kElemsPerFrag</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::frag_type</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a958b6952cbd9462d7ae9f6e029631887" name="a958b6952cbd9462d7ae9f6e029631887"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a958b6952cbd9462d7ae9f6e029631887">&#9670;&#160;</a></span>mat_type</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">typedef metal::simdgroup_matrix&lt; T, <a class="el" href="#a2fe53db449c692226f23f6b99fb2c0d4">kFragRows</a>, <a class="el" href="#a211102315e2afbcfcd2e2c201b638e9f">kFragCols</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::mat_type</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a5ec2e40a8f5ad98c71b825544cdd878b" name="a5ec2e40a8f5ad98c71b825544cdd878b"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a5ec2e40a8f5ad98c71b825544cdd878b">&#9670;&#160;</a></span>row_frag_type</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">metal::vec&lt;T, <a class="el" href="#a76aa5aa690dbcc954e957d767fad661f">kElemRows</a>&gt; <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::row_frag_type</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -175,7 +235,34 @@ template&lt;typename T &gt; </div>
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="a7331fff1d12f2f8b72b0006a3ad0dd83" name="a7331fff1d12f2f8b72b0006a3ad0dd83"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7331fff1d12f2f8b72b0006a3ad0dd83">&#9670;&#160;</a></span>get_coord()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a7331fff1d12f2f8b72b0006a3ad0dd83">&#9670;&#160;</a></span>get_coord() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr short2 <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::get_coord </td>
+          <td>(</td>
+          <td class="paramtype">ushort</td>          <td class="paramname"><span class="paramname"><em>simd_lane_id</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a7331fff1d12f2f8b72b0006a3ad0dd83" name="a7331fff1d12f2f8b72b0006a3ad0dd83"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7331fff1d12f2f8b72b0006a3ad0dd83">&#9670;&#160;</a></span>get_coord() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -202,7 +289,7 @@ template&lt;typename T &gt; </div>
 </div>
 </div>
 <a id="ac73006b36fc710feda3a7c796e21415c" name="ac73006b36fc710feda3a7c796e21415c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ac73006b36fc710feda3a7c796e21415c">&#9670;&#160;</a></span>load()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#ac73006b36fc710feda3a7c796e21415c">&#9670;&#160;</a></span>load() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -217,7 +304,50 @@ template&lt;typename SrcPtrType , typename StrX , typename StrY &gt; </div>
         <tr>
           <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::load </td>
           <td>(</td>
-          <td class="paramtype">thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">SrcPtrType</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrX</td>          <td class="paramname"><span class="paramname"><em>str_x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrY</td>          <td class="paramname"><span class="paramname"><em>str_y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="ac73006b36fc710feda3a7c796e21415c" name="ac73006b36fc710feda3a7c796e21415c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ac73006b36fc710feda3a7c796e21415c">&#9670;&#160;</a></span>load() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<div class="memtemplate">
+template&lt;typename SrcPtrType , typename StrX , typename StrY &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::load </td>
+          <td>(</td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -245,7 +375,7 @@ template&lt;typename SrcPtrType , typename StrX , typename StrY &gt; </div>
 </div>
 </div>
 <a id="ad22aaee4a2938cbdd315b39eda84e07d" name="ad22aaee4a2938cbdd315b39eda84e07d"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ad22aaee4a2938cbdd315b39eda84e07d">&#9670;&#160;</a></span>load_safe()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#ad22aaee4a2938cbdd315b39eda84e07d">&#9670;&#160;</a></span>load_safe() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -260,7 +390,70 @@ template&lt;typename SrcPtrType , typename StrX , typename StrY , typename LimX
         <tr>
           <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::load_safe </td>
           <td>(</td>
-          <td class="paramtype">thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">SrcPtrType</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrX</td>          <td class="paramname"><span class="paramname"><em>str_x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrY</td>          <td class="paramname"><span class="paramname"><em>str_y</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">LimX</td>          <td class="paramname"><span class="paramname"><em>lim_x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">LimY</td>          <td class="paramname"><span class="paramname"><em>lim_y</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">OffX</td>          <td class="paramname"><span class="paramname"><em>off_x</em></span><span class="paramdefsep"> = </span><span class="paramdefval"><a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt;0&gt;{}</span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">OffY</td>          <td class="paramname"><span class="paramname"><em>off_y</em></span><span class="paramdefsep"> = </span><span class="paramdefval"><a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt;0&gt;{}</span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="ad22aaee4a2938cbdd315b39eda84e07d" name="ad22aaee4a2938cbdd315b39eda84e07d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ad22aaee4a2938cbdd315b39eda84e07d">&#9670;&#160;</a></span>load_safe() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<div class="memtemplate">
+template&lt;typename SrcPtrType , typename StrX , typename StrY , typename LimX , typename LimY , typename OffX , typename OffY &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::load_safe </td>
+          <td>(</td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -308,7 +501,7 @@ template&lt;typename SrcPtrType , typename StrX , typename StrY , typename LimX
 </div>
 </div>
 <a id="a8028512f5a3d2b6acaf966be529627a3" name="a8028512f5a3d2b6acaf966be529627a3"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a8028512f5a3d2b6acaf966be529627a3">&#9670;&#160;</a></span>mma() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a8028512f5a3d2b6acaf966be529627a3">&#9670;&#160;</a></span>mma() <span class="overload">[1/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -321,22 +514,63 @@ template&lt;typename T &gt; </div>
         <tr>
           <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::mma </td>
           <td>(</td>
-          <td class="paramtype">thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>C</em></span>&#160;)</td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>C</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a8028512f5a3d2b6acaf966be529627a3" name="a8028512f5a3d2b6acaf966be529627a3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a8028512f5a3d2b6acaf966be529627a3">&#9670;&#160;</a></span>mma() <span class="overload">[2/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::mma </td>
+          <td>(</td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>C</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -349,7 +583,7 @@ template&lt;typename T &gt; </div>
 </div>
 </div>
 <a id="a1868f57d57c8adedab2c58492ec76946" name="a1868f57d57c8adedab2c58492ec76946"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1868f57d57c8adedab2c58492ec76946">&#9670;&#160;</a></span>mma() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a1868f57d57c8adedab2c58492ec76946">&#9670;&#160;</a></span>mma() <span class="overload">[3/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -362,22 +596,129 @@ template&lt;typename T &gt; </div>
         <tr>
           <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::mma </td>
           <td>(</td>
-          <td class="paramtype">thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#a72054f003c695b90a4fe5101e19cbaa9">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>C</em></span>&#160;)</td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>C</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a1868f57d57c8adedab2c58492ec76946" name="a1868f57d57c8adedab2c58492ec76946"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1868f57d57c8adedab2c58492ec76946">&#9670;&#160;</a></span>mma() <span class="overload">[4/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::mma </td>
+          <td>(</td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a958b6952cbd9462d7ae9f6e029631887">mat_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>C</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a318c4279bdc7b39b7919f108b1cd8010" name="a318c4279bdc7b39b7919f108b1cd8010"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a318c4279bdc7b39b7919f108b1cd8010">&#9670;&#160;</a></span>row_bin_op()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<div class="memtemplate">
+template&lt;typename Op &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::row_bin_op </td>
+          <td>(</td>
+          <td class="paramtype">thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>inp_vals</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread T *</td>          <td class="paramname"><span class="paramname"><em>row_vals</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a51d662e4cff88b5ad17d7c44bb6b6970" name="a51d662e4cff88b5ad17d7c44bb6b6970"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a51d662e4cff88b5ad17d7c44bb6b6970">&#9670;&#160;</a></span>row_reduce()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<div class="memtemplate">
+template&lt;typename Op &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::row_reduce </td>
+          <td>(</td>
+          <td class="paramtype">thread const <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>inp_vals</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread T *</td>          <td class="paramname"><span class="paramname"><em>reduced_vals</em></span>&#160;)</td>
         </tr>
       </table>
   </td>
@@ -390,7 +731,7 @@ template&lt;typename T &gt; </div>
 </div>
 </div>
 <a id="aa8f50ea8961ec5b35c1b81366d64f2cb" name="aa8f50ea8961ec5b35c1b81366d64f2cb"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa8f50ea8961ec5b35c1b81366d64f2cb">&#9670;&#160;</a></span>store()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa8f50ea8961ec5b35c1b81366d64f2cb">&#9670;&#160;</a></span>store() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -405,7 +746,50 @@ template&lt;typename DstPtrType , typename StrX , typename StrY &gt; </div>
         <tr>
           <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::store </td>
           <td>(</td>
-          <td class="paramtype">const thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+          <td class="paramtype">const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">DstPtrType</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrX</td>          <td class="paramname"><span class="paramname"><em>str_x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrY</td>          <td class="paramname"><span class="paramname"><em>str_y</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa8f50ea8961ec5b35c1b81366d64f2cb" name="aa8f50ea8961ec5b35c1b81366d64f2cb"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa8f50ea8961ec5b35c1b81366d64f2cb">&#9670;&#160;</a></span>store() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<div class="memtemplate">
+template&lt;typename DstPtrType , typename StrX , typename StrY &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::store </td>
+          <td>(</td>
+          <td class="paramtype">const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -433,7 +817,7 @@ template&lt;typename DstPtrType , typename StrX , typename StrY &gt; </div>
 </div>
 </div>
 <a id="a1f0b00daad8eba2f855bb306e70d2328" name="a1f0b00daad8eba2f855bb306e70d2328"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1f0b00daad8eba2f855bb306e70d2328">&#9670;&#160;</a></span>store_safe()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a1f0b00daad8eba2f855bb306e70d2328">&#9670;&#160;</a></span>store_safe() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -448,7 +832,70 @@ template&lt;typename DstPtrType , typename StrX , typename StrY , typename LimX
         <tr>
           <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::store_safe </td>
           <td>(</td>
-          <td class="paramtype">const thread <a class="el" href="#af67c1b1aea594468e9426e1be0e31d0b">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+          <td class="paramtype">const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">DstPtrType</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrX</td>          <td class="paramname"><span class="paramname"><em>str_x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">StrY</td>          <td class="paramname"><span class="paramname"><em>str_y</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">LimX</td>          <td class="paramname"><span class="paramname"><em>lim_x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">LimY</td>          <td class="paramname"><span class="paramname"><em>lim_y</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">OffX</td>          <td class="paramname"><span class="paramname"><em>off_x</em></span><span class="paramdefsep"> = </span><span class="paramdefval"><a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt;0&gt;{}</span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">OffY</td>          <td class="paramname"><span class="paramname"><em>off_y</em></span><span class="paramdefsep"> = </span><span class="paramdefval"><a class="el" href="namespacemlx_1_1steel.html#afe36ddf6725498d273e5eef4f1579891">Int</a>&lt;0&gt;{}</span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a1f0b00daad8eba2f855bb306e70d2328" name="a1f0b00daad8eba2f855bb306e70d2328"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1f0b00daad8eba2f855bb306e70d2328">&#9670;&#160;</a></span>store_safe() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T &gt; </div>
+<div class="memtemplate">
+template&lt;typename DstPtrType , typename StrX , typename StrY , typename LimX , typename LimY , typename OffX , typename OffY &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">mlx::steel::BaseMMAFrag</a>&lt; T, 8, 8 &gt;::store_safe </td>
+          <td>(</td>
+          <td class="paramtype">const thread <a class="el" href="#a9f53a5e9b046b4f217e782b733941b0c">frag_type</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -576,8 +1023,9 @@ template&lt;typename T &gt; </div>
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="mma_8h_source.html">mma.h</a></li>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2mma_8h_source.html">mma.h</a></li>
+<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_2mma_8h_source.html">mma.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_loader-members.html b/docs/build/html/structmlx_1_1steel_1_1_block_loader-members.html
index 469f9c8b9..915957111 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_loader-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_loader-members.html
@@ -95,19 +95,24 @@ $(function(){ initResizable(false); });
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a>, including all inherited members.</p>
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">apply_inplace_op</a>(thread const UnaryOp &amp;op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#adb4ca2cc193630a779de552fa8847ddf">apply_inplace_op</a>(thread const UnaryOp &amp;op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a9ef13742bcdf07532d8f09394928a8af">bi</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a78c326e75ee35a484685771143047cd4">bj</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">BlockLoader</a>(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a37aca066e63dff238865b5923a2d4335">BlockLoader</a>(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#af34c184a19846e4b40ba54b2946589ec">dst</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#af1c6c35a42e9da4408c1013ff1741bc2">dst</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">load_safe</a>(short2 src_tile_dim) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#abb0f4f66ec8b123627beb8eb4fbb609d">load_safe</a>(short2 src_tile_dim) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">load_unsafe</a>() const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6c9e27f11f48b34580ed2c7e9cad9a27">load_unsafe</a>() const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a973804e5b1d418c98c90861cda1a6fb5">n_rows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">next</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a13004952d0bf2030b95acb621a3779dd">src</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a6af21428f0e7c17b48ddedf4dd20a1e8">next</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ad1db14517568ae9eddfb6986ef31c7aa">src</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#aadafc50f7f06af434149d7469df4714d">src_ld</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a064e2cc77e0b1cf0f8027929e031775b">thread_idx</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#ab87876699d55473620c7ea99f9da911d">tile_stride</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_loader.html b/docs/build/html/structmlx_1_1steel_1_1_block_loader.html
index 9eb136773..e66ce3abd 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_loader.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_loader.html
@@ -97,7 +97,7 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="gemm_2loader_8h_source.html">loader.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="attn_2loader_8h_source.html">loader.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="nested-classes" name="nested-classes"></a>
 Classes</h2></td></tr>
@@ -117,6 +117,17 @@ Public Member Functions</h2></td></tr>
 <tr class="separator:abb0f4f66ec8b123627beb8eb4fbb609d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a6af21428f0e7c17b48ddedf4dd20a1e8" id="r_a6af21428f0e7c17b48ddedf4dd20a1e8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6af21428f0e7c17b48ddedf4dd20a1e8">next</a> ()</td></tr>
 <tr class="separator:a6af21428f0e7c17b48ddedf4dd20a1e8"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a37aca066e63dff238865b5923a2d4335" id="r_a37aca066e63dff238865b5923a2d4335"><td class="memItemLeft" align="right" valign="top">METAL_FUNC&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a37aca066e63dff238865b5923a2d4335">BlockLoader</a> (const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td></tr>
+<tr class="separator:a37aca066e63dff238865b5923a2d4335"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:adb4ca2cc193630a779de552fa8847ddf" id="r_adb4ca2cc193630a779de552fa8847ddf"><td class="memTemplParams" colspan="2">template&lt;typename UnaryOp &gt; </td></tr>
+<tr class="memitem:adb4ca2cc193630a779de552fa8847ddf"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#adb4ca2cc193630a779de552fa8847ddf">apply_inplace_op</a> (thread const UnaryOp &amp;<a class="el" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>) const</td></tr>
+<tr class="separator:adb4ca2cc193630a779de552fa8847ddf"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6c9e27f11f48b34580ed2c7e9cad9a27" id="r_a6c9e27f11f48b34580ed2c7e9cad9a27"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6c9e27f11f48b34580ed2c7e9cad9a27">load_unsafe</a> () const</td></tr>
+<tr class="separator:a6c9e27f11f48b34580ed2c7e9cad9a27"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:abb0f4f66ec8b123627beb8eb4fbb609d" id="r_abb0f4f66ec8b123627beb8eb4fbb609d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abb0f4f66ec8b123627beb8eb4fbb609d">load_safe</a> (short2 src_tile_dim) const</td></tr>
+<tr class="separator:abb0f4f66ec8b123627beb8eb4fbb609d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6af21428f0e7c17b48ddedf4dd20a1e8" id="r_a6af21428f0e7c17b48ddedf4dd20a1e8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6af21428f0e7c17b48ddedf4dd20a1e8">next</a> ()</td></tr>
+<tr class="separator:a6af21428f0e7c17b48ddedf4dd20a1e8"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
 Public Attributes</h2></td></tr>
@@ -134,14 +145,60 @@ Public Attributes</h2></td></tr>
 <tr class="separator:a9ef13742bcdf07532d8f09394928a8af"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a78c326e75ee35a484685771143047cd4" id="r_a78c326e75ee35a484685771143047cd4"><td class="memItemLeft" align="right" valign="top">const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a78c326e75ee35a484685771143047cd4">bj</a></td></tr>
 <tr class="separator:a78c326e75ee35a484685771143047cd4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af34c184a19846e4b40ba54b2946589ec" id="r_af34c184a19846e4b40ba54b2946589ec"><td class="memItemLeft" align="right" valign="top">threadgroup T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af34c184a19846e4b40ba54b2946589ec">dst</a></td></tr>
-<tr class="separator:af34c184a19846e4b40ba54b2946589ec"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a13004952d0bf2030b95acb621a3779dd" id="r_a13004952d0bf2030b95acb621a3779dd"><td class="memItemLeft" align="right" valign="top">const device T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a13004952d0bf2030b95acb621a3779dd">src</a></td></tr>
-<tr class="separator:a13004952d0bf2030b95acb621a3779dd"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af1c6c35a42e9da4408c1013ff1741bc2" id="r_af1c6c35a42e9da4408c1013ff1741bc2"><td class="memItemLeft" align="right" valign="top">threadgroup T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af1c6c35a42e9da4408c1013ff1741bc2">dst</a></td></tr>
+<tr class="separator:af1c6c35a42e9da4408c1013ff1741bc2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad1db14517568ae9eddfb6986ef31c7aa" id="r_ad1db14517568ae9eddfb6986ef31c7aa"><td class="memItemLeft" align="right" valign="top">const device T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad1db14517568ae9eddfb6986ef31c7aa">src</a></td></tr>
+<tr class="separator:ad1db14517568ae9eddfb6986ef31c7aa"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
 <a id="a37aca066e63dff238865b5923a2d4335" name="a37aca066e63dff238865b5923a2d4335"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a37aca066e63dff238865b5923a2d4335">&#9670;&#160;</a></span>BlockLoader()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a37aca066e63dff238865b5923a2d4335">&#9670;&#160;</a></span>BlockLoader() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::BlockLoader </td>
+          <td>(</td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>src_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>src_ld_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>dst_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">ushort</td>          <td class="paramname"><span class="paramname"><em>simd_group_id</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">ushort</td>          <td class="paramname"><span class="paramname"><em>simd_lane_id</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a37aca066e63dff238865b5923a2d4335" name="a37aca066e63dff238865b5923a2d4335"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a37aca066e63dff238865b5923a2d4335">&#9670;&#160;</a></span>BlockLoader() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -188,7 +245,36 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="adb4ca2cc193630a779de552fa8847ddf" name="adb4ca2cc193630a779de552fa8847ddf"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#adb4ca2cc193630a779de552fa8847ddf">&#9670;&#160;</a></span>apply_inplace_op()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#adb4ca2cc193630a779de552fa8847ddf">&#9670;&#160;</a></span>apply_inplace_op() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<div class="memtemplate">
+template&lt;typename UnaryOp &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::apply_inplace_op </td>
+          <td>(</td>
+          <td class="paramtype">thread const UnaryOp &amp;</td>          <td class="paramname"><span class="paramname"><em>op</em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="adb4ca2cc193630a779de552fa8847ddf" name="adb4ca2cc193630a779de552fa8847ddf"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#adb4ca2cc193630a779de552fa8847ddf">&#9670;&#160;</a></span>apply_inplace_op() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -217,7 +303,34 @@ template&lt;typename UnaryOp &gt; </div>
 </div>
 </div>
 <a id="abb0f4f66ec8b123627beb8eb4fbb609d" name="abb0f4f66ec8b123627beb8eb4fbb609d"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#abb0f4f66ec8b123627beb8eb4fbb609d">&#9670;&#160;</a></span>load_safe()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#abb0f4f66ec8b123627beb8eb4fbb609d">&#9670;&#160;</a></span>load_safe() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::load_safe </td>
+          <td>(</td>
+          <td class="paramtype">short2</td>          <td class="paramname"><span class="paramname"><em>src_tile_dim</em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="abb0f4f66ec8b123627beb8eb4fbb609d" name="abb0f4f66ec8b123627beb8eb4fbb609d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abb0f4f66ec8b123627beb8eb4fbb609d">&#9670;&#160;</a></span>load_safe() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -244,7 +357,34 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 </div>
 </div>
 <a id="a6c9e27f11f48b34580ed2c7e9cad9a27" name="a6c9e27f11f48b34580ed2c7e9cad9a27"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a6c9e27f11f48b34580ed2c7e9cad9a27">&#9670;&#160;</a></span>load_unsafe()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a6c9e27f11f48b34580ed2c7e9cad9a27">&#9670;&#160;</a></span>load_unsafe() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::load_unsafe </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a6c9e27f11f48b34580ed2c7e9cad9a27" name="a6c9e27f11f48b34580ed2c7e9cad9a27"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6c9e27f11f48b34580ed2c7e9cad9a27">&#9670;&#160;</a></span>load_unsafe() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -271,7 +411,34 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 </div>
 </div>
 <a id="a6af21428f0e7c17b48ddedf4dd20a1e8" name="a6af21428f0e7c17b48ddedf4dd20a1e8"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a6af21428f0e7c17b48ddedf4dd20a1e8">&#9670;&#160;</a></span>next()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a6af21428f0e7c17b48ddedf4dd20a1e8">&#9670;&#160;</a></span>next() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::next </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a6af21428f0e7c17b48ddedf4dd20a1e8" name="a6af21428f0e7c17b48ddedf4dd20a1e8"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6af21428f0e7c17b48ddedf4dd20a1e8">&#9670;&#160;</a></span>next() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -330,8 +497,8 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 
 </div>
 </div>
-<a id="af34c184a19846e4b40ba54b2946589ec" name="af34c184a19846e4b40ba54b2946589ec"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af34c184a19846e4b40ba54b2946589ec">&#9670;&#160;</a></span>dst</h2>
+<a id="af1c6c35a42e9da4408c1013ff1741bc2" name="af1c6c35a42e9da4408c1013ff1741bc2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af1c6c35a42e9da4408c1013ff1741bc2">&#9670;&#160;</a></span>dst</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -339,7 +506,7 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">threadgroup T* <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::dst</td>
+          <td class="memname">threadgroup T * <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::dst</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -362,8 +529,8 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 
 </div>
 </div>
-<a id="a13004952d0bf2030b95acb621a3779dd" name="a13004952d0bf2030b95acb621a3779dd"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a13004952d0bf2030b95acb621a3779dd">&#9670;&#160;</a></span>src</h2>
+<a id="ad1db14517568ae9eddfb6986ef31c7aa" name="ad1db14517568ae9eddfb6986ef31c7aa"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ad1db14517568ae9eddfb6986ef31c7aa">&#9670;&#160;</a></span>src</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -371,7 +538,7 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">const device T* <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::src</td>
+          <td class="memname">const device T * <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::src</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -442,7 +609,8 @@ template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2loader_8h_source.html">loader.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_2loader_8h_source.html">loader.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector-members.html b/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector-members.html
index c81cfc317..92b5f5cb8 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector-members.html
@@ -94,7 +94,7 @@ $(function(){ initResizable(false); });
 
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a>, including all inherited members.</p>
 <table class="directory">
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#afbef88bfb901a71e8423de911b7c7347">v</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html#a20963f7191251defca48bf8a843d019d">v</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html">mlx::steel::BlockLoader&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html b/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html
index bd160ce36..158b1e2b5 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_loader_1_1_read_vector.html
@@ -95,16 +95,16 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="gemm_2loader_8h_source.html">loader.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="attn_2loader_8h_source.html">loader.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
 Public Attributes</h2></td></tr>
-<tr class="memitem:afbef88bfb901a71e8423de911b7c7347" id="r_afbef88bfb901a71e8423de911b7c7347"><td class="memItemLeft" align="right" valign="top">uint8_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afbef88bfb901a71e8423de911b7c7347">v</a> [sizeof(T) *<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>]</td></tr>
-<tr class="separator:afbef88bfb901a71e8423de911b7c7347"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a20963f7191251defca48bf8a843d019d" id="r_a20963f7191251defca48bf8a843d019d"><td class="memItemLeft" align="right" valign="top">uint8_t&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a20963f7191251defca48bf8a843d019d">v</a> [sizeof(T) *<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>]</td></tr>
+<tr class="separator:a20963f7191251defca48bf8a843d019d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Data Documentation</h2>
-<a id="afbef88bfb901a71e8423de911b7c7347" name="afbef88bfb901a71e8423de911b7c7347"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#afbef88bfb901a71e8423de911b7c7347">&#9670;&#160;</a></span>v</h2>
+<a id="a20963f7191251defca48bf8a843d019d" name="a20963f7191251defca48bf8a843d019d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a20963f7191251defca48bf8a843d019d">&#9670;&#160;</a></span>v</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -112,14 +112,15 @@ Public Attributes</h2></td></tr>
 template&lt;typename T , short BROWS, short BCOLS, short dst_ld, short reduction_dim, short tgp_size, short alignment = 1, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">uint8_t <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector::v[sizeof(T) *<a class="el" href="structmlx_1_1steel_1_1_block_loader.html#a58bdf9b9c81962733e22ecdeae28c092">vec_size</a>]</td>
+          <td class="memname">uint8_t <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">mlx::steel::BlockLoader</a>&lt; T, BROWS, BCOLS, dst_ld, reduction_dim, tgp_size, alignment, n_reads, TCOLS, TROWS &gt;::ReadVector::v</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2loader_8h_source.html">loader.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_2loader_8h_source.html">loader.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_loader_t-members.html b/docs/build/html/structmlx_1_1steel_1_1_block_loader_t-members.html
new file mode 100644
index 000000000..5a004ecf1
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_loader_t-members.html
@@ -0,0 +1,118 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt; Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a2b136fad00dc54300e68aa6b905eff97">apply_inplace_op</a>(thread const UnaryOp &amp;op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6964273994b06d6cf8ef7e59fb10bb35">bi</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#aca83e49c31095badc8a46eb3c8e00957">bj</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a076616a7c67ad1b847e0e6b046077ee2">BlockLoaderT</a>(const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6eb4e566b687395e27f290da288362db">dst</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#ac2d95e35ba39e0984e6f1e58ca935f7d">load_safe</a>(short2 src_tile_dim) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#acb743f32146fdc7986264b7beb35fb38">load_unsafe</a>() const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a0ccc7caa93e6e709981a1a08159d41dc">n_rows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a6008ef45ff980dbe1119da0630f6c697">next</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a7004a4efaa483cc79b8b79810a17c777">src</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#aeba87e81185da6b20a092c5d240d3321">src_ld</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#af2838998a02866f22b525f9b6ae004da">thread_idx</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a3abb86e68adb7e4d87cb808d6c25e35f">tile_stride</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;</a></td><td class="entry"></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_loader_t.html b/docs/build/html/structmlx_1_1steel_1_1_block_loader_t.html
new file mode 100644
index 000000000..b996f4ed8
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_loader_t.html
@@ -0,0 +1,449 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt; Struct Template Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">BlockLoaderT</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-methods">Public Member Functions</a> &#124;
+<a href="#pub-attribs">Public Attributes</a> &#124;
+<a href="structmlx_1_1steel_1_1_block_loader_t-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">mlx::steel::BlockLoaderT&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt; Struct Template Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="attn_2loader_8h_source.html">loader.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
+Public Member Functions</h2></td></tr>
+<tr class="memitem:a076616a7c67ad1b847e0e6b046077ee2" id="r_a076616a7c67ad1b847e0e6b046077ee2"><td class="memItemLeft" align="right" valign="top">METAL_FUNC&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a076616a7c67ad1b847e0e6b046077ee2">BlockLoaderT</a> (const device T *src_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)</td></tr>
+<tr class="separator:a076616a7c67ad1b847e0e6b046077ee2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a2b136fad00dc54300e68aa6b905eff97" id="r_a2b136fad00dc54300e68aa6b905eff97"><td class="memTemplParams" colspan="2">template&lt;typename UnaryOp &gt; </td></tr>
+<tr class="memitem:a2b136fad00dc54300e68aa6b905eff97"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a2b136fad00dc54300e68aa6b905eff97">apply_inplace_op</a> (thread const UnaryOp &amp;<a class="el" href="common_2binary_8h.html#a70228731d29946574b238d21fb4b360c">op</a>) const</td></tr>
+<tr class="separator:a2b136fad00dc54300e68aa6b905eff97"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:acb743f32146fdc7986264b7beb35fb38" id="r_acb743f32146fdc7986264b7beb35fb38"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#acb743f32146fdc7986264b7beb35fb38">load_unsafe</a> () const</td></tr>
+<tr class="separator:acb743f32146fdc7986264b7beb35fb38"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac2d95e35ba39e0984e6f1e58ca935f7d" id="r_ac2d95e35ba39e0984e6f1e58ca935f7d"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac2d95e35ba39e0984e6f1e58ca935f7d">load_safe</a> (short2 src_tile_dim) const</td></tr>
+<tr class="separator:ac2d95e35ba39e0984e6f1e58ca935f7d"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6008ef45ff980dbe1119da0630f6c697" id="r_a6008ef45ff980dbe1119da0630f6c697"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6008ef45ff980dbe1119da0630f6c697">next</a> ()</td></tr>
+<tr class="separator:a6008ef45ff980dbe1119da0630f6c697"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
+Public Attributes</h2></td></tr>
+<tr class="memitem:a0ccc7caa93e6e709981a1a08159d41dc" id="r_a0ccc7caa93e6e709981a1a08159d41dc"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0ccc7caa93e6e709981a1a08159d41dc">n_rows</a> = (BROWS + TROWS - 1) / TROWS</td></tr>
+<tr class="separator:a0ccc7caa93e6e709981a1a08159d41dc"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9ac651d9e5097507c57b10dfeb40bfe5" id="r_a9ac651d9e5097507c57b10dfeb40bfe5"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9ac651d9e5097507c57b10dfeb40bfe5">vec_size</a> = n_reads</td></tr>
+<tr class="separator:a9ac651d9e5097507c57b10dfeb40bfe5"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aeba87e81185da6b20a092c5d240d3321" id="r_aeba87e81185da6b20a092c5d240d3321"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aeba87e81185da6b20a092c5d240d3321">src_ld</a></td></tr>
+<tr class="separator:aeba87e81185da6b20a092c5d240d3321"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3abb86e68adb7e4d87cb808d6c25e35f" id="r_a3abb86e68adb7e4d87cb808d6c25e35f"><td class="memItemLeft" align="right" valign="top">const int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3abb86e68adb7e4d87cb808d6c25e35f">tile_stride</a></td></tr>
+<tr class="separator:a3abb86e68adb7e4d87cb808d6c25e35f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af2838998a02866f22b525f9b6ae004da" id="r_af2838998a02866f22b525f9b6ae004da"><td class="memItemLeft" align="right" valign="top">const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af2838998a02866f22b525f9b6ae004da">thread_idx</a></td></tr>
+<tr class="separator:af2838998a02866f22b525f9b6ae004da"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6964273994b06d6cf8ef7e59fb10bb35" id="r_a6964273994b06d6cf8ef7e59fb10bb35"><td class="memItemLeft" align="right" valign="top">const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6964273994b06d6cf8ef7e59fb10bb35">bi</a></td></tr>
+<tr class="separator:a6964273994b06d6cf8ef7e59fb10bb35"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aca83e49c31095badc8a46eb3c8e00957" id="r_aca83e49c31095badc8a46eb3c8e00957"><td class="memItemLeft" align="right" valign="top">const short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aca83e49c31095badc8a46eb3c8e00957">bj</a></td></tr>
+<tr class="separator:aca83e49c31095badc8a46eb3c8e00957"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6eb4e566b687395e27f290da288362db" id="r_a6eb4e566b687395e27f290da288362db"><td class="memItemLeft" align="right" valign="top">threadgroup T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6eb4e566b687395e27f290da288362db">dst</a></td></tr>
+<tr class="separator:a6eb4e566b687395e27f290da288362db"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7004a4efaa483cc79b8b79810a17c777" id="r_a7004a4efaa483cc79b8b79810a17c777"><td class="memItemLeft" align="right" valign="top">const device T *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7004a4efaa483cc79b8b79810a17c777">src</a></td></tr>
+<tr class="separator:a7004a4efaa483cc79b8b79810a17c777"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
+<a id="a076616a7c67ad1b847e0e6b046077ee2" name="a076616a7c67ad1b847e0e6b046077ee2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a076616a7c67ad1b847e0e6b046077ee2">&#9670;&#160;</a></span>BlockLoaderT()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::BlockLoaderT </td>
+          <td>(</td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>src_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>src_ld_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>dst_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">ushort</td>          <td class="paramname"><span class="paramname"><em>simd_group_id</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">ushort</td>          <td class="paramname"><span class="paramname"><em>simd_lane_id</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<h2 class="groupheader">Member Function Documentation</h2>
+<a id="a2b136fad00dc54300e68aa6b905eff97" name="a2b136fad00dc54300e68aa6b905eff97"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a2b136fad00dc54300e68aa6b905eff97">&#9670;&#160;</a></span>apply_inplace_op()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<div class="memtemplate">
+template&lt;typename UnaryOp &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::apply_inplace_op </td>
+          <td>(</td>
+          <td class="paramtype">thread const UnaryOp &amp;</td>          <td class="paramname"><span class="paramname"><em>op</em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="ac2d95e35ba39e0984e6f1e58ca935f7d" name="ac2d95e35ba39e0984e6f1e58ca935f7d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ac2d95e35ba39e0984e6f1e58ca935f7d">&#9670;&#160;</a></span>load_safe()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::load_safe </td>
+          <td>(</td>
+          <td class="paramtype">short2</td>          <td class="paramname"><span class="paramname"><em>src_tile_dim</em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="acb743f32146fdc7986264b7beb35fb38" name="acb743f32146fdc7986264b7beb35fb38"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#acb743f32146fdc7986264b7beb35fb38">&#9670;&#160;</a></span>load_unsafe()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::load_unsafe </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a6008ef45ff980dbe1119da0630f6c697" name="a6008ef45ff980dbe1119da0630f6c697"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6008ef45ff980dbe1119da0630f6c697">&#9670;&#160;</a></span>next()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::next </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<h2 class="groupheader">Member Data Documentation</h2>
+<a id="a6964273994b06d6cf8ef7e59fb10bb35" name="a6964273994b06d6cf8ef7e59fb10bb35"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6964273994b06d6cf8ef7e59fb10bb35">&#9670;&#160;</a></span>bi</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">const short <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::bi</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aca83e49c31095badc8a46eb3c8e00957" name="aca83e49c31095badc8a46eb3c8e00957"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aca83e49c31095badc8a46eb3c8e00957">&#9670;&#160;</a></span>bj</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">const short <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::bj</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a6eb4e566b687395e27f290da288362db" name="a6eb4e566b687395e27f290da288362db"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6eb4e566b687395e27f290da288362db">&#9670;&#160;</a></span>dst</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">threadgroup T* <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::dst</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a0ccc7caa93e6e709981a1a08159d41dc" name="a0ccc7caa93e6e709981a1a08159d41dc"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a0ccc7caa93e6e709981a1a08159d41dc">&#9670;&#160;</a></span>n_rows</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> short <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::n_rows = (BROWS + TROWS - 1) / TROWS</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a7004a4efaa483cc79b8b79810a17c777" name="a7004a4efaa483cc79b8b79810a17c777"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7004a4efaa483cc79b8b79810a17c777">&#9670;&#160;</a></span>src</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">const device T* <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::src</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aeba87e81185da6b20a092c5d240d3321" name="aeba87e81185da6b20a092c5d240d3321"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aeba87e81185da6b20a092c5d240d3321">&#9670;&#160;</a></span>src_ld</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">const int <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::src_ld</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="af2838998a02866f22b525f9b6ae004da" name="af2838998a02866f22b525f9b6ae004da"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af2838998a02866f22b525f9b6ae004da">&#9670;&#160;</a></span>thread_idx</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">const short <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::thread_idx</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a3abb86e68adb7e4d87cb808d6c25e35f" name="a3abb86e68adb7e4d87cb808d6c25e35f"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a3abb86e68adb7e4d87cb808d6c25e35f">&#9670;&#160;</a></span>tile_stride</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">const int <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::tile_stride</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a9ac651d9e5097507c57b10dfeb40bfe5" name="a9ac651d9e5097507c57b10dfeb40bfe5"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a9ac651d9e5097507c57b10dfeb40bfe5">&#9670;&#160;</a></span>vec_size</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , short BROWS, short BCOLS, short kDstStrRow, short kDstStrCol, short reduction_dim, short tgp_size, short n_reads = (BCOLS * BROWS) / (tgp_size), short TCOLS = BCOLS / n_reads, short TROWS = tgp_size / TCOLS&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> short <a class="el" href="structmlx_1_1steel_1_1_block_loader_t.html">mlx::steel::BlockLoaderT</a>&lt; T, BROWS, BCOLS, kDstStrRow, kDstStrCol, reduction_dim, tgp_size, n_reads, TCOLS, TROWS &gt;::vec_size = n_reads</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2loader_8h_source.html">loader.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a-members.html b/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a-members.html
index edb23a08e..ac56509cd 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a-members.html
@@ -98,30 +98,39 @@ $(function(){ initResizable(false); });
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ab9c7f5386594497f5f4df7e59670b877">A_str_m</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">apply_epilogue</a>(thread const UnaryEpilogue &amp;epilogue_op)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae">apply_epilogue</a>(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#af653c0808ba4fa9a25286f1febb7baff">apply_epilogue</a>(thread const UnaryEpilogue &amp;epilogue_op)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a823c56cbd2086f10272df7284a5247ae">apply_epilogue</a>(const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">apply_epilogue_safe</a>(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">As_offset</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#af1a138c5e118147dc46475e4a5557e7c">Atile</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">B_str_n</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a9e48f2d51099ec00171506724faab54a">apply_epilogue_safe</a>(const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a138ed1bbad2ca88d3a3c7d162cd36562">As_offset</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a47e614120c650f7479db79f23a0df586">Atile</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa71400922babd388177f228c2c82b211">B_str_k</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a49538190209e522ddbef45fe95563d17">B_str_n</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">BlockMMA</a>(ushort simd_group_id, ushort simd_lane_id)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa14406b7298456ac45d23dd3c4642dd8">BlockMMA</a>(ushort simd_group_id, ushort simd_lane_id)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a92f6aeee432f53638447eac842f43eca">Bs_offset</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a19abba19edeb37018da4bd31e01c8e26">Btile</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a81838da5d81e62d372d581be599c5a88">Ctile</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a44fca27c821764317263047a780977b0">Btile</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a21b0c40d16eced109bd3196186170bc6">Ctile</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0">mma</a>(const threadgroup T *As, const threadgroup T *Bs)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">store_result</a>(device U *D, const int ldd)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3">store_result</a>(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">store_result_safe</a>(device U *D, const int ldd, short2 dst_tile_dims)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391">store_result_safe</a>(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">tile_stride_a</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">tile_stride_b</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a6a2c2a6d5e767d52c41b42a9d36086b0">mma</a>(const threadgroup T *As, const threadgroup T *Bs)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aa85451edf6900fd6af164d4d50889ae3">sm</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ade420e8b811d597345783c324c23a34a">sn</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">store_result</a>(device U *D, const int ldd)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3">store_result</a>(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a0461451ffb5041b6a916ea17ed34288b">store_result</a>(device U *D, const int ldd)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7cf757e9785e23997b1417e024559ed3">store_result</a>(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">store_result_safe</a>(device U *D, const int ldd, short2 dst_tile_dims)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391">store_result_safe</a>(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a081ba538d30d1d02498a7f341e6bd611">store_result_safe</a>(device U *D, const int ldd, short2 dst_tile_dims)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a7b324c992750ed3aaa4c485f15b2f391">store_result_safe</a>(device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8fddaa78913cdc8eea5e1cf7d2776330">tile_stride_a</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#ae3f35453b3afbaac9df64ad5966b34a4">tile_stride_b</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a5b0029866f493363942133b55bff7307">TM_stride</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a706ae779c1f8d2eb18f19c248567d424">TN</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html#a8b3690b383afd26563efb38f9c375e50">TN_stride</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a.html b/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a.html
index 95f0ec96d..8c6813e2d 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_m_m_a.html
@@ -97,12 +97,12 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="mma_8h_source.html">mma.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="attn_2mma_8h_source.html">mma.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-types" name="pub-types"></a>
 Public Types</h2></td></tr>
-<tr class="memitem:ae2c42cb6d0dde785859164c195f4d13c" id="r_ae2c42cb6d0dde785859164c195f4d13c"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a> = <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a>&lt;AccumType, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>&gt;</td></tr>
-<tr class="separator:ae2c42cb6d0dde785859164c195f4d13c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a8231b0e3475077c1381eb8f5daf62e35" id="r_a8231b0e3475077c1381eb8f5daf62e35"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> = <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a>&lt;AccumType, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>&gt;</td></tr>
+<tr class="separator:a8231b0e3475077c1381eb8f5daf62e35"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
@@ -127,6 +127,27 @@ Public Member Functions</h2></td></tr>
 <tr class="separator:a7cf757e9785e23997b1417e024559ed3"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a7b324c992750ed3aaa4c485f15b2f391" id="r_a7b324c992750ed3aaa4c485f15b2f391"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7b324c992750ed3aaa4c485f15b2f391">store_result_safe</a> (device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const</td></tr>
 <tr class="separator:a7b324c992750ed3aaa4c485f15b2f391"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa14406b7298456ac45d23dd3c4642dd8" id="r_aa14406b7298456ac45d23dd3c4642dd8"><td class="memItemLeft" align="right" valign="top">METAL_FUNC&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa14406b7298456ac45d23dd3c4642dd8">BlockMMA</a> (ushort simd_group_id, ushort simd_lane_id)</td></tr>
+<tr class="separator:aa14406b7298456ac45d23dd3c4642dd8"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6a2c2a6d5e767d52c41b42a9d36086b0" id="r_a6a2c2a6d5e767d52c41b42a9d36086b0"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6a2c2a6d5e767d52c41b42a9d36086b0">mma</a> (const threadgroup T *As, const threadgroup T *Bs)</td></tr>
+<tr class="separator:a6a2c2a6d5e767d52c41b42a9d36086b0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0461451ffb5041b6a916ea17ed34288b" id="r_a0461451ffb5041b6a916ea17ed34288b"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0461451ffb5041b6a916ea17ed34288b">store_result</a> (device U *D, const int ldd)</td></tr>
+<tr class="separator:a0461451ffb5041b6a916ea17ed34288b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a081ba538d30d1d02498a7f341e6bd611" id="r_a081ba538d30d1d02498a7f341e6bd611"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a081ba538d30d1d02498a7f341e6bd611">store_result_safe</a> (device U *D, const int ldd, short2 dst_tile_dims)</td></tr>
+<tr class="separator:a081ba538d30d1d02498a7f341e6bd611"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:af653c0808ba4fa9a25286f1febb7baff" id="r_af653c0808ba4fa9a25286f1febb7baff"><td class="memTemplParams" colspan="2">template&lt;typename UnaryEpilogue &gt; </td></tr>
+<tr class="memitem:af653c0808ba4fa9a25286f1febb7baff"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#af653c0808ba4fa9a25286f1febb7baff">apply_epilogue</a> (thread const UnaryEpilogue &amp;epilogue_op)</td></tr>
+<tr class="separator:af653c0808ba4fa9a25286f1febb7baff"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a823c56cbd2086f10272df7284a5247ae" id="r_a823c56cbd2086f10272df7284a5247ae"><td class="memTemplParams" colspan="2">template&lt;typename BinaryEpilogue &gt; </td></tr>
+<tr class="memitem:a823c56cbd2086f10272df7284a5247ae"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a823c56cbd2086f10272df7284a5247ae">apply_epilogue</a> (const device U *C, const int ldc, const int fdc, thread const BinaryEpilogue &amp;epilogue_op)</td></tr>
+<tr class="separator:a823c56cbd2086f10272df7284a5247ae"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a9e48f2d51099ec00171506724faab54a" id="r_a9e48f2d51099ec00171506724faab54a"><td class="memTemplParams" colspan="2">template&lt;typename BinaryEpilogue &gt; </td></tr>
+<tr class="memitem:a9e48f2d51099ec00171506724faab54a"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a9e48f2d51099ec00171506724faab54a">apply_epilogue_safe</a> (const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const BinaryEpilogue &amp;epilogue_op)</td></tr>
+<tr class="separator:a9e48f2d51099ec00171506724faab54a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7cf757e9785e23997b1417e024559ed3" id="r_a7cf757e9785e23997b1417e024559ed3"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7cf757e9785e23997b1417e024559ed3">store_result</a> (device U *D, const int ldd, const device U *C, const int ldc, const int fdc, thread const Epilogue &amp;epilogue_op) const</td></tr>
+<tr class="separator:a7cf757e9785e23997b1417e024559ed3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7b324c992750ed3aaa4c485f15b2f391" id="r_a7b324c992750ed3aaa4c485f15b2f391"><td class="memItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7b324c992750ed3aaa4c485f15b2f391">store_result_safe</a> (device U *D, const int ldd, const device U *C, const int ldc, const int fdc, short2 dst_tile_dims, thread const Epilogue &amp;epilogue_op) const</td></tr>
+<tr class="separator:a7b324c992750ed3aaa4c485f15b2f391"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
 Public Attributes</h2></td></tr>
@@ -152,12 +173,12 @@ Public Attributes</h2></td></tr>
 <tr class="separator:a8fddaa78913cdc8eea5e1cf7d2776330"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ae3f35453b3afbaac9df64ad5966b34a4" id="r_ae3f35453b3afbaac9df64ad5966b34a4"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae3f35453b3afbaac9df64ad5966b34a4">tile_stride_b</a> = <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> * <a class="el" href="#aa71400922babd388177f228c2c82b211">B_str_k</a></td></tr>
 <tr class="separator:ae3f35453b3afbaac9df64ad5966b34a4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af1a138c5e118147dc46475e4a5557e7c" id="r_af1a138c5e118147dc46475e4a5557e7c"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, 1, <a class="el" href="#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af1a138c5e118147dc46475e4a5557e7c">Atile</a></td></tr>
-<tr class="separator:af1a138c5e118147dc46475e4a5557e7c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a19abba19edeb37018da4bd31e01c8e26" id="r_a19abba19edeb37018da4bd31e01c8e26"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, 1, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a19abba19edeb37018da4bd31e01c8e26">Btile</a></td></tr>
-<tr class="separator:a19abba19edeb37018da4bd31e01c8e26"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a81838da5d81e62d372d581be599c5a88" id="r_a81838da5d81e62d372d581be599c5a88"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a81838da5d81e62d372d581be599c5a88">Ctile</a></td></tr>
-<tr class="separator:a81838da5d81e62d372d581be599c5a88"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a47e614120c650f7479db79f23a0df586" id="r_a47e614120c650f7479db79f23a0df586"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, 1, <a class="el" href="#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a47e614120c650f7479db79f23a0df586">Atile</a></td></tr>
+<tr class="separator:a47e614120c650f7479db79f23a0df586"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a44fca27c821764317263047a780977b0" id="r_a44fca27c821764317263047a780977b0"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, 1, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a44fca27c821764317263047a780977b0">Btile</a></td></tr>
+<tr class="separator:a44fca27c821764317263047a780977b0"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a21b0c40d16eced109bd3196186170bc6" id="r_a21b0c40d16eced109bd3196186170bc6"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> &gt;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a21b0c40d16eced109bd3196186170bc6">Ctile</a></td></tr>
+<tr class="separator:a21b0c40d16eced109bd3196186170bc6"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aa85451edf6900fd6af164d4d50889ae3" id="r_aa85451edf6900fd6af164d4d50889ae3"><td class="memItemLeft" align="right" valign="top">short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa85451edf6900fd6af164d4d50889ae3">sm</a></td></tr>
 <tr class="separator:aa85451edf6900fd6af164d4d50889ae3"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ade420e8b811d597345783c324c23a34a" id="r_ade420e8b811d597345783c324c23a34a"><td class="memItemLeft" align="right" valign="top">short&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ade420e8b811d597345783c324c23a34a">sn</a></td></tr>
@@ -168,8 +189,8 @@ Public Attributes</h2></td></tr>
 <tr class="separator:a92f6aeee432f53638447eac842f43eca"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Typedef Documentation</h2>
-<a id="ae2c42cb6d0dde785859164c195f4d13c" name="ae2c42cb6d0dde785859164c195f4d13c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ae2c42cb6d0dde785859164c195f4d13c">&#9670;&#160;</a></span>MMAFrag_acc_t</h2>
+<a id="a8231b0e3475077c1381eb8f5daf62e35" name="a8231b0e3475077c1381eb8f5daf62e35"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a8231b0e3475077c1381eb8f5daf62e35">&#9670;&#160;</a></span>MMAFrag_acc_t</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -177,7 +198,7 @@ Public Attributes</h2></td></tr>
 template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">using <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::MMAFrag_acc_t = <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a>&lt;AccumType, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>&gt;</td>
+          <td class="memname">typedef <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a>&lt; AccumType, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::MMAFrag_acc_t = <a class="el" href="structmlx_1_1steel_1_1_base_m_m_a_frag.html">BaseMMAFrag</a>&lt;AccumType, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>, <a class="el" href="#aee8caec45c1f9e4428586effbfe6137d">kFragSize</a>&gt;</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -186,7 +207,38 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 </div>
 <h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
 <a id="aa14406b7298456ac45d23dd3c4642dd8" name="aa14406b7298456ac45d23dd3c4642dd8"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa14406b7298456ac45d23dd3c4642dd8">&#9670;&#160;</a></span>BlockMMA()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa14406b7298456ac45d23dd3c4642dd8">&#9670;&#160;</a></span>BlockMMA() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::BlockMMA </td>
+          <td>(</td>
+          <td class="paramtype">ushort</td>          <td class="paramname"><span class="paramname"><em>simd_group_id</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">ushort</td>          <td class="paramname"><span class="paramname"><em>simd_lane_id</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa14406b7298456ac45d23dd3c4642dd8" name="aa14406b7298456ac45d23dd3c4642dd8"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa14406b7298456ac45d23dd3c4642dd8">&#9670;&#160;</a></span>BlockMMA() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -218,7 +270,50 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="a823c56cbd2086f10272df7284a5247ae" name="a823c56cbd2086f10272df7284a5247ae"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a823c56cbd2086f10272df7284a5247ae">&#9670;&#160;</a></span>apply_epilogue() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a823c56cbd2086f10272df7284a5247ae">&#9670;&#160;</a></span>apply_epilogue() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename BinaryEpilogue &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::apply_epilogue </td>
+          <td>(</td>
+          <td class="paramtype">const device U *</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>fdc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread const BinaryEpilogue &amp;</td>          <td class="paramname"><span class="paramname"><em>epilogue_op</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a823c56cbd2086f10272df7284a5247ae" name="a823c56cbd2086f10272df7284a5247ae"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a823c56cbd2086f10272df7284a5247ae">&#9670;&#160;</a></span>apply_epilogue() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -261,7 +356,36 @@ template&lt;typename BinaryEpilogue &gt; </div>
 </div>
 </div>
 <a id="af653c0808ba4fa9a25286f1febb7baff" name="af653c0808ba4fa9a25286f1febb7baff"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af653c0808ba4fa9a25286f1febb7baff">&#9670;&#160;</a></span>apply_epilogue() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#af653c0808ba4fa9a25286f1febb7baff">&#9670;&#160;</a></span>apply_epilogue() <span class="overload">[3/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename UnaryEpilogue &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::apply_epilogue </td>
+          <td>(</td>
+          <td class="paramtype">thread const UnaryEpilogue &amp;</td>          <td class="paramname"><span class="paramname"><em>epilogue_op</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="af653c0808ba4fa9a25286f1febb7baff" name="af653c0808ba4fa9a25286f1febb7baff"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#af653c0808ba4fa9a25286f1febb7baff">&#9670;&#160;</a></span>apply_epilogue() <span class="overload">[4/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -290,7 +414,55 @@ template&lt;typename UnaryEpilogue &gt; </div>
 </div>
 </div>
 <a id="a9e48f2d51099ec00171506724faab54a" name="a9e48f2d51099ec00171506724faab54a"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a9e48f2d51099ec00171506724faab54a">&#9670;&#160;</a></span>apply_epilogue_safe()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a9e48f2d51099ec00171506724faab54a">&#9670;&#160;</a></span>apply_epilogue_safe() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename BinaryEpilogue &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::apply_epilogue_safe </td>
+          <td>(</td>
+          <td class="paramtype">const device U *</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>fdc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">short2</td>          <td class="paramname"><span class="paramname"><em>dst_tile_dims</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread const BinaryEpilogue &amp;</td>          <td class="paramname"><span class="paramname"><em>epilogue_op</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a9e48f2d51099ec00171506724faab54a" name="a9e48f2d51099ec00171506724faab54a"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a9e48f2d51099ec00171506724faab54a">&#9670;&#160;</a></span>apply_epilogue_safe() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -338,7 +510,38 @@ template&lt;typename BinaryEpilogue &gt; </div>
 </div>
 </div>
 <a id="a6a2c2a6d5e767d52c41b42a9d36086b0" name="a6a2c2a6d5e767d52c41b42a9d36086b0"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a6a2c2a6d5e767d52c41b42a9d36086b0">&#9670;&#160;</a></span>mma()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a6a2c2a6d5e767d52c41b42a9d36086b0">&#9670;&#160;</a></span>mma() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::mma </td>
+          <td>(</td>
+          <td class="paramtype">const threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>As</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>Bs</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a6a2c2a6d5e767d52c41b42a9d36086b0" name="a6a2c2a6d5e767d52c41b42a9d36086b0"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6a2c2a6d5e767d52c41b42a9d36086b0">&#9670;&#160;</a></span>mma() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -369,7 +572,38 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 </div>
 </div>
 <a id="a0461451ffb5041b6a916ea17ed34288b" name="a0461451ffb5041b6a916ea17ed34288b"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a0461451ffb5041b6a916ea17ed34288b">&#9670;&#160;</a></span>store_result() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a0461451ffb5041b6a916ea17ed34288b">&#9670;&#160;</a></span>store_result() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::store_result </td>
+          <td>(</td>
+          <td class="paramtype">device U *</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldd</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a0461451ffb5041b6a916ea17ed34288b" name="a0461451ffb5041b6a916ea17ed34288b"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a0461451ffb5041b6a916ea17ed34288b">&#9670;&#160;</a></span>store_result() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -400,7 +634,58 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 </div>
 </div>
 <a id="a7cf757e9785e23997b1417e024559ed3" name="a7cf757e9785e23997b1417e024559ed3"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7cf757e9785e23997b1417e024559ed3">&#9670;&#160;</a></span>store_result() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a7cf757e9785e23997b1417e024559ed3">&#9670;&#160;</a></span>store_result() <span class="overload">[3/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::store_result </td>
+          <td>(</td>
+          <td class="paramtype">device U *</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldd</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device U *</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>fdc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread const Epilogue &amp;</td>          <td class="paramname"><span class="paramname"><em>epilogue_op</em></span>&#160;) const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a7cf757e9785e23997b1417e024559ed3" name="a7cf757e9785e23997b1417e024559ed3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7cf757e9785e23997b1417e024559ed3">&#9670;&#160;</a></span>store_result() <span class="overload">[4/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -451,7 +736,63 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 </div>
 </div>
 <a id="a7b324c992750ed3aaa4c485f15b2f391" name="a7b324c992750ed3aaa4c485f15b2f391"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7b324c992750ed3aaa4c485f15b2f391">&#9670;&#160;</a></span>store_result_safe() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a7b324c992750ed3aaa4c485f15b2f391">&#9670;&#160;</a></span>store_result_safe() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::store_result_safe </td>
+          <td>(</td>
+          <td class="paramtype">device U *</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldd</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device U *</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>fdc</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">short2</td>          <td class="paramname"><span class="paramname"><em>dst_tile_dims</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread const Epilogue &amp;</td>          <td class="paramname"><span class="paramname"><em>epilogue_op</em></span>&#160;) const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a7b324c992750ed3aaa4c485f15b2f391" name="a7b324c992750ed3aaa4c485f15b2f391"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7b324c992750ed3aaa4c485f15b2f391">&#9670;&#160;</a></span>store_result_safe() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -507,7 +848,43 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 </div>
 </div>
 <a id="a081ba538d30d1d02498a7f341e6bd611" name="a081ba538d30d1d02498a7f341e6bd611"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a081ba538d30d1d02498a7f341e6bd611">&#9670;&#160;</a></span>store_result_safe() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a081ba538d30d1d02498a7f341e6bd611">&#9670;&#160;</a></span>store_result_safe() <span class="overload">[3/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::store_result_safe </td>
+          <td>(</td>
+          <td class="paramtype">device U *</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ldd</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">short2</td>          <td class="paramname"><span class="paramname"><em>dst_tile_dims</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a081ba538d30d1d02498a7f341e6bd611" name="a081ba538d30d1d02498a7f341e6bd611"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a081ba538d30d1d02498a7f341e6bd611">&#9670;&#160;</a></span>store_result_safe() <span class="overload">[4/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -591,8 +968,8 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 
 </div>
 </div>
-<a id="af1a138c5e118147dc46475e4a5557e7c" name="af1a138c5e118147dc46475e4a5557e7c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#af1a138c5e118147dc46475e4a5557e7c">&#9670;&#160;</a></span>Atile</h2>
+<a id="a47e614120c650f7479db79f23a0df586" name="a47e614120c650f7479db79f23a0df586"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a47e614120c650f7479db79f23a0df586">&#9670;&#160;</a></span>Atile</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -600,7 +977,7 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt;AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, 1, <a class="el" href="#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a>&gt; <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::Atile</td>
+          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, 1, <a class="el" href="#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::Atile</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -655,8 +1032,8 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 
 </div>
 </div>
-<a id="a19abba19edeb37018da4bd31e01c8e26" name="a19abba19edeb37018da4bd31e01c8e26"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a19abba19edeb37018da4bd31e01c8e26">&#9670;&#160;</a></span>Btile</h2>
+<a id="a44fca27c821764317263047a780977b0" name="a44fca27c821764317263047a780977b0"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a44fca27c821764317263047a780977b0">&#9670;&#160;</a></span>Btile</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -664,15 +1041,15 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt;AccumType, 1, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a>&gt; <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::Btile</td>
+          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, 1, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::Btile</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a81838da5d81e62d372d581be599c5a88" name="a81838da5d81e62d372d581be599c5a88"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a81838da5d81e62d372d581be599c5a88">&#9670;&#160;</a></span>Ctile</h2>
+<a id="a21b0c40d16eced109bd3196186170bc6" name="a21b0c40d16eced109bd3196186170bc6"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a21b0c40d16eced109bd3196186170bc6">&#9670;&#160;</a></span>Ctile</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -680,7 +1057,7 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, short lda_tgp, short ldb_tgp, typename AccumType  = float, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt;AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#ae2c42cb6d0dde785859164c195f4d13c">MMAFrag_acc_t</a>&gt; <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::Ctile</td>
+          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">MMATile</a>&lt; AccumType, <a class="el" href="#aba5f749fdf32d8bd9d9e29f2a9ae4591">TM</a>, <a class="el" href="#a706ae779c1f8d2eb18f19c248567d424">TN</a>, <a class="el" href="#a8231b0e3475077c1381eb8f5daf62e35">MMAFrag_acc_t</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">mlx::steel::BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, lda_tgp, ldb_tgp, AccumType, Epilogue &gt;::Ctile</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -831,8 +1208,9 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="mma_8h_source.html">mma.h</a></li>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2mma_8h_source.html">mma.h</a></li>
+<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_2mma_8h_source.html">mma.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_swizzle-members.html b/docs/build/html/structmlx_1_1steel_1_1_block_swizzle-members.html
index 6df11e2c0..6ce3c0ddf 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_swizzle-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_swizzle-members.html
@@ -95,6 +95,7 @@ $(function(){ initResizable(false); });
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html">mlx::steel::BlockSwizzle</a>, including all inherited members.</p>
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760">swizzle</a>(uint3 tid, const int swizzle_log)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html">mlx::steel::BlockSwizzle</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html#a98e558d63826d2aaa06d3e65a06d2760">swizzle</a>(uint3 tid, const int swizzle_log)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_block_swizzle.html">mlx::steel::BlockSwizzle</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_block_swizzle.html b/docs/build/html/structmlx_1_1steel_1_1_block_swizzle.html
index e620d2679..588eb0c35 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_block_swizzle.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_block_swizzle.html
@@ -95,16 +95,18 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
 <tr class="memitem:a98e558d63826d2aaa06d3e65a06d2760" id="r_a98e558d63826d2aaa06d3e65a06d2760"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC int2&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a98e558d63826d2aaa06d3e65a06d2760">swizzle</a> (uint3 tid, const int swizzle_log)</td></tr>
 <tr class="separator:a98e558d63826d2aaa06d3e65a06d2760"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a98e558d63826d2aaa06d3e65a06d2760" id="r_a98e558d63826d2aaa06d3e65a06d2760"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC int2&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a98e558d63826d2aaa06d3e65a06d2760">swizzle</a> (uint3 tid, const int swizzle_log)</td></tr>
+<tr class="separator:a98e558d63826d2aaa06d3e65a06d2760"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="a98e558d63826d2aaa06d3e65a06d2760" name="a98e558d63826d2aaa06d3e65a06d2760"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a98e558d63826d2aaa06d3e65a06d2760">&#9670;&#160;</a></span>swizzle()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a98e558d63826d2aaa06d3e65a06d2760">&#9670;&#160;</a></span>swizzle() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -132,7 +134,37 @@ Static Public Member Functions</h2></td></tr>
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<a id="a98e558d63826d2aaa06d3e65a06d2760" name="a98e558d63826d2aaa06d3e65a06d2760"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a98e558d63826d2aaa06d3e65a06d2760">&#9670;&#160;</a></span>swizzle() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC int2 mlx::steel::BlockSwizzle::swizzle </td>
+          <td>(</td>
+          <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>tid</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>swizzle_log</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_c_shape-members.html b/docs/build/html/structmlx_1_1steel_1_1_c_shape-members.html
new file mode 100644
index 000000000..55003ccbf
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_c_shape-members.html
@@ -0,0 +1,106 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html">CShape</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">mlx::steel::CShape&lt; R, C &gt; Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_c_shape.html">mlx::steel::CShape&lt; R, C &gt;</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html#a01b09227356b6a682a0694523a8e6901">kCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html">mlx::steel::CShape&lt; R, C &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html#a5caf36cb9acf9f90ba59a9b0b4197993">kRows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html">mlx::steel::CShape&lt; R, C &gt;</a></td><td class="entry"></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_c_shape.html b/docs/build/html/structmlx_1_1steel_1_1_c_shape.html
new file mode 100644
index 000000000..42e909874
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_c_shape.html
@@ -0,0 +1,150 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx::steel::CShape&lt; R, C &gt; Struct Template Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_c_shape.html">CShape</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-attribs">Public Attributes</a> &#124;
+<a href="structmlx_1_1steel_1_1_c_shape-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">mlx::steel::CShape&lt; R, C &gt; Struct Template Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="attn_2loader_8h_source.html">loader.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
+Public Attributes</h2></td></tr>
+<tr class="memitem:a5caf36cb9acf9f90ba59a9b0b4197993" id="r_a5caf36cb9acf9f90ba59a9b0b4197993"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5caf36cb9acf9f90ba59a9b0b4197993">kRows</a> = R</td></tr>
+<tr class="separator:a5caf36cb9acf9f90ba59a9b0b4197993"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a01b09227356b6a682a0694523a8e6901" id="r_a01b09227356b6a682a0694523a8e6901"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a01b09227356b6a682a0694523a8e6901">kCols</a> = C</td></tr>
+<tr class="separator:a01b09227356b6a682a0694523a8e6901"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Data Documentation</h2>
+<a id="a01b09227356b6a682a0694523a8e6901" name="a01b09227356b6a682a0694523a8e6901"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a01b09227356b6a682a0694523a8e6901">&#9670;&#160;</a></span>kCols</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int R, int C&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int <a class="el" href="structmlx_1_1steel_1_1_c_shape.html">mlx::steel::CShape</a>&lt; R, C &gt;::kCols = C</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a5caf36cb9acf9f90ba59a9b0b4197993" name="a5caf36cb9acf9f90ba59a9b0b4197993"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a5caf36cb9acf9f90ba59a9b0b4197993">&#9670;&#160;</a></span>kRows</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;int R, int C&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int <a class="el" href="structmlx_1_1steel_1_1_c_shape.html">mlx::steel::CShape</a>&lt; R, C &gt;::kRows = R</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2loader_8h_source.html">loader.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel-members.html b/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel-members.html
index bc9494fa3..88cd48255 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel-members.html
@@ -95,9 +95,11 @@ $(function(){ initResizable(false); });
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a>, including all inherited members.</p>
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop</a>(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa8a04ed74d2259f99b337d4662c64d83">loader_a_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#aa98f32278b5fd98c93ae5483c3596395">loader_b_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#add8c6a31011a4895667c2a94a5af3782">mma_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop</a>(threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread loader_a_t &amp;loader_a, thread loader_b_t &amp;loader_b, thread mma_t &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, LoopAlignment&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a98b6ec692580510081e2aa887a61944b">loader_a_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1a115d5af0fb6e260165adba2e377635">loader_b_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ae52eb09c9478cd4f199662346ac0c83e">mma_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">run</a>(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a00e55d4a161758350ed7310817d2d2a5">run</a>(const device T *A, const device T *B, device U *D, const constant GEMMParams *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#a1ec583584e69dcbbb72106390a4fc5da">tgp_mem_size</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html#ac00b149d76a903c2f91b0f477dc5037f">tgp_mem_size_a</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;</a></td><td class="entry"></td></tr>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel.html b/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel.html
index 86d32c6b8..a058daa99 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_g_e_m_m_kernel.html
@@ -97,21 +97,26 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="gemm_8h_source.html">gemm.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="attn_8h_source.html">attn.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-types" name="pub-types"></a>
 Public Types</h2></td></tr>
-<tr class="memitem:aa8a04ed74d2259f99b337d4662c64d83" id="r_aa8a04ed74d2259f99b337d4662c64d83"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa8a04ed74d2259f99b337d4662c64d83">loader_a_t</a></td></tr>
-<tr class="separator:aa8a04ed74d2259f99b337d4662c64d83"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:aa98f32278b5fd98c93ae5483c3596395" id="r_aa98f32278b5fd98c93ae5483c3596395"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa98f32278b5fd98c93ae5483c3596395">loader_b_t</a></td></tr>
-<tr class="separator:aa98f32278b5fd98c93ae5483c3596395"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:add8c6a31011a4895667c2a94a5af3782" id="r_add8c6a31011a4895667c2a94a5af3782"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#add8c6a31011a4895667c2a94a5af3782">mma_t</a></td></tr>
-<tr class="separator:add8c6a31011a4895667c2a94a5af3782"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a98b6ec692580510081e2aa887a61944b" id="r_a98b6ec692580510081e2aa887a61944b"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a98b6ec692580510081e2aa887a61944b">loader_a_t</a></td></tr>
+<tr class="separator:a98b6ec692580510081e2aa887a61944b"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1a115d5af0fb6e260165adba2e377635" id="r_a1a115d5af0fb6e260165adba2e377635"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1a115d5af0fb6e260165adba2e377635">loader_b_t</a></td></tr>
+<tr class="separator:a1a115d5af0fb6e260165adba2e377635"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae52eb09c9478cd4f199662346ac0c83e" id="r_ae52eb09c9478cd4f199662346ac0c83e"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae52eb09c9478cd4f199662346ac0c83e">mma_t</a></td></tr>
+<tr class="separator:ae52eb09c9478cd4f199662346ac0c83e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
 <tr class="memitem:a756d7bbcc96e2919cd65eec4bc135780" id="r_a756d7bbcc96e2919cd65eec4bc135780"><td class="memTemplParams" colspan="2">template&lt;bool M_aligned, bool N_aligned, bool K_aligned_&gt; </td></tr>
-<tr class="memitem:a756d7bbcc96e2919cd65eec4bc135780"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop</a> (threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread <a class="el" href="#aa8a04ed74d2259f99b337d4662c64d83">loader_a_t</a> &amp;loader_a, thread <a class="el" href="#aa98f32278b5fd98c93ae5483c3596395">loader_b_t</a> &amp;loader_b, thread <a class="el" href="#add8c6a31011a4895667c2a94a5af3782">mma_t</a> &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, <a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a>&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})</td></tr>
+<tr class="memitem:a756d7bbcc96e2919cd65eec4bc135780"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop</a> (threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread <a class="el" href="#a98b6ec692580510081e2aa887a61944b">loader_a_t</a> &amp;loader_a, thread <a class="el" href="#a1a115d5af0fb6e260165adba2e377635">loader_b_t</a> &amp;loader_b, thread <a class="el" href="#ae52eb09c9478cd4f199662346ac0c83e">mma_t</a> &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, <a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a>&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})</td></tr>
+<tr class="separator:a756d7bbcc96e2919cd65eec4bc135780"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a00e55d4a161758350ed7310817d2d2a5" id="r_a00e55d4a161758350ed7310817d2d2a5"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a00e55d4a161758350ed7310817d2d2a5">run</a> (const device T *A, const device T *B, device U *D, const constant <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html">GEMMParams</a> *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</td></tr>
+<tr class="separator:a00e55d4a161758350ed7310817d2d2a5"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a756d7bbcc96e2919cd65eec4bc135780" id="r_a756d7bbcc96e2919cd65eec4bc135780"><td class="memTemplParams" colspan="2">template&lt;bool M_aligned, bool N_aligned, bool K_aligned_&gt; </td></tr>
+<tr class="memitem:a756d7bbcc96e2919cd65eec4bc135780"><td class="memTemplItemLeft" align="right" valign="top">static METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a756d7bbcc96e2919cd65eec4bc135780">gemm_loop</a> (threadgroup T *As, threadgroup T *Bs, const int gemm_k_iterations, thread <a class="el" href="#a98b6ec692580510081e2aa887a61944b">loader_a_t</a> &amp;loader_a, thread <a class="el" href="#a1a115d5af0fb6e260165adba2e377635">loader_b_t</a> &amp;loader_b, thread <a class="el" href="#ae52eb09c9478cd4f199662346ac0c83e">mma_t</a> &amp;mma_op, thread const short &amp;tgp_bm, thread const short &amp;tgp_bn, thread const short &amp;lbk, <a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a>&lt; M_aligned, N_aligned, K_aligned_ &gt; l={})</td></tr>
 <tr class="separator:a756d7bbcc96e2919cd65eec4bc135780"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a00e55d4a161758350ed7310817d2d2a5" id="r_a00e55d4a161758350ed7310817d2d2a5"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a00e55d4a161758350ed7310817d2d2a5">run</a> (const device T *A, const device T *B, device U *D, const constant <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html">GEMMParams</a> *params, threadgroup T *As, threadgroup T *Bs, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)</td></tr>
 <tr class="separator:a00e55d4a161758350ed7310817d2d2a5"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -132,8 +137,8 @@ Public Attributes</h2></td></tr>
 <tr class="separator:a9058ddb73e30e83fb9c548ba22817d64"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Typedef Documentation</h2>
-<a id="aa8a04ed74d2259f99b337d4662c64d83" name="aa8a04ed74d2259f99b337d4662c64d83"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa8a04ed74d2259f99b337d4662c64d83">&#9670;&#160;</a></span>loader_a_t</h2>
+<a id="a98b6ec692580510081e2aa887a61944b" name="a98b6ec692580510081e2aa887a61944b"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a98b6ec692580510081e2aa887a61944b">&#9670;&#160;</a></span>loader_a_t</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -141,7 +146,7 @@ Public Attributes</h2></td></tr>
 template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, bool MN_aligned, bool K_aligned, typename AccumType  = typename AccumHelper&lt;T&gt;::accum_type, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">using <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::loader_a_t</td>
+          <td class="memname">typedef <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt; T, transpose_a ? BK :BM, transpose_a ? BM :BK, transpose_a ? BM+<a class="el" href="#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> :BK+<a class="el" href="#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>, !transpose_a, <a class="el" href="#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::loader_a_t</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -152,13 +157,13 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 <div class="line">      transpose_a ? BM + <a class="code hl_variable" href="#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> : BK + <a class="code hl_variable" href="#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>,</div>
 <div class="line">      !transpose_a,</div>
 <div class="line">      <a class="code hl_variable" href="#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a>&gt;</div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a9058ddb73e30e83fb9c548ba22817d64"><div class="ttname"><a href="#a9058ddb73e30e83fb9c548ba22817d64">mlx::steel::GEMMKernel::tgp_size</a></div><div class="ttdeci">STEEL_CONST short tgp_size</div><div class="ttdef"><b>Definition</b> gemm.h:46</div></div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad547704ccbff6c2076abeffa6628c5a0"><div class="ttname"><a href="#ad547704ccbff6c2076abeffa6628c5a0">mlx::steel::GEMMKernel::tgp_padding_a</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_a</div><div class="ttdef"><b>Definition</b> gemm.h:38</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_a9058ddb73e30e83fb9c548ba22817d64"><div class="ttname"><a href="#a9058ddb73e30e83fb9c548ba22817d64">mlx::steel::GEMMKernel::tgp_size</a></div><div class="ttdeci">STEEL_CONST short tgp_size</div><div class="ttdef"><b>Definition</b> attn.h:47</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad547704ccbff6c2076abeffa6628c5a0"><div class="ttname"><a href="#ad547704ccbff6c2076abeffa6628c5a0">mlx::steel::GEMMKernel::tgp_padding_a</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_a</div><div class="ttdef"><b>Definition</b> attn.h:39</div></div>
 </div><!-- fragment -->
 </div>
 </div>
-<a id="aa98f32278b5fd98c93ae5483c3596395" name="aa98f32278b5fd98c93ae5483c3596395"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa98f32278b5fd98c93ae5483c3596395">&#9670;&#160;</a></span>loader_b_t</h2>
+<a id="a1a115d5af0fb6e260165adba2e377635" name="a1a115d5af0fb6e260165adba2e377635"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1a115d5af0fb6e260165adba2e377635">&#9670;&#160;</a></span>loader_b_t</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -166,7 +171,7 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, bool MN_aligned, bool K_aligned, typename AccumType  = typename AccumHelper&lt;T&gt;::accum_type, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">using <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::loader_b_t</td>
+          <td class="memname">typedef <a class="el" href="structmlx_1_1steel_1_1_block_loader.html">BlockLoader</a>&lt; T, transpose_b ? BN :BK, transpose_b ? BK :BN, transpose_b ? BK+<a class="el" href="#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> :BN+<a class="el" href="#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>, transpose_b, <a class="el" href="#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a> &gt; <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::loader_b_t</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -177,12 +182,12 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 <div class="line">      transpose_b ? BK + <a class="code hl_variable" href="#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> : BN + <a class="code hl_variable" href="#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>,</div>
 <div class="line">      transpose_b,</div>
 <div class="line">      <a class="code hl_variable" href="#a9058ddb73e30e83fb9c548ba22817d64">tgp_size</a>&gt;</div>
-<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad1b03941e869017558423c08b08bc094"><div class="ttname"><a href="#ad1b03941e869017558423c08b08bc094">mlx::steel::GEMMKernel::tgp_padding_b</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_b</div><div class="ttdef"><b>Definition</b> gemm.h:39</div></div>
+<div class="ttc" id="astructmlx_1_1steel_1_1_g_e_m_m_kernel_html_ad1b03941e869017558423c08b08bc094"><div class="ttname"><a href="#ad1b03941e869017558423c08b08bc094">mlx::steel::GEMMKernel::tgp_padding_b</a></div><div class="ttdeci">STEEL_CONST short tgp_padding_b</div><div class="ttdef"><b>Definition</b> attn.h:40</div></div>
 </div><!-- fragment -->
 </div>
 </div>
-<a id="add8c6a31011a4895667c2a94a5af3782" name="add8c6a31011a4895667c2a94a5af3782"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#add8c6a31011a4895667c2a94a5af3782">&#9670;&#160;</a></span>mma_t</h2>
+<a id="ae52eb09c9478cd4f199662346ac0c83e" name="ae52eb09c9478cd4f199662346ac0c83e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae52eb09c9478cd4f199662346ac0c83e">&#9670;&#160;</a></span>mma_t</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -190,7 +195,7 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, bool MN_aligned, bool K_aligned, typename AccumType  = typename AccumHelper&lt;T&gt;::accum_type, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">using <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::mma_t</td>
+          <td class="memname">typedef <a class="el" href="structmlx_1_1steel_1_1_block_m_m_a.html">BlockMMA</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, transpose_a ? BM+<a class="el" href="#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a> :BK+<a class="el" href="#ad547704ccbff6c2076abeffa6628c5a0">tgp_padding_a</a>, transpose_b ? BK+<a class="el" href="#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a> :BN+<a class="el" href="#ad1b03941e869017558423c08b08bc094">tgp_padding_b</a>, AccumType, Epilogue &gt; <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::mma_t</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -213,7 +218,7 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="a756d7bbcc96e2919cd65eec4bc135780" name="a756d7bbcc96e2919cd65eec4bc135780"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a756d7bbcc96e2919cd65eec4bc135780">&#9670;&#160;</a></span>gemm_loop()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a756d7bbcc96e2919cd65eec4bc135780">&#9670;&#160;</a></span>gemm_loop() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -243,17 +248,90 @@ template&lt;bool M_aligned, bool N_aligned, bool K_aligned_&gt; </div>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#aa8a04ed74d2259f99b337d4662c64d83">loader_a_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>loader_a</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a98b6ec692580510081e2aa887a61944b">loader_a_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>loader_a</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#aa98f32278b5fd98c93ae5483c3596395">loader_b_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>loader_b</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#a1a115d5af0fb6e260165adba2e377635">loader_b_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>loader_b</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
           <td></td>
-          <td class="paramtype">thread <a class="el" href="#add8c6a31011a4895667c2a94a5af3782">mma_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>mma_op</em></span>, </td>
+          <td class="paramtype">thread <a class="el" href="#ae52eb09c9478cd4f199662346ac0c83e">mma_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>mma_op</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread const short &amp;</td>          <td class="paramname"><span class="paramname"><em>tgp_bm</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread const short &amp;</td>          <td class="paramname"><span class="paramname"><em>tgp_bn</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread const short &amp;</td>          <td class="paramname"><span class="paramname"><em>lbk</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype"><a class="el" href="structmlx_1_1steel_1_1_loop_alignment.html">LoopAlignment</a>&lt; M_aligned, N_aligned, K_aligned_ &gt;</td>          <td class="paramname"><span class="paramname"><em>l</em></span><span class="paramdefsep"> = </span><span class="paramdefval">{}</span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a756d7bbcc96e2919cd65eec4bc135780" name="a756d7bbcc96e2919cd65eec4bc135780"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a756d7bbcc96e2919cd65eec4bc135780">&#9670;&#160;</a></span>gemm_loop() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, bool MN_aligned, bool K_aligned, typename AccumType  = typename AccumHelper&lt;T&gt;::accum_type, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;bool M_aligned, bool N_aligned, bool K_aligned_&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::gemm_loop </td>
+          <td>(</td>
+          <td class="paramtype">threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>As</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>Bs</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>gemm_k_iterations</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a98b6ec692580510081e2aa887a61944b">loader_a_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>loader_a</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#a1a115d5af0fb6e260165adba2e377635">loader_b_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>loader_b</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">thread <a class="el" href="#ae52eb09c9478cd4f199662346ac0c83e">mma_t</a> &amp;</td>          <td class="paramname"><span class="paramname"><em>mma_op</em></span>, </td>
         </tr>
         <tr>
           <td class="paramkey"></td>
@@ -286,7 +364,78 @@ template&lt;bool M_aligned, bool N_aligned, bool K_aligned_&gt; </div>
 </div>
 </div>
 <a id="a00e55d4a161758350ed7310817d2d2a5" name="a00e55d4a161758350ed7310817d2d2a5"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a00e55d4a161758350ed7310817d2d2a5">&#9670;&#160;</a></span>run()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a00e55d4a161758350ed7310817d2d2a5">&#9670;&#160;</a></span>run() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bool transpose_a, bool transpose_b, bool MN_aligned, bool K_aligned, typename AccumType  = typename AccumHelper&lt;T&gt;::accum_type, typename Epilogue  = TransformNone&lt;U, AccumType&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_kernel.html">mlx::steel::GEMMKernel</a>&lt; T, U, BM, BN, BK, WM, WN, transpose_a, transpose_b, MN_aligned, K_aligned, AccumType, Epilogue &gt;::run </td>
+          <td>(</td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const device T *</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">device U *</td>          <td class="paramname"><span class="paramname"><em>D</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const constant <a class="el" href="structmlx_1_1steel_1_1_g_e_m_m_params.html">GEMMParams</a> *</td>          <td class="paramname"><span class="paramname"><em>params</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>As</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">threadgroup T *</td>          <td class="paramname"><span class="paramname"><em>Bs</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_lane_id</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint</td>          <td class="paramname"><span class="paramname"><em>simd_group_id</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>tid</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">uint3</td>          <td class="paramname"><span class="paramname"><em>lid</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a00e55d4a161758350ed7310817d2d2a5" name="a00e55d4a161758350ed7310817d2d2a5"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a00e55d4a161758350ed7310817d2d2a5">&#9670;&#160;</a></span>run() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -457,7 +606,8 @@ template&lt;typename T , typename U , int BM, int BN, int BK, int WM, int WN, bo
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_8h_source.html">attn.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_8h_source.html">gemm.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_layout2_d-members.html b/docs/build/html/structmlx_1_1steel_1_1_layout2_d-members.html
new file mode 100644
index 000000000..01173df91
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_layout2_d-members.html
@@ -0,0 +1,106 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">Layout2D</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">mlx::steel::Layout2D&lt; Shape, Layout &gt; Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html#a6beedf1677ee1b192fb48c83a29ac8a1">layout</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html#a23183747ab1ddbdd3f1fcac6d0faa2cd">shape</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">mlx::steel::Layout2D&lt; Shape, Layout &gt;</a></td><td class="entry"></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_layout2_d.html b/docs/build/html/structmlx_1_1steel_1_1_layout2_d.html
new file mode 100644
index 000000000..25d0a50a8
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_layout2_d.html
@@ -0,0 +1,150 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: mlx::steel::Layout2D&lt; Shape, Layout &gt; Struct Template Reference</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">Layout2D</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="summary">
+<a href="#pub-attribs">Public Attributes</a> &#124;
+<a href="structmlx_1_1steel_1_1_layout2_d-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">mlx::steel::Layout2D&lt; Shape, Layout &gt; Struct Template Reference</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p><code>#include &lt;<a class="el" href="attn_2mma_8h_source.html">mma.h</a>&gt;</code></p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
+Public Attributes</h2></td></tr>
+<tr class="memitem:a23183747ab1ddbdd3f1fcac6d0faa2cd" id="r_a23183747ab1ddbdd3f1fcac6d0faa2cd"><td class="memItemLeft" align="right" valign="top">Shape&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a23183747ab1ddbdd3f1fcac6d0faa2cd">shape</a></td></tr>
+<tr class="separator:a23183747ab1ddbdd3f1fcac6d0faa2cd"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6beedf1677ee1b192fb48c83a29ac8a1" id="r_a6beedf1677ee1b192fb48c83a29ac8a1"><td class="memItemLeft" align="right" valign="top">Layout&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6beedf1677ee1b192fb48c83a29ac8a1">layout</a></td></tr>
+<tr class="separator:a6beedf1677ee1b192fb48c83a29ac8a1"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+<h2 class="groupheader">Member Data Documentation</h2>
+<a id="a6beedf1677ee1b192fb48c83a29ac8a1" name="a6beedf1677ee1b192fb48c83a29ac8a1"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6beedf1677ee1b192fb48c83a29ac8a1">&#9670;&#160;</a></span>layout</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename Shape , typename Layout &gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">Layout <a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">mlx::steel::Layout2D</a>&lt; Shape, Layout &gt;::layout</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a23183747ab1ddbdd3f1fcac6d0faa2cd" name="a23183747ab1ddbdd3f1fcac6d0faa2cd"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a23183747ab1ddbdd3f1fcac6d0faa2cd">&#9670;&#160;</a></span>shape</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename Shape , typename Layout &gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname">Shape <a class="el" href="structmlx_1_1steel_1_1_layout2_d.html">mlx::steel::Layout2D</a>&lt; Shape, Layout &gt;::shape</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following file:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2mma_8h_source.html">mma.h</a></li>
+</ul>
+</div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_loop_alignment.html b/docs/build/html/structmlx_1_1steel_1_1_loop_alignment.html
index a89968240..8967088d6 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_loop_alignment.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_loop_alignment.html
@@ -92,8 +92,9 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="gemm_8h_source.html">gemm.h</a>&gt;</code></p>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<p><code>#include &lt;<a class="el" href="attn_8h_source.html">attn.h</a>&gt;</code></p>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_8h_source.html">attn.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_8h_source.html">gemm.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile-members.html b/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile-members.html
index f185357c2..732103eec 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile-members.html
@@ -95,32 +95,49 @@ $(function(){ initResizable(false); });
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a>, including all inherited members.</p>
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">clear</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">elems</a>() const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(const short i, const short j)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">frag_at</a>(const short i, const short j) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5976565323f2e30479158c14f4b1bfef">frag_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">kCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7">clear</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a824409bc107330805853f932e80a7628">elem_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">elems</a>() const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a>()</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae21bb7cce701290de84c6015e064d8a1">elems</a>() const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(const short i, const short j)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">frag_at</a>(const short i, const short j) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a>(const short i, const short j)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad476e1d9a12178fb35c207312339e485">frag_at</a>(const short i, const short j) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a948784652e93830887ee8ad506ec3257">kCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1ea49efd92696b15302ee4b52ecd548c">kColsPerThread</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a98357339ec98f804a1b12597937b318f">kElemsPerTile</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ad095371db98e7c335ec41ca77c10f906">kFragCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a594142f957ffb99296a243f7af7b59e7">kFragRows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a60ea6b8ff2923b7fe6f598e74ac54323">kRows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">load</a>(const threadgroup U *src)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9">load</a>(const device U *src, const int ld)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a5b1d1c85a5046108a4e38bdc5a0ea74e">kRowsPerThread</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1d126b14910385ab644e224ac1d0307a">kTileRows</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">load</a>(const threadgroup U *src)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9">load</a>(const device U *src, const int ld)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa5426c6beabfb3ee41b58f01b3392a96">load</a>(const threadgroup U *src)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa9e484d8cae936503898d5b772c573f9">load</a>(const device U *src, const int ld)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">load_safe</a>(const device U *src, const int ld, const short2 src_tile_dims)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3a4af67813908109da08ce7352f82da">load_safe</a>(const device U *src, const int ld, const short2 src_tile_dims)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mat_at</a>(const short i, const short j)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a28306efc1a903b80219c8bb16dc5b190">mat_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
-  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#abe33de70e34300745bad9aa822fd0382">MMAFrag_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a323a4f38cd0693bf333832bb4258b28e">mat_at</a>(const short i, const short j)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a6dadcd666afb3759a11094e754560dd4">MMAFrag_t</a> typedef</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">MMATile</a>() thread</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa3fb310dd08ec23c334511f7b316d1b6">MMATile</a>() thread</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a3d0d5b9c7962658cc6d5afbbbb2f19e2">row_bin_op</a>(thread T vals[kRowsPerThread])</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#aa0ad5cb750ace934bf230385d8bd9f88">row_reduce</a>(thread T vals[kRowsPerThread]) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98">store</a>(threadgroup U *dst) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f">store</a>(device U *dst, const int ld) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a2aadaa3239cb3af0c2ee8af9b88c8a98">store</a>(threadgroup U *dst) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a752f708e4fe5ef37fdd902dae153179f">store</a>(device U *dst, const int ld) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba">store_safe</a>(device U *dst, const int ld, const short2 dst_tile_dims) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
-  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a57703f522c7409dbe2c0a68bb7acc2ba">store_safe</a>(device U *dst, const int ld, const short2 dst_tile_dims) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;</a></td><td class="entry"></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile.html b/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile.html
index e2cb867ec..c0806b736 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_m_m_a_tile.html
@@ -97,18 +97,18 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="mma_8h_source.html">mma.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="attn_2mma_8h_source.html">mma.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-types" name="pub-types"></a>
 Public Types</h2></td></tr>
-<tr class="memitem:abe33de70e34300745bad9aa822fd0382" id="r_abe33de70e34300745bad9aa822fd0382"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abe33de70e34300745bad9aa822fd0382">MMAFrag_t</a> = MMAFrag_</td></tr>
-<tr class="separator:abe33de70e34300745bad9aa822fd0382"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a80078f0dfa4c225e79d9b460202d5e2c" id="r_a80078f0dfa4c225e79d9b460202d5e2c"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a> = T</td></tr>
-<tr class="separator:a80078f0dfa4c225e79d9b460202d5e2c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a28306efc1a903b80219c8bb16dc5b190" id="r_a28306efc1a903b80219c8bb16dc5b190"><td class="memItemLeft" align="right" valign="top">typedef MMAFrag_t::mat_type&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a28306efc1a903b80219c8bb16dc5b190">mat_type</a></td></tr>
-<tr class="separator:a28306efc1a903b80219c8bb16dc5b190"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a5976565323f2e30479158c14f4b1bfef" id="r_a5976565323f2e30479158c14f4b1bfef"><td class="memItemLeft" align="right" valign="top">typedef MMAFrag_t::frag_type&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a></td></tr>
-<tr class="separator:a5976565323f2e30479158c14f4b1bfef"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6dadcd666afb3759a11094e754560dd4" id="r_a6dadcd666afb3759a11094e754560dd4"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6dadcd666afb3759a11094e754560dd4">MMAFrag_t</a> = MMAFrag_</td></tr>
+<tr class="separator:a6dadcd666afb3759a11094e754560dd4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a824409bc107330805853f932e80a7628" id="r_a824409bc107330805853f932e80a7628"><td class="memItemLeft" align="right" valign="top">using&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> = T</td></tr>
+<tr class="separator:a824409bc107330805853f932e80a7628"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1eeb197c9bdf4db42892a39cdb9bd73a" id="r_a1eeb197c9bdf4db42892a39cdb9bd73a"><td class="memItemLeft" align="right" valign="top">typedef MMAFrag_t::mat_type&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a></td></tr>
+<tr class="separator:a1eeb197c9bdf4db42892a39cdb9bd73a"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aac25cd0a9bdf24aa2af809c95f0bd171" id="r_aac25cd0a9bdf24aa2af809c95f0bd171"><td class="memItemLeft" align="right" valign="top">typedef MMAFrag_t::frag_type&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a></td></tr>
+<tr class="separator:aac25cd0a9bdf24aa2af809c95f0bd171"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
@@ -116,15 +116,53 @@ Public Member Functions</h2></td></tr>
 <tr class="separator:aa3fb310dd08ec23c334511f7b316d1b6"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aa97a98e423827a889c13a92217626ec7" id="r_aa97a98e423827a889c13a92217626ec7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa97a98e423827a889c13a92217626ec7">clear</a> ()</td></tr>
 <tr class="separator:aa97a98e423827a889c13a92217626ec7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1a6b1446e8c8da46885bbaa8e8fdc7e4" id="r_a1a6b1446e8c8da46885bbaa8e8fdc7e4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr thread <a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a> (const short i, const short j)</td></tr>
+<tr class="memitem:a1a6b1446e8c8da46885bbaa8e8fdc7e4" id="r_a1a6b1446e8c8da46885bbaa8e8fdc7e4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a> (const short i, const short j)</td></tr>
 <tr class="separator:a1a6b1446e8c8da46885bbaa8e8fdc7e4"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ad476e1d9a12178fb35c207312339e485" id="r_ad476e1d9a12178fb35c207312339e485"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr const thread <a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad476e1d9a12178fb35c207312339e485">frag_at</a> (const short i, const short j) const</td></tr>
+<tr class="memitem:ad476e1d9a12178fb35c207312339e485" id="r_ad476e1d9a12178fb35c207312339e485"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr const thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad476e1d9a12178fb35c207312339e485">frag_at</a> (const short i, const short j) const</td></tr>
 <tr class="separator:ad476e1d9a12178fb35c207312339e485"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a323a4f38cd0693bf333832bb4258b28e" id="r_a323a4f38cd0693bf333832bb4258b28e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="#a28306efc1a903b80219c8bb16dc5b190">mat_type</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a323a4f38cd0693bf333832bb4258b28e">mat_at</a> (const short i, const short j)</td></tr>
+<tr class="memitem:a323a4f38cd0693bf333832bb4258b28e" id="r_a323a4f38cd0693bf333832bb4258b28e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a323a4f38cd0693bf333832bb4258b28e">mat_at</a> (const short i, const short j)</td></tr>
 <tr class="separator:a323a4f38cd0693bf333832bb4258b28e"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a865ece5ad0b9a56937b6d77a18b5a1dc" id="r_a865ece5ad0b9a56937b6d77a18b5a1dc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC thread <a class="el" href="#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a> ()</td></tr>
+<tr class="memitem:a865ece5ad0b9a56937b6d77a18b5a1dc" id="r_a865ece5ad0b9a56937b6d77a18b5a1dc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a> ()</td></tr>
 <tr class="separator:a865ece5ad0b9a56937b6d77a18b5a1dc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ae21bb7cce701290de84c6015e064d8a1" id="r_ae21bb7cce701290de84c6015e064d8a1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC const thread <a class="el" href="#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae21bb7cce701290de84c6015e064d8a1">elems</a> () const</td></tr>
+<tr class="memitem:ae21bb7cce701290de84c6015e064d8a1" id="r_ae21bb7cce701290de84c6015e064d8a1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC const thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae21bb7cce701290de84c6015e064d8a1">elems</a> () const</td></tr>
+<tr class="separator:ae21bb7cce701290de84c6015e064d8a1"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa0ad5cb750ace934bf230385d8bd9f88" id="r_aa0ad5cb750ace934bf230385d8bd9f88"><td class="memTemplParams" colspan="2">template&lt;typename Op &gt; </td></tr>
+<tr class="memitem:aa0ad5cb750ace934bf230385d8bd9f88"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa0ad5cb750ace934bf230385d8bd9f88">row_reduce</a> (thread T vals[<a class="el" href="#a5b1d1c85a5046108a4e38bdc5a0ea74e">kRowsPerThread</a>]) const</td></tr>
+<tr class="separator:aa0ad5cb750ace934bf230385d8bd9f88"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a3d0d5b9c7962658cc6d5afbbbb2f19e2" id="r_a3d0d5b9c7962658cc6d5afbbbb2f19e2"><td class="memTemplParams" colspan="2">template&lt;typename Op &gt; </td></tr>
+<tr class="memitem:a3d0d5b9c7962658cc6d5afbbbb2f19e2"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a3d0d5b9c7962658cc6d5afbbbb2f19e2">row_bin_op</a> (thread T vals[<a class="el" href="#a5b1d1c85a5046108a4e38bdc5a0ea74e">kRowsPerThread</a>])</td></tr>
+<tr class="separator:a3d0d5b9c7962658cc6d5afbbbb2f19e2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa5426c6beabfb3ee41b58f01b3392a96" id="r_aa5426c6beabfb3ee41b58f01b3392a96"><td class="memTemplParams" colspan="2">template&lt;typename U , int w_x, int w_y, int str_x, int str_y&gt; </td></tr>
+<tr class="memitem:aa5426c6beabfb3ee41b58f01b3392a96"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa5426c6beabfb3ee41b58f01b3392a96">load</a> (const threadgroup U *src)</td></tr>
+<tr class="separator:aa5426c6beabfb3ee41b58f01b3392a96"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a2aadaa3239cb3af0c2ee8af9b88c8a98" id="r_a2aadaa3239cb3af0c2ee8af9b88c8a98"><td class="memTemplParams" colspan="2">template&lt;typename U , int w_x, int w_y, int str_x, int str_y&gt; </td></tr>
+<tr class="memitem:a2aadaa3239cb3af0c2ee8af9b88c8a98"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a2aadaa3239cb3af0c2ee8af9b88c8a98">store</a> (threadgroup U *dst) const</td></tr>
+<tr class="separator:a2aadaa3239cb3af0c2ee8af9b88c8a98"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa9e484d8cae936503898d5b772c573f9" id="r_aa9e484d8cae936503898d5b772c573f9"><td class="memTemplParams" colspan="2">template&lt;typename U , int w_x, int w_y&gt; </td></tr>
+<tr class="memitem:aa9e484d8cae936503898d5b772c573f9"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa9e484d8cae936503898d5b772c573f9">load</a> (const device U *src, const int ld)</td></tr>
+<tr class="separator:aa9e484d8cae936503898d5b772c573f9"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a752f708e4fe5ef37fdd902dae153179f" id="r_a752f708e4fe5ef37fdd902dae153179f"><td class="memTemplParams" colspan="2">template&lt;typename U , int w_x, int w_y&gt; </td></tr>
+<tr class="memitem:a752f708e4fe5ef37fdd902dae153179f"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a752f708e4fe5ef37fdd902dae153179f">store</a> (device U *dst, const int ld) const</td></tr>
+<tr class="separator:a752f708e4fe5ef37fdd902dae153179f"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa3a4af67813908109da08ce7352f82da" id="r_aa3a4af67813908109da08ce7352f82da"><td class="memTemplParams" colspan="2">template&lt;typename U , int w_x, int w_y&gt; </td></tr>
+<tr class="memitem:aa3a4af67813908109da08ce7352f82da"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa3a4af67813908109da08ce7352f82da">load_safe</a> (const device U *src, const int ld, const short2 src_tile_dims)</td></tr>
+<tr class="separator:aa3a4af67813908109da08ce7352f82da"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a57703f522c7409dbe2c0a68bb7acc2ba" id="r_a57703f522c7409dbe2c0a68bb7acc2ba"><td class="memTemplParams" colspan="2">template&lt;typename U , int w_x, int w_y&gt; </td></tr>
+<tr class="memitem:a57703f522c7409dbe2c0a68bb7acc2ba"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#a57703f522c7409dbe2c0a68bb7acc2ba">store_safe</a> (device U *dst, const int ld, const short2 dst_tile_dims) const</td></tr>
+<tr class="separator:a57703f522c7409dbe2c0a68bb7acc2ba"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa3fb310dd08ec23c334511f7b316d1b6" id="r_aa3fb310dd08ec23c334511f7b316d1b6"><td class="memItemLeft" align="right" valign="top">METAL_FUNC&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa3fb310dd08ec23c334511f7b316d1b6">MMATile</a> () thread</td></tr>
+<tr class="separator:aa3fb310dd08ec23c334511f7b316d1b6"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aa97a98e423827a889c13a92217626ec7" id="r_aa97a98e423827a889c13a92217626ec7"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa97a98e423827a889c13a92217626ec7">clear</a> ()</td></tr>
+<tr class="separator:aa97a98e423827a889c13a92217626ec7"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1a6b1446e8c8da46885bbaa8e8fdc7e4" id="r_a1a6b1446e8c8da46885bbaa8e8fdc7e4"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1a6b1446e8c8da46885bbaa8e8fdc7e4">frag_at</a> (const short i, const short j)</td></tr>
+<tr class="separator:a1a6b1446e8c8da46885bbaa8e8fdc7e4"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad476e1d9a12178fb35c207312339e485" id="r_ad476e1d9a12178fb35c207312339e485"><td class="memItemLeft" align="right" valign="top">METAL_FUNC constexpr const thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad476e1d9a12178fb35c207312339e485">frag_at</a> (const short i, const short j) const</td></tr>
+<tr class="separator:ad476e1d9a12178fb35c207312339e485"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a323a4f38cd0693bf333832bb4258b28e" id="r_a323a4f38cd0693bf333832bb4258b28e"><td class="memItemLeft" align="right" valign="top">METAL_FUNC <a class="el" href="#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a323a4f38cd0693bf333832bb4258b28e">mat_at</a> (const short i, const short j)</td></tr>
+<tr class="separator:a323a4f38cd0693bf333832bb4258b28e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a865ece5ad0b9a56937b6d77a18b5a1dc" id="r_a865ece5ad0b9a56937b6d77a18b5a1dc"><td class="memItemLeft" align="right" valign="top">METAL_FUNC thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a865ece5ad0b9a56937b6d77a18b5a1dc">elems</a> ()</td></tr>
+<tr class="separator:a865ece5ad0b9a56937b6d77a18b5a1dc"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae21bb7cce701290de84c6015e064d8a1" id="r_ae21bb7cce701290de84c6015e064d8a1"><td class="memItemLeft" align="right" valign="top">METAL_FUNC const thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> *&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae21bb7cce701290de84c6015e064d8a1">elems</a> () const</td></tr>
 <tr class="separator:ae21bb7cce701290de84c6015e064d8a1"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aa5426c6beabfb3ee41b58f01b3392a96" id="r_aa5426c6beabfb3ee41b58f01b3392a96"><td class="memTemplParams" colspan="2">template&lt;typename U , int w_x, int w_y, int str_x, int str_y&gt; </td></tr>
 <tr class="memitem:aa5426c6beabfb3ee41b58f01b3392a96"><td class="memTemplItemLeft" align="right" valign="top">METAL_FUNC void&#160;</td><td class="memTemplItemRight" valign="bottom"><a class="el" href="#aa5426c6beabfb3ee41b58f01b3392a96">load</a> (const threadgroup U *src)</td></tr>
@@ -165,12 +203,16 @@ Public Attributes</h2></td></tr>
 <tr class="separator:ae326e7693eb77c22d5a6e3e9219019d3"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a98357339ec98f804a1b12597937b318f" id="r_a98357339ec98f804a1b12597937b318f"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a98357339ec98f804a1b12597937b318f">kElemsPerTile</a> = <a class="el" href="#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a> * <a class="el" href="#aef0ea2387e1ff5767bff8563b2d36bd6">kElemsPerFrag</a></td></tr>
 <tr class="separator:a98357339ec98f804a1b12597937b318f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ac4fb73ebc4e7b47a44b8bd6cadda5d44" id="r_ac4fb73ebc4e7b47a44b8bd6cadda5d44"><td class="memItemLeft" align="right" valign="top"><a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ac4fb73ebc4e7b47a44b8bd6cadda5d44">val_frags</a> [<a class="el" href="#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>] = {<a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a>(0)}</td></tr>
-<tr class="separator:ac4fb73ebc4e7b47a44b8bd6cadda5d44"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a5b1d1c85a5046108a4e38bdc5a0ea74e" id="r_a5b1d1c85a5046108a4e38bdc5a0ea74e"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5b1d1c85a5046108a4e38bdc5a0ea74e">kRowsPerThread</a> = <a class="el" href="#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * MMAFrag_t::kElemRows</td></tr>
+<tr class="separator:a5b1d1c85a5046108a4e38bdc5a0ea74e"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a1ea49efd92696b15302ee4b52ecd548c" id="r_a1ea49efd92696b15302ee4b52ecd548c"><td class="memItemLeft" align="right" valign="top"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1ea49efd92696b15302ee4b52ecd548c">kColsPerThread</a> = <a class="el" href="#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> * MMAFrag_t::kElemCols</td></tr>
+<tr class="separator:a1ea49efd92696b15302ee4b52ecd548c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a684e6c6d9f00f583994285b60aaa3b62" id="r_a684e6c6d9f00f583994285b60aaa3b62"><td class="memItemLeft" align="right" valign="top"><a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a684e6c6d9f00f583994285b60aaa3b62">val_frags</a> [<a class="el" href="#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>] = {<a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>(0)}</td></tr>
+<tr class="separator:a684e6c6d9f00f583994285b60aaa3b62"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Typedef Documentation</h2>
-<a id="a80078f0dfa4c225e79d9b460202d5e2c" name="a80078f0dfa4c225e79d9b460202d5e2c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a80078f0dfa4c225e79d9b460202d5e2c">&#9670;&#160;</a></span>elem_type</h2>
+<a id="a824409bc107330805853f932e80a7628" name="a824409bc107330805853f932e80a7628"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a824409bc107330805853f932e80a7628">&#9670;&#160;</a></span>elem_type</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -178,15 +220,15 @@ Public Attributes</h2></td></tr>
 template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">using <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elem_type = T</td>
+          <td class="memname">typedef T <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elem_type = T</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a5976565323f2e30479158c14f4b1bfef" name="a5976565323f2e30479158c14f4b1bfef"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a5976565323f2e30479158c14f4b1bfef">&#9670;&#160;</a></span>frag_type</h2>
+<a id="aac25cd0a9bdf24aa2af809c95f0bd171" name="aac25cd0a9bdf24aa2af809c95f0bd171"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aac25cd0a9bdf24aa2af809c95f0bd171">&#9670;&#160;</a></span>frag_type</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -194,15 +236,15 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">MMAFrag_t::frag_type <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_type</td>
+          <td class="memname">typedef MMAFrag_t::frag_type <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_type</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a28306efc1a903b80219c8bb16dc5b190" name="a28306efc1a903b80219c8bb16dc5b190"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a28306efc1a903b80219c8bb16dc5b190">&#9670;&#160;</a></span>mat_type</h2>
+<a id="a1eeb197c9bdf4db42892a39cdb9bd73a" name="a1eeb197c9bdf4db42892a39cdb9bd73a"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1eeb197c9bdf4db42892a39cdb9bd73a">&#9670;&#160;</a></span>mat_type</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -210,15 +252,15 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">MMAFrag_t::mat_type <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::mat_type</td>
+          <td class="memname">typedef MMAFrag_t::mat_type <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::mat_type</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="abe33de70e34300745bad9aa822fd0382" name="abe33de70e34300745bad9aa822fd0382"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#abe33de70e34300745bad9aa822fd0382">&#9670;&#160;</a></span>MMAFrag_t</h2>
+<a id="a6dadcd666afb3759a11094e754560dd4" name="a6dadcd666afb3759a11094e754560dd4"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6dadcd666afb3759a11094e754560dd4">&#9670;&#160;</a></span>MMAFrag_t</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -226,7 +268,7 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">using <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::MMAFrag_t = MMAFrag_</td>
+          <td class="memname">typedef MMAFrag_ <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::MMAFrag_t = MMAFrag_</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -235,7 +277,34 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </div>
 <h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
 <a id="aa3fb310dd08ec23c334511f7b316d1b6" name="aa3fb310dd08ec23c334511f7b316d1b6"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa3fb310dd08ec23c334511f7b316d1b6">&#9670;&#160;</a></span>MMATile()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa3fb310dd08ec23c334511f7b316d1b6">&#9670;&#160;</a></span>MMATile() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::MMATile </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa3fb310dd08ec23c334511f7b316d1b6" name="aa3fb310dd08ec23c334511f7b316d1b6"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa3fb310dd08ec23c334511f7b316d1b6">&#9670;&#160;</a></span>MMATile() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -263,7 +332,34 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="aa97a98e423827a889c13a92217626ec7" name="aa97a98e423827a889c13a92217626ec7"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa97a98e423827a889c13a92217626ec7">&#9670;&#160;</a></span>clear()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa97a98e423827a889c13a92217626ec7">&#9670;&#160;</a></span>clear() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC constexpr void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::clear </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa97a98e423827a889c13a92217626ec7" name="aa97a98e423827a889c13a92217626ec7"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa97a98e423827a889c13a92217626ec7">&#9670;&#160;</a></span>clear() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -290,7 +386,7 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </div>
 </div>
 <a id="a865ece5ad0b9a56937b6d77a18b5a1dc" name="a865ece5ad0b9a56937b6d77a18b5a1dc"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a865ece5ad0b9a56937b6d77a18b5a1dc">&#9670;&#160;</a></span>elems() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a865ece5ad0b9a56937b6d77a18b5a1dc">&#9670;&#160;</a></span>elems() <span class="overload">[1/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -301,7 +397,34 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC thread <a class="el" href="#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a> * <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elems </td>
+          <td class="memname">METAL_FUNC thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> * <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elems </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a865ece5ad0b9a56937b6d77a18b5a1dc" name="a865ece5ad0b9a56937b6d77a18b5a1dc"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a865ece5ad0b9a56937b6d77a18b5a1dc">&#9670;&#160;</a></span>elems() <span class="overload">[2/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> * <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elems </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td></td>
@@ -317,7 +440,7 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </div>
 </div>
 <a id="ae21bb7cce701290de84c6015e064d8a1" name="ae21bb7cce701290de84c6015e064d8a1"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ae21bb7cce701290de84c6015e064d8a1">&#9670;&#160;</a></span>elems() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#ae21bb7cce701290de84c6015e064d8a1">&#9670;&#160;</a></span>elems() <span class="overload">[3/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -328,7 +451,34 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC const thread <a class="el" href="#a80078f0dfa4c225e79d9b460202d5e2c">elem_type</a> * <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elems </td>
+          <td class="memname">METAL_FUNC const thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> * <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elems </td>
+          <td>(</td>
+          <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="ae21bb7cce701290de84c6015e064d8a1" name="ae21bb7cce701290de84c6015e064d8a1"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae21bb7cce701290de84c6015e064d8a1">&#9670;&#160;</a></span>elems() <span class="overload">[4/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC const thread <a class="el" href="#a824409bc107330805853f932e80a7628">elem_type</a> * <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::elems </td>
           <td>(</td>
           <td class="paramname"><span class="paramname"><em></em></span></td><td>)</td>
           <td> const</td>
@@ -344,7 +494,7 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </div>
 </div>
 <a id="a1a6b1446e8c8da46885bbaa8e8fdc7e4" name="a1a6b1446e8c8da46885bbaa8e8fdc7e4"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1a6b1446e8c8da46885bbaa8e8fdc7e4">&#9670;&#160;</a></span>frag_at() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a1a6b1446e8c8da46885bbaa8e8fdc7e4">&#9670;&#160;</a></span>frag_at() <span class="overload">[1/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -355,7 +505,38 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC constexpr thread <a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a> &amp; <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_at </td>
+          <td class="memname">METAL_FUNC constexpr thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp; <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_at </td>
+          <td>(</td>
+          <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>i</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>j</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a1a6b1446e8c8da46885bbaa8e8fdc7e4" name="a1a6b1446e8c8da46885bbaa8e8fdc7e4"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1a6b1446e8c8da46885bbaa8e8fdc7e4">&#9670;&#160;</a></span>frag_at() <span class="overload">[2/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC constexpr thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp; <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_at </td>
           <td>(</td>
           <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>i</em></span>, </td>
         </tr>
@@ -375,7 +556,7 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </div>
 </div>
 <a id="ad476e1d9a12178fb35c207312339e485" name="ad476e1d9a12178fb35c207312339e485"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ad476e1d9a12178fb35c207312339e485">&#9670;&#160;</a></span>frag_at() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#ad476e1d9a12178fb35c207312339e485">&#9670;&#160;</a></span>frag_at() <span class="overload">[3/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -386,7 +567,38 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC constexpr const thread <a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a> &amp; <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_at </td>
+          <td class="memname">METAL_FUNC constexpr const thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp; <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_at </td>
+          <td>(</td>
+          <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>i</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>j</em></span>&#160;) const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">constexpr</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="ad476e1d9a12178fb35c207312339e485" name="ad476e1d9a12178fb35c207312339e485"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ad476e1d9a12178fb35c207312339e485">&#9670;&#160;</a></span>frag_at() <span class="overload">[4/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC constexpr const thread <a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> &amp; <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::frag_at </td>
           <td>(</td>
           <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>i</em></span>, </td>
         </tr>
@@ -406,7 +618,40 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </div>
 </div>
 <a id="aa9e484d8cae936503898d5b772c573f9" name="aa9e484d8cae936503898d5b772c573f9"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa9e484d8cae936503898d5b772c573f9">&#9670;&#160;</a></span>load() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa9e484d8cae936503898d5b772c573f9">&#9670;&#160;</a></span>load() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename U , int w_x, int w_y&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::load </td>
+          <td>(</td>
+          <td class="paramtype">const device U *</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ld</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa9e484d8cae936503898d5b772c573f9" name="aa9e484d8cae936503898d5b772c573f9"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa9e484d8cae936503898d5b772c573f9">&#9670;&#160;</a></span>load() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -439,7 +684,36 @@ template&lt;typename U , int w_x, int w_y&gt; </div>
 </div>
 </div>
 <a id="aa5426c6beabfb3ee41b58f01b3392a96" name="aa5426c6beabfb3ee41b58f01b3392a96"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa5426c6beabfb3ee41b58f01b3392a96">&#9670;&#160;</a></span>load() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa5426c6beabfb3ee41b58f01b3392a96">&#9670;&#160;</a></span>load() <span class="overload">[3/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename U , int w_x, int w_y, int str_x, int str_y&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::load </td>
+          <td>(</td>
+          <td class="paramtype">const threadgroup U *</td>          <td class="paramname"><span class="paramname"><em>src</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa5426c6beabfb3ee41b58f01b3392a96" name="aa5426c6beabfb3ee41b58f01b3392a96"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa5426c6beabfb3ee41b58f01b3392a96">&#9670;&#160;</a></span>load() <span class="overload">[4/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -468,7 +742,45 @@ template&lt;typename U , int w_x, int w_y, int str_x, int str_y&gt; </div>
 </div>
 </div>
 <a id="aa3a4af67813908109da08ce7352f82da" name="aa3a4af67813908109da08ce7352f82da"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aa3a4af67813908109da08ce7352f82da">&#9670;&#160;</a></span>load_safe()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aa3a4af67813908109da08ce7352f82da">&#9670;&#160;</a></span>load_safe() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename U , int w_x, int w_y&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::load_safe </td>
+          <td>(</td>
+          <td class="paramtype">const device U *</td>          <td class="paramname"><span class="paramname"><em>src</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ld</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const short2</td>          <td class="paramname"><span class="paramname"><em>src_tile_dims</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa3a4af67813908109da08ce7352f82da" name="aa3a4af67813908109da08ce7352f82da"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa3a4af67813908109da08ce7352f82da">&#9670;&#160;</a></span>load_safe() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -506,7 +818,7 @@ template&lt;typename U , int w_x, int w_y&gt; </div>
 </div>
 </div>
 <a id="a323a4f38cd0693bf333832bb4258b28e" name="a323a4f38cd0693bf333832bb4258b28e"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a323a4f38cd0693bf333832bb4258b28e">&#9670;&#160;</a></span>mat_at()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a323a4f38cd0693bf333832bb4258b28e">&#9670;&#160;</a></span>mat_at() <span class="overload">[1/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -517,7 +829,7 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
   <td class="mlabels-left">
       <table class="memname">
         <tr>
-          <td class="memname">METAL_FUNC <a class="el" href="#a28306efc1a903b80219c8bb16dc5b190">mat_type</a> <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::mat_at </td>
+          <td class="memname">METAL_FUNC <a class="el" href="#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a> <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::mat_at </td>
           <td>(</td>
           <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>i</em></span>, </td>
         </tr>
@@ -534,10 +846,132 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a323a4f38cd0693bf333832bb4258b28e" name="a323a4f38cd0693bf333832bb4258b28e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a323a4f38cd0693bf333832bb4258b28e">&#9670;&#160;</a></span>mat_at() <span class="overload">[2/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC <a class="el" href="#a1eeb197c9bdf4db42892a39cdb9bd73a">mat_type</a> <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::mat_at </td>
+          <td>(</td>
+          <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>i</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const short</td>          <td class="paramname"><span class="paramname"><em>j</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a3d0d5b9c7962658cc6d5afbbbb2f19e2" name="a3d0d5b9c7962658cc6d5afbbbb2f19e2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a3d0d5b9c7962658cc6d5afbbbb2f19e2">&#9670;&#160;</a></span>row_bin_op()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename Op &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::row_bin_op </td>
+          <td>(</td>
+          <td class="paramtype">thread T</td>          <td class="paramname"><span class="paramname"><em>vals</em></span>[kRowsPerThread]</td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aa0ad5cb750ace934bf230385d8bd9f88" name="aa0ad5cb750ace934bf230385d8bd9f88"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aa0ad5cb750ace934bf230385d8bd9f88">&#9670;&#160;</a></span>row_reduce()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename Op &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::row_reduce </td>
+          <td>(</td>
+          <td class="paramtype">thread T</td>          <td class="paramname"><span class="paramname"><em>vals</em></span>[kRowsPerThread]</td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="a752f708e4fe5ef37fdd902dae153179f" name="a752f708e4fe5ef37fdd902dae153179f"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a752f708e4fe5ef37fdd902dae153179f">&#9670;&#160;</a></span>store() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a752f708e4fe5ef37fdd902dae153179f">&#9670;&#160;</a></span>store() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename U , int w_x, int w_y&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::store </td>
+          <td>(</td>
+          <td class="paramtype">device U *</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ld</em></span>&#160;) const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a752f708e4fe5ef37fdd902dae153179f" name="a752f708e4fe5ef37fdd902dae153179f"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a752f708e4fe5ef37fdd902dae153179f">&#9670;&#160;</a></span>store() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -570,7 +1004,36 @@ template&lt;typename U , int w_x, int w_y&gt; </div>
 </div>
 </div>
 <a id="a2aadaa3239cb3af0c2ee8af9b88c8a98" name="a2aadaa3239cb3af0c2ee8af9b88c8a98"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a2aadaa3239cb3af0c2ee8af9b88c8a98">&#9670;&#160;</a></span>store() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a2aadaa3239cb3af0c2ee8af9b88c8a98">&#9670;&#160;</a></span>store() <span class="overload">[3/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename U , int w_x, int w_y, int str_x, int str_y&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::store </td>
+          <td>(</td>
+          <td class="paramtype">threadgroup U *</td>          <td class="paramname"><span class="paramname"><em>dst</em></span></td><td>)</td>
+          <td> const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a2aadaa3239cb3af0c2ee8af9b88c8a98" name="a2aadaa3239cb3af0c2ee8af9b88c8a98"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a2aadaa3239cb3af0c2ee8af9b88c8a98">&#9670;&#160;</a></span>store() <span class="overload">[4/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -599,7 +1062,45 @@ template&lt;typename U , int w_x, int w_y, int str_x, int str_y&gt; </div>
 </div>
 </div>
 <a id="a57703f522c7409dbe2c0a68bb7acc2ba" name="a57703f522c7409dbe2c0a68bb7acc2ba"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a57703f522c7409dbe2c0a68bb7acc2ba">&#9670;&#160;</a></span>store_safe()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a57703f522c7409dbe2c0a68bb7acc2ba">&#9670;&#160;</a></span>store_safe() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+<div class="memtemplate">
+template&lt;typename U , int w_x, int w_y&gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC void <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::store_safe </td>
+          <td>(</td>
+          <td class="paramtype">device U *</td>          <td class="paramname"><span class="paramname"><em>dst</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const int</td>          <td class="paramname"><span class="paramname"><em>ld</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const short2</td>          <td class="paramname"><span class="paramname"><em>dst_tile_dims</em></span>&#160;) const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a57703f522c7409dbe2c0a68bb7acc2ba" name="a57703f522c7409dbe2c0a68bb7acc2ba"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a57703f522c7409dbe2c0a68bb7acc2ba">&#9670;&#160;</a></span>store_safe() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -651,6 +1152,22 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a1ea49efd92696b15302ee4b52ecd548c" name="a1ea49efd92696b15302ee4b52ecd548c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a1ea49efd92696b15302ee4b52ecd548c">&#9670;&#160;</a></span>kColsPerThread</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::kColsPerThread = <a class="el" href="#a46324d40f8ad61cade08a1ebad6d9ad4">kTileCols</a> * MMAFrag_t::kElemCols</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="aef0ea2387e1ff5767bff8563b2d36bd6" name="aef0ea2387e1ff5767bff8563b2d36bd6"></a>
@@ -747,6 +1264,22 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
       </table>
 </div><div class="memdoc">
 
+</div>
+</div>
+<a id="a5b1d1c85a5046108a4e38bdc5a0ea74e" name="a5b1d1c85a5046108a4e38bdc5a0ea74e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a5b1d1c85a5046108a4e38bdc5a0ea74e">&#9670;&#160;</a></span>kRowsPerThread</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="steel_2defines_8h.html#a90b91c866313ffa46eff6d9cc944ad2b">STEEL_CONST</a> int <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::kRowsPerThread = <a class="el" href="#a1d126b14910385ab644e224ac1d0307a">kTileRows</a> * MMAFrag_t::kElemRows</td>
+        </tr>
+      </table>
+</div><div class="memdoc">
+
 </div>
 </div>
 <a id="a46324d40f8ad61cade08a1ebad6d9ad4" name="a46324d40f8ad61cade08a1ebad6d9ad4"></a>
@@ -781,8 +1314,8 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 
 </div>
 </div>
-<a id="ac4fb73ebc4e7b47a44b8bd6cadda5d44" name="ac4fb73ebc4e7b47a44b8bd6cadda5d44"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ac4fb73ebc4e7b47a44b8bd6cadda5d44">&#9670;&#160;</a></span>val_frags</h2>
+<a id="a684e6c6d9f00f583994285b60aaa3b62" name="a684e6c6d9f00f583994285b60aaa3b62"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a684e6c6d9f00f583994285b60aaa3b62">&#9670;&#160;</a></span>val_frags</h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -790,15 +1323,16 @@ template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseM
 template&lt;typename T , int kTileRows_, int kTileCols_, class MMAFrag_  = BaseMMAFrag&lt;T, 8, 8&gt;&gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname"><a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a> <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::val_frags[<a class="el" href="#ae326e7693eb77c22d5a6e3e9219019d3">kNumFrags</a>] = {<a class="el" href="#a5976565323f2e30479158c14f4b1bfef">frag_type</a>(0)}</td>
+          <td class="memname"><a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a> <a class="el" href="structmlx_1_1steel_1_1_m_m_a_tile.html">mlx::steel::MMATile</a>&lt; T, kTileRows_, kTileCols_, MMAFrag_ &gt;::val_frags = {<a class="el" href="#aac25cd0a9bdf24aa2af809c95f0bd171">frag_type</a>(0)}</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="mma_8h_source.html">mma.h</a></li>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2mma_8h_source.html">mma.h</a></li>
+<li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="gemm_2mma_8h_source.html">mma.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_shape2_d-members.html b/docs/build/html/structmlx_1_1steel_1_1_shape2_d-members.html
new file mode 100644
index 000000000..b6453c677
--- /dev/null
+++ b/docs/build/html/structmlx_1_1steel_1_1_shape2_d-members.html
@@ -0,0 +1,107 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=11"/>
+<meta name="generator" content="Doxygen 1.12.0"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<title>MLX: Member List</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<script type="text/javascript" src="clipboard.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="cookie.js"></script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr id="projectrow">
+  <td id="projectalign">
+   <div id="projectname">MLX
+   </div>
+  </td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.12.0 -->
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+var searchBox = new SearchBox("searchBox", "search/",'.html');
+/* @license-end */
+</script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() { codefold.init(0); });
+/* @license-end */
+</script>
+<script type="text/javascript" src="menudata.js"></script>
+<script type="text/javascript" src="menu.js"></script>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function() {
+  initMenu('',true,false,'search.php','Search',false);
+  $(function() { init_search(); });
+});
+/* @license-end */
+</script>
+<div id="main-nav"></div>
+<script type="text/javascript">
+/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
+$(function(){ initResizable(false); });
+/* @license-end */
+</script>
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<div id="MSearchResults">
+<div class="SRPage">
+<div id="SRIndex">
+<div id="SRResults"></div>
+<div class="SRStatus" id="Loading">Loading...</div>
+<div class="SRStatus" id="Searching">Searching...</div>
+<div class="SRStatus" id="NoMatches">No Matches</div>
+</div>
+</div>
+</div>
+</div>
+
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">Shape2D</a></li>  </ul>
+</div>
+</div><!-- top -->
+<div id="doc-content">
+<div class="header">
+  <div class="headertitle"><div class="title">mlx::steel::Shape2D&lt; RInt, CInt &gt; Member List</div></div>
+</div><!--header-->
+<div class="contents">
+
+<p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a>, including all inherited members.</p>
+<table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#ae51347b2131647f2ed735ed43840d26e">c</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></td><td class="entry"></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#a6e9e8d56782fc8772bc432c7f58393fe">r</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></td><td class="entry"></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html#a070ce70eb6d84361c7f313159c438a5c">Shape2D</a>(RInt r_, CInt c_)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D&lt; RInt, CInt &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+</table></div><!-- contents -->
+<!-- start footer part -->
+<hr class="footer"/><address class="footer"><small>
+Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
+</small></address>
+</div><!-- doc-content -->
+</body>
+</html>
diff --git a/docs/build/html/struct_m_l_x_scaled_dot_product_attention_params.html b/docs/build/html/structmlx_1_1steel_1_1_shape2_d.html
similarity index 50%
rename from docs/build/html/struct_m_l_x_scaled_dot_product_attention_params.html
rename to docs/build/html/structmlx_1_1steel_1_1_shape2_d.html
index 28aee6579..bd57f4f7f 100644
--- a/docs/build/html/struct_m_l_x_scaled_dot_product_attention_params.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_shape2_d.html
@@ -5,7 +5,7 @@
 <meta http-equiv="X-UA-Compatible" content="IE=11"/>
 <meta name="generator" content="Doxygen 1.12.0"/>
 <meta name="viewport" content="width=device-width, initial-scale=1"/>
-<title>MLX: MLXScaledDotProductAttentionParams Struct Reference</title>
+<title>MLX: mlx::steel::Shape2D&lt; RInt, CInt &gt; Struct Template Reference</title>
 <link href="tabs.css" rel="stylesheet" type="text/css"/>
 <script type="text/javascript" src="jquery.js"></script>
 <script type="text/javascript" src="dynsections.js"></script>
@@ -81,96 +81,94 @@ $(function(){ initResizable(false); });
 </div>
 </div>
 
+<div id="nav-path" class="navpath">
+  <ul>
+<li class="navelem"><a class="el" href="namespacemlx.html">mlx</a></li><li class="navelem"><a class="el" href="namespacemlx_1_1steel.html">steel</a></li><li class="navelem"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">Shape2D</a></li>  </ul>
+</div>
 </div><!-- top -->
 <div id="doc-content">
 <div class="header">
   <div class="summary">
+<a href="#pub-methods">Public Member Functions</a> &#124;
 <a href="#pub-attribs">Public Attributes</a> &#124;
-<a href="struct_m_l_x_scaled_dot_product_attention_params-members.html">List of all members</a>  </div>
-  <div class="headertitle"><div class="title">MLXScaledDotProductAttentionParams Struct Reference</div></div>
+<a href="structmlx_1_1steel_1_1_shape2_d-members.html">List of all members</a>  </div>
+  <div class="headertitle"><div class="title">mlx::steel::Shape2D&lt; RInt, CInt &gt; Struct Template Reference</div></div>
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="scaled__dot__product__attention__params_8h_source.html">scaled_dot_product_attention_params.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="attn_2mma_8h_source.html">mma.h</a>&gt;</code></p>
 <table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
+Public Member Functions</h2></td></tr>
+<tr class="memitem:a070ce70eb6d84361c7f313159c438a5c" id="r_a070ce70eb6d84361c7f313159c438a5c"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a070ce70eb6d84361c7f313159c438a5c">Shape2D</a> (RInt r_, CInt c_)</td></tr>
+<tr class="separator:a070ce70eb6d84361c7f313159c438a5c"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
 Public Attributes</h2></td></tr>
-<tr class="memitem:a46cc2da6a069d822f36983ee18467e5c" id="r_a46cc2da6a069d822f36983ee18467e5c"><td class="memItemLeft" align="right" valign="top">const uint&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a46cc2da6a069d822f36983ee18467e5c">QUERY_SEQUENCE_LENGTH</a> = 1</td></tr>
-<tr class="separator:a46cc2da6a069d822f36983ee18467e5c"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a1a63d2e7ad712b4ba26219c784c95177" id="r_a1a63d2e7ad712b4ba26219c784c95177"><td class="memItemLeft" align="right" valign="top">const uint&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1a63d2e7ad712b4ba26219c784c95177">N_Q_HEADS</a> = 32</td></tr>
-<tr class="separator:a1a63d2e7ad712b4ba26219c784c95177"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a68a292b9986c20560aca88394f82e9f7" id="r_a68a292b9986c20560aca88394f82e9f7"><td class="memItemLeft" align="right" valign="top">const uint&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a68a292b9986c20560aca88394f82e9f7">N_KV_HEADS</a> = 32</td></tr>
-<tr class="separator:a68a292b9986c20560aca88394f82e9f7"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a58ef2765fd681e6b35b2ba72030610e0" id="r_a58ef2765fd681e6b35b2ba72030610e0"><td class="memItemLeft" align="right" valign="top">const uint&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a58ef2765fd681e6b35b2ba72030610e0">KV_TILES</a> = 1</td></tr>
-<tr class="separator:a58ef2765fd681e6b35b2ba72030610e0"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7461e0e17cdc7d3fed80bb00d58d8644" id="r_a7461e0e17cdc7d3fed80bb00d58d8644"><td class="memItemLeft" align="right" valign="top">const float&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7461e0e17cdc7d3fed80bb00d58d8644">INV_ALPHA</a> = 0.08838834764831843<a class="el" href="types_2bf16_8h.html#af900396d7b72ff2a7002e8befe8cf8f1">f</a></td></tr>
-<tr class="separator:a7461e0e17cdc7d3fed80bb00d58d8644"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a6e9e8d56782fc8772bc432c7f58393fe" id="r_a6e9e8d56782fc8772bc432c7f58393fe"><td class="memItemLeft" align="right" valign="top">RInt&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6e9e8d56782fc8772bc432c7f58393fe">r</a></td></tr>
+<tr class="separator:a6e9e8d56782fc8772bc432c7f58393fe"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae51347b2131647f2ed735ed43840d26e" id="r_ae51347b2131647f2ed735ed43840d26e"><td class="memItemLeft" align="right" valign="top">CInt&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae51347b2131647f2ed735ed43840d26e">c</a></td></tr>
+<tr class="separator:ae51347b2131647f2ed735ed43840d26e"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
+<h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
+<a id="a070ce70eb6d84361c7f313159c438a5c" name="a070ce70eb6d84361c7f313159c438a5c"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a070ce70eb6d84361c7f313159c438a5c">&#9670;&#160;</a></span>Shape2D()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename RInt , typename CInt &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D</a>&lt; RInt, CInt &gt;::Shape2D </td>
+          <td>(</td>
+          <td class="paramtype">RInt</td>          <td class="paramname"><span class="paramname"><em>r_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">CInt</td>          <td class="paramname"><span class="paramname"><em>c_</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
 <h2 class="groupheader">Member Data Documentation</h2>
-<a id="a7461e0e17cdc7d3fed80bb00d58d8644" name="a7461e0e17cdc7d3fed80bb00d58d8644"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7461e0e17cdc7d3fed80bb00d58d8644">&#9670;&#160;</a></span>INV_ALPHA</h2>
+<a id="ae51347b2131647f2ed735ed43840d26e" name="ae51347b2131647f2ed735ed43840d26e"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae51347b2131647f2ed735ed43840d26e">&#9670;&#160;</a></span>c</h2>
 
 <div class="memitem">
 <div class="memproto">
+<div class="memtemplate">
+template&lt;typename RInt , typename CInt &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">const float MLXScaledDotProductAttentionParams::INV_ALPHA = 0.08838834764831843<a class="el" href="types_2bf16_8h.html#af900396d7b72ff2a7002e8befe8cf8f1">f</a></td>
+          <td class="memname">CInt <a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D</a>&lt; RInt, CInt &gt;::c</td>
         </tr>
       </table>
 </div><div class="memdoc">
 
 </div>
 </div>
-<a id="a58ef2765fd681e6b35b2ba72030610e0" name="a58ef2765fd681e6b35b2ba72030610e0"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a58ef2765fd681e6b35b2ba72030610e0">&#9670;&#160;</a></span>KV_TILES</h2>
+<a id="a6e9e8d56782fc8772bc432c7f58393fe" name="a6e9e8d56782fc8772bc432c7f58393fe"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a6e9e8d56782fc8772bc432c7f58393fe">&#9670;&#160;</a></span>r</h2>
 
 <div class="memitem">
 <div class="memproto">
+<div class="memtemplate">
+template&lt;typename RInt , typename CInt &gt; </div>
       <table class="memname">
         <tr>
-          <td class="memname">const uint MLXScaledDotProductAttentionParams::KV_TILES = 1</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a68a292b9986c20560aca88394f82e9f7" name="a68a292b9986c20560aca88394f82e9f7"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a68a292b9986c20560aca88394f82e9f7">&#9670;&#160;</a></span>N_KV_HEADS</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const uint MLXScaledDotProductAttentionParams::N_KV_HEADS = 32</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a1a63d2e7ad712b4ba26219c784c95177" name="a1a63d2e7ad712b4ba26219c784c95177"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a1a63d2e7ad712b4ba26219c784c95177">&#9670;&#160;</a></span>N_Q_HEADS</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const uint MLXScaledDotProductAttentionParams::N_Q_HEADS = 32</td>
-        </tr>
-      </table>
-</div><div class="memdoc">
-
-</div>
-</div>
-<a id="a46cc2da6a069d822f36983ee18467e5c" name="a46cc2da6a069d822f36983ee18467e5c"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a46cc2da6a069d822f36983ee18467e5c">&#9670;&#160;</a></span>QUERY_SEQUENCE_LENGTH</h2>
-
-<div class="memitem">
-<div class="memproto">
-      <table class="memname">
-        <tr>
-          <td class="memname">const uint MLXScaledDotProductAttentionParams::QUERY_SEQUENCE_LENGTH = 1</td>
+          <td class="memname">RInt <a class="el" href="structmlx_1_1steel_1_1_shape2_d.html">mlx::steel::Shape2D</a>&lt; RInt, CInt &gt;::r</td>
         </tr>
       </table>
 </div><div class="memdoc">
@@ -178,7 +176,7 @@ Public Attributes</h2></td></tr>
 </div>
 </div>
 <hr/>The documentation for this struct was generated from the following file:<ul>
-<li>mlx/backend/metal/kernels/<a class="el" href="scaled__dot__product__attention__params_8h_source.html">scaled_dot_product_attention_params.h</a></li>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="attn_2mma_8h_source.html">mma.h</a></li>
 </ul>
 </div><!-- contents -->
 <!-- start footer part -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_transform_add-members.html b/docs/build/html/structmlx_1_1steel_1_1_transform_add-members.html
index 987d08de5..e49712a61 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_transform_add-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_transform_add-members.html
@@ -94,9 +94,12 @@ $(function(){ initResizable(false); });
 
 <p>This is the complete list of members for <a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a>, including all inherited members.</p>
 <table class="directory">
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">apply</a>(InT x)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19">apply</a>(InT x, OutT c)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html#afbb688d84443fd622b4dd2768cfe0acf">apply</a>(InT x)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html#a4923b0059d88099b2739f2cf0273ea19">apply</a>(InT x, OutT c)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae">TransformAdd</a>(const float, const float)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html#a7c1b7292910b74281e5296b3dac157ae">TransformAdd</a>(const float, const float)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_transform_add.html b/docs/build/html/structmlx_1_1steel_1_1_transform_add.html
index 245113f1a..e4189cfd8 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_transform_add.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_transform_add.html
@@ -96,12 +96,14 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
 <tr class="memitem:a7c1b7292910b74281e5296b3dac157ae" id="r_a7c1b7292910b74281e5296b3dac157ae"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7c1b7292910b74281e5296b3dac157ae">TransformAdd</a> (const float, const float)</td></tr>
 <tr class="separator:a7c1b7292910b74281e5296b3dac157ae"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a7c1b7292910b74281e5296b3dac157ae" id="r_a7c1b7292910b74281e5296b3dac157ae"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7c1b7292910b74281e5296b3dac157ae">TransformAdd</a> (const float, const float)</td></tr>
+<tr class="separator:a7c1b7292910b74281e5296b3dac157ae"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
@@ -109,10 +111,45 @@ Static Public Member Functions</h2></td></tr>
 <tr class="separator:afbb688d84443fd622b4dd2768cfe0acf"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a4923b0059d88099b2739f2cf0273ea19" id="r_a4923b0059d88099b2739f2cf0273ea19"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4923b0059d88099b2739f2cf0273ea19">apply</a> (InT x, OutT c)</td></tr>
 <tr class="separator:a4923b0059d88099b2739f2cf0273ea19"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:afbb688d84443fd622b4dd2768cfe0acf" id="r_afbb688d84443fd622b4dd2768cfe0acf"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afbb688d84443fd622b4dd2768cfe0acf">apply</a> (InT x)</td></tr>
+<tr class="separator:afbb688d84443fd622b4dd2768cfe0acf"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a4923b0059d88099b2739f2cf0273ea19" id="r_a4923b0059d88099b2739f2cf0273ea19"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4923b0059d88099b2739f2cf0273ea19">apply</a> (InT x, OutT c)</td></tr>
+<tr class="separator:a4923b0059d88099b2739f2cf0273ea19"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
 <a id="a7c1b7292910b74281e5296b3dac157ae" name="a7c1b7292910b74281e5296b3dac157ae"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a7c1b7292910b74281e5296b3dac157ae">&#9670;&#160;</a></span>TransformAdd()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a7c1b7292910b74281e5296b3dac157ae">&#9670;&#160;</a></span>TransformAdd() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd</a>&lt; OutT, InT &gt;::TransformAdd </td>
+          <td>(</td>
+          <td class="paramtype">const float</td>          <td class="paramname"><span class="paramname"><em></em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const float</td>          <td class="paramname"><span class="paramname"><em></em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a7c1b7292910b74281e5296b3dac157ae" name="a7c1b7292910b74281e5296b3dac157ae"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a7c1b7292910b74281e5296b3dac157ae">&#9670;&#160;</a></span>TransformAdd() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -144,7 +181,34 @@ template&lt;typename OutT , typename InT &gt; </div>
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="afbb688d84443fd622b4dd2768cfe0acf" name="afbb688d84443fd622b4dd2768cfe0acf"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#afbb688d84443fd622b4dd2768cfe0acf">&#9670;&#160;</a></span>apply() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#afbb688d84443fd622b4dd2768cfe0acf">&#9670;&#160;</a></span>apply() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC OutT <a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd</a>&lt; OutT, InT &gt;::apply </td>
+          <td>(</td>
+          <td class="paramtype">InT</td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="afbb688d84443fd622b4dd2768cfe0acf" name="afbb688d84443fd622b4dd2768cfe0acf"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#afbb688d84443fd622b4dd2768cfe0acf">&#9670;&#160;</a></span>apply() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -171,7 +235,7 @@ template&lt;typename OutT , typename InT &gt; </div>
 </div>
 </div>
 <a id="a4923b0059d88099b2739f2cf0273ea19" name="a4923b0059d88099b2739f2cf0273ea19"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a4923b0059d88099b2739f2cf0273ea19">&#9670;&#160;</a></span>apply() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a4923b0059d88099b2739f2cf0273ea19">&#9670;&#160;</a></span>apply() <span class="overload">[3/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -201,7 +265,39 @@ template&lt;typename OutT , typename InT &gt; </div>
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<a id="a4923b0059d88099b2739f2cf0273ea19" name="a4923b0059d88099b2739f2cf0273ea19"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4923b0059d88099b2739f2cf0273ea19">&#9670;&#160;</a></span>apply() <span class="overload">[4/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC OutT <a class="el" href="structmlx_1_1steel_1_1_transform_add.html">mlx::steel::TransformAdd</a>&lt; OutT, InT &gt;::apply </td>
+          <td>(</td>
+          <td class="paramtype">InT</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">OutT</td>          <td class="paramname"><span class="paramname"><em>c</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_transform_axpby-members.html b/docs/build/html/structmlx_1_1steel_1_1_transform_axpby-members.html
index 6e6bb9390..e83e98a9c 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_transform_axpby-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_transform_axpby-members.html
@@ -97,8 +97,11 @@ $(function(){ initResizable(false); });
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ab3223b49c6b3b7f89eba91aeaff9dcff">alpha</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">apply</a>(InT x)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba">apply</a>(InT x, OutT c) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a14ad48b0189d6bdde06c66f1b567ae87">apply</a>(InT x)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#aaf3a45e25d7abf7a34b48cc612e631ba">apply</a>(InT x, OutT c) const</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#a5fc726f085bafd1acbc391886f7fb8b6">beta</a></td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">TransformAxpby</a>(const float alpha_, const float beta_)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html#ad7d11c53de13646b725921391d15bbe9">TransformAxpby</a>(const float alpha_, const float beta_)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_transform_axpby.html b/docs/build/html/structmlx_1_1steel_1_1_transform_axpby.html
index e35aec244..240382e1a 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_transform_axpby.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_transform_axpby.html
@@ -97,7 +97,7 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-methods" name="pub-methods"></a>
 Public Member Functions</h2></td></tr>
@@ -105,11 +105,17 @@ Public Member Functions</h2></td></tr>
 <tr class="separator:ad7d11c53de13646b725921391d15bbe9"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:aaf3a45e25d7abf7a34b48cc612e631ba" id="r_aaf3a45e25d7abf7a34b48cc612e631ba"><td class="memItemLeft" align="right" valign="top">METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaf3a45e25d7abf7a34b48cc612e631ba">apply</a> (InT x, OutT c) const</td></tr>
 <tr class="separator:aaf3a45e25d7abf7a34b48cc612e631ba"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ad7d11c53de13646b725921391d15bbe9" id="r_ad7d11c53de13646b725921391d15bbe9"><td class="memItemLeft" align="right" valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad7d11c53de13646b725921391d15bbe9">TransformAxpby</a> (const float alpha_, const float beta_)</td></tr>
+<tr class="separator:ad7d11c53de13646b725921391d15bbe9"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aaf3a45e25d7abf7a34b48cc612e631ba" id="r_aaf3a45e25d7abf7a34b48cc612e631ba"><td class="memItemLeft" align="right" valign="top">METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaf3a45e25d7abf7a34b48cc612e631ba">apply</a> (InT x, OutT c) const</td></tr>
+<tr class="separator:aaf3a45e25d7abf7a34b48cc612e631ba"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
 <tr class="memitem:a14ad48b0189d6bdde06c66f1b567ae87" id="r_a14ad48b0189d6bdde06c66f1b567ae87"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a14ad48b0189d6bdde06c66f1b567ae87">apply</a> (InT x)</td></tr>
 <tr class="separator:a14ad48b0189d6bdde06c66f1b567ae87"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a14ad48b0189d6bdde06c66f1b567ae87" id="r_a14ad48b0189d6bdde06c66f1b567ae87"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a14ad48b0189d6bdde06c66f1b567ae87">apply</a> (InT x)</td></tr>
+<tr class="separator:a14ad48b0189d6bdde06c66f1b567ae87"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-attribs" name="pub-attribs"></a>
 Public Attributes</h2></td></tr>
@@ -120,7 +126,38 @@ Public Attributes</h2></td></tr>
 </table>
 <h2 class="groupheader">Constructor &amp; Destructor Documentation</h2>
 <a id="ad7d11c53de13646b725921391d15bbe9" name="ad7d11c53de13646b725921391d15bbe9"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ad7d11c53de13646b725921391d15bbe9">&#9670;&#160;</a></span>TransformAxpby()</h2>
+<h2 class="memtitle"><span class="permalink"><a href="#ad7d11c53de13646b725921391d15bbe9">&#9670;&#160;</a></span>TransformAxpby() <span class="overload">[1/2]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname"><a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby</a>&lt; OutT, InT &gt;::TransformAxpby </td>
+          <td>(</td>
+          <td class="paramtype">const float</td>          <td class="paramname"><span class="paramname"><em>alpha_</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">const float</td>          <td class="paramname"><span class="paramname"><em>beta_</em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="ad7d11c53de13646b725921391d15bbe9" name="ad7d11c53de13646b725921391d15bbe9"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ad7d11c53de13646b725921391d15bbe9">&#9670;&#160;</a></span>TransformAxpby() <span class="overload">[2/2]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -152,7 +189,34 @@ template&lt;typename OutT , typename InT &gt; </div>
 </div>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="a14ad48b0189d6bdde06c66f1b567ae87" name="a14ad48b0189d6bdde06c66f1b567ae87"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a14ad48b0189d6bdde06c66f1b567ae87">&#9670;&#160;</a></span>apply() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a14ad48b0189d6bdde06c66f1b567ae87">&#9670;&#160;</a></span>apply() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC OutT <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby</a>&lt; OutT, InT &gt;::apply </td>
+          <td>(</td>
+          <td class="paramtype">InT</td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a14ad48b0189d6bdde06c66f1b567ae87" name="a14ad48b0189d6bdde06c66f1b567ae87"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a14ad48b0189d6bdde06c66f1b567ae87">&#9670;&#160;</a></span>apply() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -179,7 +243,38 @@ template&lt;typename OutT , typename InT &gt; </div>
 </div>
 </div>
 <a id="aaf3a45e25d7abf7a34b48cc612e631ba" name="aaf3a45e25d7abf7a34b48cc612e631ba"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#aaf3a45e25d7abf7a34b48cc612e631ba">&#9670;&#160;</a></span>apply() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#aaf3a45e25d7abf7a34b48cc612e631ba">&#9670;&#160;</a></span>apply() <span class="overload">[3/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">METAL_FUNC OutT <a class="el" href="structmlx_1_1steel_1_1_transform_axpby.html">mlx::steel::TransformAxpby</a>&lt; OutT, InT &gt;::apply </td>
+          <td>(</td>
+          <td class="paramtype">InT</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">OutT</td>          <td class="paramname"><span class="paramname"><em>c</em></span>&#160;) const</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="aaf3a45e25d7abf7a34b48cc612e631ba" name="aaf3a45e25d7abf7a34b48cc612e631ba"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#aaf3a45e25d7abf7a34b48cc612e631ba">&#9670;&#160;</a></span>apply() <span class="overload">[4/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -242,7 +337,8 @@ template&lt;typename OutT , typename InT &gt; </div>
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/structmlx_1_1steel_1_1_transform_none-members.html b/docs/build/html/structmlx_1_1steel_1_1_transform_none-members.html
index 1795af7ae..82002435d 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_transform_none-members.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_transform_none-members.html
@@ -96,6 +96,8 @@ $(function(){ initResizable(false); });
 <table class="directory">
   <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">apply</a>(InT x)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
   <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90">apply</a>(InT x, OutT)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="even"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html#a84daa89be5b3348b5715bf8c5a01da75">apply</a>(InT x)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
+  <tr class="odd"><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html#ae4c397038f386b13eaa386638a0fce90">apply</a>(InT x, OutT)</td><td class="entry"><a class="el" href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone&lt; OutT, InT &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
 </table></div><!-- contents -->
 <!-- start footer part -->
 <hr class="footer"/><address class="footer"><small>
diff --git a/docs/build/html/structmlx_1_1steel_1_1_transform_none.html b/docs/build/html/structmlx_1_1steel_1_1_transform_none.html
index ca73ed3aa..03b35abad 100644
--- a/docs/build/html/structmlx_1_1steel_1_1_transform_none.html
+++ b/docs/build/html/structmlx_1_1steel_1_1_transform_none.html
@@ -95,7 +95,7 @@ $(function(){ initResizable(false); });
 </div><!--header-->
 <div class="contents">
 
-<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
+<p><code>#include &lt;<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a>&gt;</code></p>
 <table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="pub-static-methods" name="pub-static-methods"></a>
 Static Public Member Functions</h2></td></tr>
@@ -103,10 +103,41 @@ Static Public Member Functions</h2></td></tr>
 <tr class="separator:a84daa89be5b3348b5715bf8c5a01da75"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:ae4c397038f386b13eaa386638a0fce90" id="r_ae4c397038f386b13eaa386638a0fce90"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae4c397038f386b13eaa386638a0fce90">apply</a> (InT x, OutT)</td></tr>
 <tr class="separator:ae4c397038f386b13eaa386638a0fce90"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a84daa89be5b3348b5715bf8c5a01da75" id="r_a84daa89be5b3348b5715bf8c5a01da75"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a84daa89be5b3348b5715bf8c5a01da75">apply</a> (InT x)</td></tr>
+<tr class="separator:a84daa89be5b3348b5715bf8c5a01da75"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ae4c397038f386b13eaa386638a0fce90" id="r_ae4c397038f386b13eaa386638a0fce90"><td class="memItemLeft" align="right" valign="top">static METAL_FUNC OutT&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae4c397038f386b13eaa386638a0fce90">apply</a> (InT x, OutT)</td></tr>
+<tr class="separator:ae4c397038f386b13eaa386638a0fce90"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <h2 class="groupheader">Member Function Documentation</h2>
 <a id="a84daa89be5b3348b5715bf8c5a01da75" name="a84daa89be5b3348b5715bf8c5a01da75"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#a84daa89be5b3348b5715bf8c5a01da75">&#9670;&#160;</a></span>apply() <span class="overload">[1/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#a84daa89be5b3348b5715bf8c5a01da75">&#9670;&#160;</a></span>apply() <span class="overload">[1/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC OutT <a class="el" href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone</a>&lt; OutT, InT &gt;::apply </td>
+          <td>(</td>
+          <td class="paramtype">InT</td>          <td class="paramname"><span class="paramname"><em>x</em></span></td><td>)</td>
+          <td></td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<a id="a84daa89be5b3348b5715bf8c5a01da75" name="a84daa89be5b3348b5715bf8c5a01da75"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a84daa89be5b3348b5715bf8c5a01da75">&#9670;&#160;</a></span>apply() <span class="overload">[2/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -133,7 +164,7 @@ template&lt;typename OutT , typename InT &gt; </div>
 </div>
 </div>
 <a id="ae4c397038f386b13eaa386638a0fce90" name="ae4c397038f386b13eaa386638a0fce90"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ae4c397038f386b13eaa386638a0fce90">&#9670;&#160;</a></span>apply() <span class="overload">[2/2]</span></h2>
+<h2 class="memtitle"><span class="permalink"><a href="#ae4c397038f386b13eaa386638a0fce90">&#9670;&#160;</a></span>apply() <span class="overload">[3/4]</span></h2>
 
 <div class="memitem">
 <div class="memproto">
@@ -163,7 +194,39 @@ template&lt;typename OutT , typename InT &gt; </div>
 
 </div>
 </div>
-<hr/>The documentation for this struct was generated from the following file:<ul>
+<a id="ae4c397038f386b13eaa386638a0fce90" name="ae4c397038f386b13eaa386638a0fce90"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ae4c397038f386b13eaa386638a0fce90">&#9670;&#160;</a></span>apply() <span class="overload">[4/4]</span></h2>
+
+<div class="memitem">
+<div class="memproto">
+<div class="memtemplate">
+template&lt;typename OutT , typename InT &gt; </div>
+<table class="mlabels">
+  <tr>
+  <td class="mlabels-left">
+      <table class="memname">
+        <tr>
+          <td class="memname">static METAL_FUNC OutT <a class="el" href="structmlx_1_1steel_1_1_transform_none.html">mlx::steel::TransformNone</a>&lt; OutT, InT &gt;::apply </td>
+          <td>(</td>
+          <td class="paramtype">InT</td>          <td class="paramname"><span class="paramname"><em>x</em></span>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">OutT</td>          <td class="paramname"><span class="paramname"><em></em></span>&#160;)</td>
+        </tr>
+      </table>
+  </td>
+  <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">static</span></span>  </td>
+  </tr>
+</table>
+</div><div class="memdoc">
+
+</div>
+</div>
+<hr/>The documentation for this struct was generated from the following files:<ul>
+<li>mlx/backend/metal/kernels/steel/attn/<a class="el" href="backend_2metal_2kernels_2steel_2attn_2transforms_8h_source.html">transforms.h</a></li>
 <li>mlx/backend/metal/kernels/steel/gemm/<a class="el" href="backend_2metal_2kernels_2steel_2gemm_2transforms_8h_source.html">transforms.h</a></li>
 </ul>
 </div><!-- contents -->
diff --git a/docs/build/html/type__traits_8h_source.html b/docs/build/html/type__traits_8h_source.html
index 1a35bd3d0..ca3288b10 100644
--- a/docs/build/html/type__traits_8h_source.html
+++ b/docs/build/html/type__traits_8h_source.html
@@ -156,7 +156,7 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00053" name="l00053"></a><span class="lineno">   53</span>} <span class="comment">// namespace metal</span></div>
 <div class="line"><a id="l00054" name="l00054"></a><span class="lineno">   54</span> </div>
 <div class="line"><a id="l00055" name="l00055"></a><span class="lineno">   55</span><span class="preprocessor">#pragma METAL internals : disable</span></div>
-<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16.h:265</div></div>
+<div class="ttc" id="anamespacemetal_html"><div class="ttname"><a href="namespacemetal.html">metal</a></div><div class="ttdef"><b>Definition</b> bf16_math.h:226</div></div>
 <div class="ttc" id="anamespacemetal_html_a192322c772aa8b168d59edc55fb806f1"><div class="ttname"><a href="namespacemetal.html#a192322c772aa8b168d59edc55fb806f1">metal::void_t</a></div><div class="ttdeci">typename make_void&lt; Ts... &gt;::type void_t</div><div class="ttdef"><b>Definition</b> type_traits.h:25</div></div>
 <div class="ttc" id="anamespacemetal_html_ac82ee6c3fbe9ec5c78c07329424aaec9"><div class="ttname"><a href="namespacemetal.html#ac82ee6c3fbe9ec5c78c07329424aaec9">metal::pointer_element_t</a></div><div class="ttdeci">typename pointer_element&lt; remove_cv_t&lt; T &gt; &gt;::type pointer_element_t</div><div class="ttdef"><b>Definition</b> type_traits.h:51</div></div>
 <div class="ttc" id="astructmetal_1_1is__empty_html"><div class="ttname"><a href="structmetal_1_1is__empty.html">metal::is_empty</a></div><div class="ttdef"><b>Definition</b> type_traits.h:12</div></div>
diff --git a/docs/build/html/types_2bf16_8h.html b/docs/build/html/types_2bf16_8h.html
index f1405201b..7e2087076 100644
--- a/docs/build/html/types_2bf16_8h.html
+++ b/docs/build/html/types_2bf16_8h.html
@@ -503,7 +503,7 @@ Functions</h2></td></tr>
 <div class="line">  bfloat_binop_helper(_op_, _operator_, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>, uint32_t, <span class="keywordtype">float</span>);     \</div>
 <div class="line">  bfloat_binop_helper(_op_, _operator_, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>, int64_t, <span class="keywordtype">float</span>);      \</div>
 <div class="line">  bfloat_binop_helper(_op_, _operator_, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a>, uint64_t, <span class="keywordtype">float</span>);</div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 <div class="ttc" id="atypes_2bf16_8h_html_a78c92beda4436da9a2e520fa98c59f70"><div class="ttname"><a href="#a78c92beda4436da9a2e520fa98c59f70">bfloat_binop_base</a></div><div class="ttdeci">#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype)</div><div class="ttdef"><b>Definition</b> bf16.h:71</div></div>
 </div><!-- fragment -->
 </div>
@@ -627,7 +627,7 @@ Functions</h2></td></tr>
 <div class="line">    out.<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a> = lhs __op__ rhs.<a class="code hl_variable" href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">bits_</a>;                                       \</div>
 <div class="line">    <span class="keywordflow">return</span> out;                                                             \</div>
 <div class="line">  }</div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html_a4113263b63e3757ea8334cc4f0f5c3c8"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">_MLX_BFloat16::bits_</a></div><div class="ttdeci">uint16_t bits_</div><div class="ttdef"><b>Definition</b> bf16.h:57</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html_a4113263b63e3757ea8334cc4f0f5c3c8"><div class="ttname"><a href="struct___m_l_x___b_float16.html#a4113263b63e3757ea8334cc4f0f5c3c8">_MLX_BFloat16::bits_</a></div><div class="ttdeci">uint16_t bits_</div><div class="ttdef"><b>Definition</b> bf16.h:51</div></div>
 </div><!-- fragment -->
 </div>
 </div>
diff --git a/docs/build/html/types_2bf16_8h_source.html b/docs/build/html/types_2bf16_8h_source.html
index cdd4e5494..a9fdeed46 100644
--- a/docs/build/html/types_2bf16_8h_source.html
+++ b/docs/build/html/types_2bf16_8h_source.html
@@ -202,10 +202,10 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00094" name="l00094"></a><span class="lineno">   94</span><span class="preprocessor">  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);</span></div>
 </div>
 <div class="line"><a id="l00095" name="l00095"></a><span class="lineno">   95</span> </div>
-<div class="line"><a id="l00096" name="l00096"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ac14b984970cafd8fbe24d080949515cc">   96</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(+, <span class="keyword">operator</span>+);</div>
-<div class="line"><a id="l00097" name="l00097"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a622ce842fe44e4b6a95e03242341b459">   97</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(-, <span class="keyword">operator</span>-);</div>
-<div class="line"><a id="l00098" name="l00098"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0cc824d6318f97f7058918ab64ddfc25">   98</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(*, <span class="keyword">operator</span>*);</div>
-<div class="line"><a id="l00099" name="l00099"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a7573ac3b93ddecd69e9c88a26fc84ba9">   99</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(/, <span class="keyword">operator</span>/);</div>
+<div class="line"><a id="l00096" name="l00096"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ac14b984970cafd8fbe24d080949515cc">   96</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(+, <span class="keyword">operator</span>+);</div>
+<div class="line"><a id="l00097" name="l00097"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a622ce842fe44e4b6a95e03242341b459">   97</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(-, <span class="keyword">operator</span>-);</div>
+<div class="line"><a id="l00098" name="l00098"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0cc824d6318f97f7058918ab64ddfc25">   98</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(*, <span class="keyword">operator</span>*);</div>
+<div class="line"><a id="l00099" name="l00099"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a7573ac3b93ddecd69e9c88a26fc84ba9">   99</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a>(/, <span class="keyword">operator</span>/);</div>
 <div class="line"><a id="l00100" name="l00100"></a><span class="lineno">  100</span> </div>
 <div class="line"><a id="l00101" name="l00101"></a><span class="lineno">  101</span><span class="preprocessor">#undef bfloat_binop</span></div>
 <div class="line"><a id="l00102" name="l00102"></a><span class="lineno">  102</span> </div>
@@ -222,12 +222,12 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00112" name="l00112"></a><span class="lineno">  112</span><span class="preprocessor">  bfloat_binop_helper(__op__, __operator__, bool, uint64_t, float);</span></div>
 </div>
 <div class="line"><a id="l00113" name="l00113"></a><span class="lineno">  113</span> </div>
-<div class="line"><a id="l00114" name="l00114"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aedc4e9df4bf71c0ac34fcfae60cdf550">  114</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;, <span class="keyword">operator</span>&gt;);</div>
-<div class="line"><a id="l00115" name="l00115"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a987d631e1508e8df55d98ddd57e4d086">  115</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;, <span class="keyword">operator</span>&lt;);</div>
-<div class="line"><a id="l00116" name="l00116"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a8494764f5c686743ede66dc76d85d955">  116</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;=, <span class="keyword">operator</span>&gt;=);</div>
-<div class="line"><a id="l00117" name="l00117"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0066a47cb21223ddebc77992ee874fb9">  117</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;=, <span class="keyword">operator</span>&lt;=);</div>
-<div class="line"><a id="l00118" name="l00118"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aec63a0472cb943fe39f31e7678555572">  118</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(==, <span class="keyword">operator</span>==);</div>
-<div class="line"><a id="l00119" name="l00119"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ad2f9e1c230ec35d5c406dd616e8f4dea">  119</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(!=, <span class="keyword">operator</span>!=);</div>
+<div class="line"><a id="l00114" name="l00114"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aedc4e9df4bf71c0ac34fcfae60cdf550">  114</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;, <span class="keyword">operator</span>&gt;);</div>
+<div class="line"><a id="l00115" name="l00115"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a987d631e1508e8df55d98ddd57e4d086">  115</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;, <span class="keyword">operator</span>&lt;);</div>
+<div class="line"><a id="l00116" name="l00116"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a8494764f5c686743ede66dc76d85d955">  116</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&gt;=, <span class="keyword">operator</span>&gt;=);</div>
+<div class="line"><a id="l00117" name="l00117"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0066a47cb21223ddebc77992ee874fb9">  117</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(&lt;=, <span class="keyword">operator</span>&lt;=);</div>
+<div class="line"><a id="l00118" name="l00118"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#aec63a0472cb943fe39f31e7678555572">  118</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(==, <span class="keyword">operator</span>==);</div>
+<div class="line"><a id="l00119" name="l00119"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#ad2f9e1c230ec35d5c406dd616e8f4dea">  119</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a>(!=, <span class="keyword">operator</span>!=);</div>
 <div class="line"><a id="l00120" name="l00120"></a><span class="lineno">  120</span> </div>
 <div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span><span class="preprocessor">#undef bfloat_compop</span></div>
 <div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span> </div>
@@ -251,10 +251,10 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span><span class="preprocessor">  }</span></div>
 </div>
 <div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span> </div>
-<div class="line"><a id="l00139" name="l00139"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a9f2c9d2f21fbf9fbbacd940c6967c9d1">  139</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(+, <span class="keyword">operator</span>+=);</div>
-<div class="line"><a id="l00140" name="l00140"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a8b8a55690df46d97fcfc2a60120783af">  140</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(-, <span class="keyword">operator</span>-=);</div>
-<div class="line"><a id="l00141" name="l00141"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0dd3893abc8986901872c8365ab1509d">  141</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(*, <span class="keyword">operator</span>*=);</div>
-<div class="line"><a id="l00142" name="l00142"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a045ff27257cb6d8ab7a94771ba5a17e6">  142</a></span><a class="code hl_define" href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(/, <span class="keyword">operator</span>/=);</div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a9f2c9d2f21fbf9fbbacd940c6967c9d1">  139</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(+, <span class="keyword">operator</span>+=);</div>
+<div class="line"><a id="l00140" name="l00140"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a8b8a55690df46d97fcfc2a60120783af">  140</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(-, <span class="keyword">operator</span>-=);</div>
+<div class="line"><a id="l00141" name="l00141"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a0dd3893abc8986901872c8365ab1509d">  141</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(*, <span class="keyword">operator</span>*=);</div>
+<div class="line"><a id="l00142" name="l00142"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core.html#a045ff27257cb6d8ab7a94771ba5a17e6">  142</a></span><a class="code hl_define" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a>(/, <span class="keyword">operator</span>/=);</div>
 <div class="line"><a id="l00143" name="l00143"></a><span class="lineno">  143</span> </div>
 <div class="line"><a id="l00144" name="l00144"></a><span class="lineno">  144</span><span class="preprocessor">#undef bfloat_inplace_op</span></div>
 <div class="line"><a id="l00145" name="l00145"></a><span class="lineno">  145</span> </div>
@@ -304,9 +304,9 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00185" name="l00185"></a><span class="lineno">  185</span><span class="preprocessor">#undef bfloat_inplace_bitop</span></div>
 <div class="line"><a id="l00186" name="l00186"></a><span class="lineno">  186</span> </div>
 <div class="line"><a id="l00187" name="l00187"></a><span class="lineno">  187</span>} <span class="comment">// namespace mlx::core</span></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a330a0883503cb640f1cf628a7ca50239"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a></div><div class="ttdeci">#define bfloat_compop(__op__, __operator__)</div><div class="ttdef"><b>Definition</b> bf16.h:173</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a4ac82467fbc674e990090f482b9c1e5c"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a></div><div class="ttdeci">#define bfloat_inplace_op(itype)</div><div class="ttdef"><b>Definition</b> bf16.h:214</div></div>
-<div class="ttc" id="abackend_2metal_2kernels_2bf16_8h_html_a7694892a131c0e31e5153c088cccb707"><div class="ttname"><a href="backend_2metal_2kernels_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a></div><div class="ttdeci">#define bfloat_binop(_op_, _operator_)</div><div class="ttdef"><b>Definition</b> bf16.h:156</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a330a0883503cb640f1cf628a7ca50239"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a330a0883503cb640f1cf628a7ca50239">bfloat_compop</a></div><div class="ttdeci">#define bfloat_compop(__op__, __operator__)</div><div class="ttdef"><b>Definition</b> bf16.h:167</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a4ac82467fbc674e990090f482b9c1e5c"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a4ac82467fbc674e990090f482b9c1e5c">bfloat_inplace_op</a></div><div class="ttdeci">#define bfloat_inplace_op(itype)</div><div class="ttdef"><b>Definition</b> bf16.h:208</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2metal__3__0_2bf16_8h_html_a7694892a131c0e31e5153c088cccb707"><div class="ttname"><a href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7694892a131c0e31e5153c088cccb707">bfloat_binop</a></div><div class="ttdeci">#define bfloat_binop(_op_, _operator_)</div><div class="ttdef"><b>Definition</b> bf16.h:150</div></div>
 <div class="ttc" id="agroup__ops_html_gade2eea48989f4caaf36e89f7bd2a8816"><div class="ttname"><a href="group__ops.html#gade2eea48989f4caaf36e89f7bd2a8816">mlx::core::operator-</a></div><div class="ttdeci">array operator-(const array &amp;a)</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1___m_l_x___b_float16_html"><div class="ttname"><a href="structmlx_1_1core_1_1___m_l_x___b_float16.html">mlx::core::_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:21</div></div>
diff --git a/docs/build/html/types_2complex_8h.html b/docs/build/html/types_2complex_8h.html
index 1d4b91255..ade84b8e1 100644
--- a/docs/build/html/types_2complex_8h.html
+++ b/docs/build/html/types_2complex_8h.html
@@ -167,9 +167,9 @@ Functions</h2></td></tr>
 <tr class="separator:ad38b38a3faf050735d45eed4438ee27a"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a358e66ff205bda3e8542427b6d2edadc" id="r_a358e66ff205bda3e8542427b6d2edadc"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a358e66ff205bda3e8542427b6d2edadc">mlx::core::operator+</a> (const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;x, <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> y)</td></tr>
 <tr class="separator:a358e66ff205bda3e8542427b6d2edadc"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:af56d4b85e329e39a825c01a50e3a2522" id="r_af56d4b85e329e39a825c01a50e3a2522"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#af56d4b85e329e39a825c01a50e3a2522">mlx::core::operator+</a> (<a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;y)</td></tr>
+<tr class="memitem:af56d4b85e329e39a825c01a50e3a2522" id="r_af56d4b85e329e39a825c01a50e3a2522"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#af56d4b85e329e39a825c01a50e3a2522">mlx::core::operator+</a> (<a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> x, const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;y)</td></tr>
 <tr class="separator:af56d4b85e329e39a825c01a50e3a2522"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a806a495a129ebaab69cc57ca7db831d6" id="r_a806a495a129ebaab69cc57ca7db831d6"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a806a495a129ebaab69cc57ca7db831d6">mlx::core::operator+</a> (const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;x, <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
+<tr class="memitem:a806a495a129ebaab69cc57ca7db831d6" id="r_a806a495a129ebaab69cc57ca7db831d6"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a806a495a129ebaab69cc57ca7db831d6">mlx::core::operator+</a> (const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;x, <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> y)</td></tr>
 <tr class="separator:a806a495a129ebaab69cc57ca7db831d6"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a09fc6ebda917969383783a112a8547e7" id="r_a09fc6ebda917969383783a112a8547e7"><td class="memItemLeft" align="right" valign="top"><a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a>&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a09fc6ebda917969383783a112a8547e7">mlx::core::operator+</a> (float x, const <a class="el" href="structmlx_1_1core_1_1complex64__t.html">complex64_t</a> &amp;y)</td></tr>
 <tr class="separator:a09fc6ebda917969383783a112a8547e7"><td class="memSeparator" colspan="2">&#160;</td></tr>
@@ -222,8 +222,8 @@ Variables</h2></td></tr>
 <div class="line">  complex_binop_helper(_op_, _operator_, <a class="code hl_typedef" href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a>)                                   \</div>
 <div class="line">  complex_binop_helper(_op_, _operator_, <a class="code hl_struct" href="struct___m_l_x___b_float16.html">bfloat16_t</a>)                                  \</div>
 <div class="line">  complex_binop_helper(_op_, _operator_, <span class="keywordtype">float</span>)</div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_acb8ddf4a29129846b673c50ba7078773"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a></div><div class="ttdeci">half float16_t</div><div class="ttdef"><b>Definition</b> utils.h:10</div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_acb8ddf4a29129846b673c50ba7078773"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#acb8ddf4a29129846b673c50ba7078773">float16_t</a></div><div class="ttdeci">half float16_t</div><div class="ttdef"><b>Definition</b> utils.h:16</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 <div class="ttc" id="astructcomplex64__t_html"><div class="ttname"><a href="structcomplex64__t.html">complex64_t</a></div><div class="ttdef"><b>Definition</b> complex.h:20</div></div>
 </div><!-- fragment -->
 </div>
diff --git a/docs/build/html/unary__ops_8h_source.html b/docs/build/html/unary__ops_8h_source.html
index d1b20270d..2be8e3ced 100644
--- a/docs/build/html/unary__ops_8h_source.html
+++ b/docs/build/html/unary__ops_8h_source.html
@@ -709,35 +709,35 @@ $(function(){ initResizable(false); });
 </div>
 <div class="line"><a id="l00414" name="l00414"></a><span class="lineno">  414</span>};</div>
 </div>
-<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a27c03f2f90ab56db2e4d59559a3d2e9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a></div><div class="ttdeci">float log1p(float x)</div><div class="ttdef"><b>Definition</b> utils.h:277</div></div>
+<div class="ttc" id="abackend_2metal_2kernels_2utils_8h_html_a27c03f2f90ab56db2e4d59559a3d2e9a"><div class="ttname"><a href="backend_2metal_2kernels_2utils_8h.html#a27c03f2f90ab56db2e4d59559a3d2e9a">log1p</a></div><div class="ttdeci">float log1p(float x)</div><div class="ttdef"><b>Definition</b> utils.h:318</div></div>
 <div class="ttc" id="aerf_8h_html"><div class="ttname"><a href="erf_8h.html">erf.h</a></div></div>
 <div class="ttc" id="aerf_8h_html_a1846e0d683c7aff826bb32addcc3b885"><div class="ttname"><a href="erf_8h.html#a1846e0d683c7aff826bb32addcc3b885">erfinv</a></div><div class="ttdeci">float erfinv(float a)</div><div class="ttdef"><b>Definition</b> erf.h:42</div></div>
 <div class="ttc" id="aerf_8h_html_a6ce199ee56105c67adbf8c48c019a8b2"><div class="ttname"><a href="erf_8h.html#a6ce199ee56105c67adbf8c48c019a8b2">erf</a></div><div class="ttdeci">float erf(float a)</div><div class="ttdef"><b>Definition</b> erf.h:11</div></div>
 <div class="ttc" id="aexpm1f_8h_html"><div class="ttname"><a href="expm1f_8h.html">expm1f.h</a></div></div>
 <div class="ttc" id="aexpm1f_8h_html_a87f66d30e185950f42ce3641783cdc40"><div class="ttname"><a href="expm1f_8h.html#a87f66d30e185950f42ce3641783cdc40">expm1f</a></div><div class="ttdeci">float expm1f(float a)</div><div class="ttdef"><b>Definition</b> expm1f.h:80</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a1f489fabffab969b8677b56bb1136067"><div class="ttname"><a href="namespacemetal_1_1precise.html#a1f489fabffab969b8677b56bb1136067">metal::precise::acosh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t acosh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a341c2b8c27d1bed860f85f8b355023d4"><div class="ttname"><a href="namespacemetal_1_1precise.html#a341c2b8c27d1bed860f85f8b355023d4">metal::precise::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a44239067e8e9248b1574353f98e94d72"><div class="ttname"><a href="namespacemetal_1_1precise.html#a44239067e8e9248b1574353f98e94d72">metal::precise::log10</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log10(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a632dbbdcc1a465cf4739a14306147573"><div class="ttname"><a href="namespacemetal_1_1precise.html#a632dbbdcc1a465cf4739a14306147573">metal::precise::log2</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log2(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a71acf77ffd29c56f56afae0195c98a1c"><div class="ttname"><a href="namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c">metal::precise::sin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a72d86d508300a9b58f4ccbbe70da4fbc"><div class="ttname"><a href="namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc">metal::precise::cosh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cosh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a741c27a10cc968dd1e63473d9fcd8f99"><div class="ttname"><a href="namespacemetal_1_1precise.html#a741c27a10cc968dd1e63473d9fcd8f99">metal::precise::tanh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t tanh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a8261ed22f03122ef15b89512358acb1f"><div class="ttname"><a href="namespacemetal_1_1precise.html#a8261ed22f03122ef15b89512358acb1f">metal::precise::tan</a></div><div class="ttdeci">METAL_FUNC bfloat16_t tan(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a8a2bcc89fc0b7e74f0453f82f89a8604"><div class="ttname"><a href="namespacemetal_1_1precise.html#a8a2bcc89fc0b7e74f0453f82f89a8604">metal::precise::acos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t acos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a8d8d2d5700ce432b33cf47cf22528e8f"><div class="ttname"><a href="namespacemetal_1_1precise.html#a8d8d2d5700ce432b33cf47cf22528e8f">metal::precise::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_a902994837653b90c47f4285673e712c4"><div class="ttname"><a href="namespacemetal_1_1precise.html#a902994837653b90c47f4285673e712c4">metal::precise::atanh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atanh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_aaad1cdde6687c8011fbc5fda1bb13424"><div class="ttname"><a href="namespacemetal_1_1precise.html#aaad1cdde6687c8011fbc5fda1bb13424">metal::precise::asinh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t asinh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_aaaf4b5f4786a912089bbf0ae7619a6be"><div class="ttname"><a href="namespacemetal_1_1precise.html#aaaf4b5f4786a912089bbf0ae7619a6be">metal::precise::atan</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atan(bfloat16_t y_over_x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_abc8f4f59dd6e7204ab5d84f0af96331c"><div class="ttname"><a href="namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c">metal::precise::sinh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sinh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_ac4941f62e7d8ab9d7cabbd967aa9f220"><div class="ttname"><a href="namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220">metal::precise::cos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_acb213467361cd2cab93a8d5ea1aa5bfd"><div class="ttname"><a href="namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd">metal::precise::sqrt</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sqrt(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_adc7b8b6e12e320cb32030f728dcbf438"><div class="ttname"><a href="namespacemetal_1_1precise.html#adc7b8b6e12e320cb32030f728dcbf438">metal::precise::asin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t asin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_1_1precise_html_afb397b477745f12a44423934fa2b05ac"><div class="ttname"><a href="namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac">metal::precise::rsqrt</a></div><div class="ttdeci">METAL_FUNC bfloat16_t rsqrt(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:252</div></div>
-<div class="ttc" id="anamespacemetal_html_a020790f30c28a9982c4a83deaa258277"><div class="ttname"><a href="namespacemetal.html#a020790f30c28a9982c4a83deaa258277">metal::floor</a></div><div class="ttdeci">METAL_FUNC bfloat16_t floor(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a29ab6060527120eee745aec0daa06e01"><div class="ttname"><a href="namespacemetal.html#a29ab6060527120eee745aec0daa06e01">metal::rint</a></div><div class="ttdeci">METAL_FUNC bfloat16_t rint(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_a87c5122c60f9a12afceb9925a5b78ffb"><div class="ttname"><a href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a></div><div class="ttdeci">METAL_FUNC bfloat16_t abs(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_ac2a0b3618d922ac014baac8189d44650"><div class="ttname"><a href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
-<div class="ttc" id="anamespacemetal_html_ad63204d38bc01df6ffc64583f7886b3c"><div class="ttname"><a href="namespacemetal.html#ad63204d38bc01df6ffc64583f7886b3c">metal::ceil</a></div><div class="ttdeci">METAL_FUNC bfloat16_t ceil(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:234</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a1f489fabffab969b8677b56bb1136067"><div class="ttname"><a href="namespacemetal_1_1precise.html#a1f489fabffab969b8677b56bb1136067">metal::precise::acosh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t acosh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a341c2b8c27d1bed860f85f8b355023d4"><div class="ttname"><a href="namespacemetal_1_1precise.html#a341c2b8c27d1bed860f85f8b355023d4">metal::precise::log</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a44239067e8e9248b1574353f98e94d72"><div class="ttname"><a href="namespacemetal_1_1precise.html#a44239067e8e9248b1574353f98e94d72">metal::precise::log10</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log10(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a632dbbdcc1a465cf4739a14306147573"><div class="ttname"><a href="namespacemetal_1_1precise.html#a632dbbdcc1a465cf4739a14306147573">metal::precise::log2</a></div><div class="ttdeci">METAL_FUNC bfloat16_t log2(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a71acf77ffd29c56f56afae0195c98a1c"><div class="ttname"><a href="namespacemetal_1_1precise.html#a71acf77ffd29c56f56afae0195c98a1c">metal::precise::sin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a72d86d508300a9b58f4ccbbe70da4fbc"><div class="ttname"><a href="namespacemetal_1_1precise.html#a72d86d508300a9b58f4ccbbe70da4fbc">metal::precise::cosh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cosh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a741c27a10cc968dd1e63473d9fcd8f99"><div class="ttname"><a href="namespacemetal_1_1precise.html#a741c27a10cc968dd1e63473d9fcd8f99">metal::precise::tanh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t tanh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a8261ed22f03122ef15b89512358acb1f"><div class="ttname"><a href="namespacemetal_1_1precise.html#a8261ed22f03122ef15b89512358acb1f">metal::precise::tan</a></div><div class="ttdeci">METAL_FUNC bfloat16_t tan(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a8a2bcc89fc0b7e74f0453f82f89a8604"><div class="ttname"><a href="namespacemetal_1_1precise.html#a8a2bcc89fc0b7e74f0453f82f89a8604">metal::precise::acos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t acos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a8d8d2d5700ce432b33cf47cf22528e8f"><div class="ttname"><a href="namespacemetal_1_1precise.html#a8d8d2d5700ce432b33cf47cf22528e8f">metal::precise::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_a902994837653b90c47f4285673e712c4"><div class="ttname"><a href="namespacemetal_1_1precise.html#a902994837653b90c47f4285673e712c4">metal::precise::atanh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atanh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_aaad1cdde6687c8011fbc5fda1bb13424"><div class="ttname"><a href="namespacemetal_1_1precise.html#aaad1cdde6687c8011fbc5fda1bb13424">metal::precise::asinh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t asinh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_aaaf4b5f4786a912089bbf0ae7619a6be"><div class="ttname"><a href="namespacemetal_1_1precise.html#aaaf4b5f4786a912089bbf0ae7619a6be">metal::precise::atan</a></div><div class="ttdeci">METAL_FUNC bfloat16_t atan(bfloat16_t y_over_x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_abc8f4f59dd6e7204ab5d84f0af96331c"><div class="ttname"><a href="namespacemetal_1_1precise.html#abc8f4f59dd6e7204ab5d84f0af96331c">metal::precise::sinh</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sinh(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_ac4941f62e7d8ab9d7cabbd967aa9f220"><div class="ttname"><a href="namespacemetal_1_1precise.html#ac4941f62e7d8ab9d7cabbd967aa9f220">metal::precise::cos</a></div><div class="ttdeci">METAL_FUNC bfloat16_t cos(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_acb213467361cd2cab93a8d5ea1aa5bfd"><div class="ttname"><a href="namespacemetal_1_1precise.html#acb213467361cd2cab93a8d5ea1aa5bfd">metal::precise::sqrt</a></div><div class="ttdeci">METAL_FUNC bfloat16_t sqrt(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_adc7b8b6e12e320cb32030f728dcbf438"><div class="ttname"><a href="namespacemetal_1_1precise.html#adc7b8b6e12e320cb32030f728dcbf438">metal::precise::asin</a></div><div class="ttdeci">METAL_FUNC bfloat16_t asin(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_1_1precise_html_afb397b477745f12a44423934fa2b05ac"><div class="ttname"><a href="namespacemetal_1_1precise.html#afb397b477745f12a44423934fa2b05ac">metal::precise::rsqrt</a></div><div class="ttdeci">METAL_FUNC bfloat16_t rsqrt(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:250</div></div>
+<div class="ttc" id="anamespacemetal_html_a020790f30c28a9982c4a83deaa258277"><div class="ttname"><a href="namespacemetal.html#a020790f30c28a9982c4a83deaa258277">metal::floor</a></div><div class="ttdeci">METAL_FUNC bfloat16_t floor(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a29ab6060527120eee745aec0daa06e01"><div class="ttname"><a href="namespacemetal.html#a29ab6060527120eee745aec0daa06e01">metal::rint</a></div><div class="ttdeci">METAL_FUNC bfloat16_t rint(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_a87c5122c60f9a12afceb9925a5b78ffb"><div class="ttname"><a href="namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb">metal::abs</a></div><div class="ttdeci">METAL_FUNC bfloat16_t abs(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_ac2a0b3618d922ac014baac8189d44650"><div class="ttname"><a href="namespacemetal.html#ac2a0b3618d922ac014baac8189d44650">metal::exp</a></div><div class="ttdeci">METAL_FUNC bfloat16_t exp(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
+<div class="ttc" id="anamespacemetal_html_ad63204d38bc01df6ffc64583f7886b3c"><div class="ttname"><a href="namespacemetal.html#ad63204d38bc01df6ffc64583f7886b3c">metal::ceil</a></div><div class="ttdeci">METAL_FUNC bfloat16_t ceil(bfloat16_t x)</div><div class="ttdef"><b>Definition</b> bf16_math.h:232</div></div>
 <div class="ttc" id="astruct_abs_html"><div class="ttname"><a href="struct_abs.html">Abs</a></div><div class="ttdef"><b>Definition</b> unary_ops.h:15</div></div>
 <div class="ttc" id="astruct_abs_html_a0ca113fd036151c443df3f83cc667f28"><div class="ttname"><a href="struct_abs.html#a0ca113fd036151c443df3f83cc667f28">Abs::operator()</a></div><div class="ttdeci">uint8_t operator()(uint8_t x)</div><div class="ttdef"><b>Definition</b> unary_ops.h:21</div></div>
 <div class="ttc" id="astruct_abs_html_a99d2a2f37a6cddd3168b0224f2a9b963"><div class="ttname"><a href="struct_abs.html#a99d2a2f37a6cddd3168b0224f2a9b963">Abs::operator()</a></div><div class="ttdeci">uint32_t operator()(uint32_t x)</div><div class="ttdef"><b>Definition</b> unary_ops.h:29</div></div>
diff --git a/docs/build/html/usage/compile.html b/docs/build/html/usage/compile.html
index b8dda3452..45d10dfa1 100644
--- a/docs/build/html/usage/compile.html
+++ b/docs/build/html/usage/compile.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Compilation &#8212; MLX 0.20.0 documentation</title>
+    <title>Compilation &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Function Transforms" href="function_transforms.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/distributed.html b/docs/build/html/usage/distributed.html
index 3f3b15ce4..d0883736c 100644
--- a/docs/build/html/usage/distributed.html
+++ b/docs/build/html/usage/distributed.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Distributed Communication &#8212; MLX 0.20.0 documentation</title>
+    <title>Distributed Communication &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Conversion to NumPy and Other Frameworks" href="numpy.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/function_transforms.html b/docs/build/html/usage/function_transforms.html
index ac1423603..23ed5d63d 100644
--- a/docs/build/html/usage/function_transforms.html
+++ b/docs/build/html/usage/function_transforms.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Function Transforms &#8212; MLX 0.20.0 documentation</title>
+    <title>Function Transforms &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Saving and Loading Arrays" href="saving_and_loading.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
@@ -1005,8 +1006,8 @@ where the vectorized axes should be in the outputs.</p>
 <span class="nb">print</span><span class="p">(</span><span class="n">timeit</span><span class="o">.</span><span class="n">timeit</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">mx</span><span class="o">.</span><span class="n">eval</span><span class="p">(</span><span class="n">vmap_add</span><span class="p">(</span><span class="n">xs</span><span class="p">,</span> <span class="n">ys</span><span class="p">)),</span> <span class="n">number</span><span class="o">=</span><span class="mi">100</span><span class="p">))</span>
 </pre></div>
 </div>
-<p>On an M1 Max the naive version takes in total <code class="docutils literal notranslate"><span class="pre">0.390</span></code> seconds whereas the
-vectorized version takes only <code class="docutils literal notranslate"><span class="pre">0.025</span></code> seconds, more than ten times faster.</p>
+<p>On an M1 Max the naive version takes in total <code class="docutils literal notranslate"><span class="pre">5.639</span></code> seconds whereas the
+vectorized version takes only <code class="docutils literal notranslate"><span class="pre">0.024</span></code> seconds, more than 200 times faster.</p>
 <p>Of course, this operation is quite contrived. A better approach is to simply do
 <code class="docutils literal notranslate"><span class="pre">xs</span> <span class="pre">+</span> <span class="pre">ys.T</span></code>, but for more complex functions <a class="reference internal" href="../python/_autosummary/mlx.core.vmap.html#mlx.core.vmap" title="mlx.core.vmap"><code class="xref py py-func docutils literal notranslate"><span class="pre">vmap()</span></code></a> can be quite handy.</p>
 </section>
diff --git a/docs/build/html/usage/indexing.html b/docs/build/html/usage/indexing.html
index d34af9407..a95e2abf9 100644
--- a/docs/build/html/usage/indexing.html
+++ b/docs/build/html/usage/indexing.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Indexing Arrays &#8212; MLX 0.20.0 documentation</title>
+    <title>Indexing Arrays &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Unified Memory" href="unified_memory.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/lazy_evaluation.html b/docs/build/html/usage/lazy_evaluation.html
index da2b8f68e..6b991ad70 100644
--- a/docs/build/html/usage/lazy_evaluation.html
+++ b/docs/build/html/usage/lazy_evaluation.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Lazy Evaluation &#8212; MLX 0.20.0 documentation</title>
+    <title>Lazy Evaluation &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Quick Start Guide" href="quick_start.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/numpy.html b/docs/build/html/usage/numpy.html
index 6593dc81e..d80738733 100644
--- a/docs/build/html/usage/numpy.html
+++ b/docs/build/html/usage/numpy.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Conversion to NumPy and Other Frameworks &#8212; MLX 0.20.0 documentation</title>
+    <title>Conversion to NumPy and Other Frameworks &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Compilation" href="compile.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/quick_start.html b/docs/build/html/usage/quick_start.html
index 5639e9f1a..6c87d32f9 100644
--- a/docs/build/html/usage/quick_start.html
+++ b/docs/build/html/usage/quick_start.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Quick Start Guide &#8212; MLX 0.20.0 documentation</title>
+    <title>Quick Start Guide &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Build and Install" href="../install.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/saving_and_loading.html b/docs/build/html/usage/saving_and_loading.html
index 7e44b7a8a..89573984b 100644
--- a/docs/build/html/usage/saving_and_loading.html
+++ b/docs/build/html/usage/saving_and_loading.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Saving and Loading Arrays &#8212; MLX 0.20.0 documentation</title>
+    <title>Saving and Loading Arrays &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Indexing Arrays" href="indexing.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/unified_memory.html b/docs/build/html/usage/unified_memory.html
index 1c7207ddc..c32b609f5 100644
--- a/docs/build/html/usage/unified_memory.html
+++ b/docs/build/html/usage/unified_memory.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Unified Memory &#8212; MLX 0.20.0 documentation</title>
+    <title>Unified Memory &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Lazy Evaluation" href="lazy_evaluation.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/usage/using_streams.html b/docs/build/html/usage/using_streams.html
index fb91ff46e..25c3fdefd 100644
--- a/docs/build/html/usage/using_streams.html
+++ b/docs/build/html/usage/using_streams.html
@@ -8,7 +8,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Using Streams &#8212; MLX 0.20.0 documentation</title>
+    <title>Using Streams &#8212; MLX 0.21.0 documentation</title>
   
   
   
@@ -39,7 +39,7 @@
   <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=26a4bc78f4c0ddb94549" />
 <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=26a4bc78f4c0ddb94549" />
 
-    <script src="../_static/documentation_options.js?v=eb97cb82"></script>
+    <script src="../_static/documentation_options.js?v=174dfe6e"></script>
     <script src="../_static/doctools.js?v=9a2dae69"></script>
     <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
@@ -51,7 +51,7 @@
     <link rel="prev" title="Distributed Communication" href="distributed.html" />
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0" />
+  <meta name="docsearch:version" content="0.21.0" />
   </head>
   
   
@@ -130,8 +130,8 @@
       
     
     
-    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.20.0 documentation - Home"/>
-    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.20.0 documentation - Home"/>
+    <img src="../_static/mlx_logo.png" class="logo__image only-light" alt="MLX 0.21.0 documentation - Home"/>
+    <img src="../_static/mlx_logo_dark.png" class="logo__image only-dark pst-js-only" alt="MLX 0.21.0 documentation - Home"/>
   
   
 </a></div>
@@ -444,7 +444,6 @@
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.layer_norm.html">mlx.core.fast.layer_norm</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.rope.html">mlx.core.fast.rope</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html">mlx.core.fast.scaled_dot_product_attention</a></li>
-<li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.affine_quantize.html">mlx.core.fast.affine_quantize</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../python/_autosummary/mlx.core.fast.metal_kernel.html">mlx.core.fast.metal_kernel</a></li>
 </ul>
 </details></li>
@@ -521,6 +520,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.ALiBi.html">mlx.nn.ALiBi</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool1d.html">mlx.nn.AvgPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool2d.html">mlx.nn.AvgPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.AvgPool3d.html">mlx.nn.AvgPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.BatchNorm.html">mlx.nn.BatchNorm</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.CELU.html">mlx.nn.CELU</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Conv1d.html">mlx.nn.Conv1d</a></li>
@@ -550,6 +550,7 @@
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.LSTM.html">mlx.nn.LSTM</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool1d.html">mlx.nn.MaxPool1d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool2d.html">mlx.nn.MaxPool2d</a></li>
+<li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MaxPool3d.html">mlx.nn.MaxPool3d</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.Mish.html">mlx.nn.Mish</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.MultiHeadAttention.html">mlx.nn.MultiHeadAttention</a></li>
 <li class="toctree-l3"><a class="reference internal" href="../python/nn/_autosummary/mlx.nn.PReLU.html">mlx.nn.PReLU</a></li>
diff --git a/docs/build/html/utils_8h.html b/docs/build/html/utils_8h.html
index 758422d83..228c93a04 100644
--- a/docs/build/html/utils_8h.html
+++ b/docs/build/html/utils_8h.html
@@ -118,6 +118,8 @@ Namespaces</h2></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html">mlx::core</a></td></tr>
 <tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:"><td class="memItemLeft" align="right" valign="top">namespace &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1env.html">mlx::core::env</a></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="typedef-members" name="typedef-members"></a>
 Typedefs</h2></td></tr>
@@ -166,12 +168,18 @@ Functions</h2></td></tr>
 <tr class="separator:a42a19c8442b173606e714364227e7d45"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a57eb97a5eba99a846ac429795e407574" id="r_a57eb97a5eba99a846ac429795e407574"><td class="memItemLeft" align="right" valign="top">std::ostream &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a57eb97a5eba99a846ac429795e407574">mlx::core::operator&lt;&lt;</a> (std::ostream &amp;os, const <a class="el" href="namespacemlx_1_1core.html#afbd2769c30e721afc85a7b9fb55b8e52">float16_t</a> &amp;v)</td></tr>
 <tr class="separator:a57eb97a5eba99a846ac429795e407574"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a7db909d54cf07375e89424c32c07a29c" id="r_a7db909d54cf07375e89424c32c07a29c"><td class="memItemLeft" align="right" valign="top">std::ostream &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a7db909d54cf07375e89424c32c07a29c">mlx::core::operator&lt;&lt;</a> (std::ostream &amp;os, const <a class="el" href="backend_2metal_2kernels_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &amp;v)</td></tr>
+<tr class="memitem:a7db909d54cf07375e89424c32c07a29c" id="r_a7db909d54cf07375e89424c32c07a29c"><td class="memItemLeft" align="right" valign="top">std::ostream &amp;&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a7db909d54cf07375e89424c32c07a29c">mlx::core::operator&lt;&lt;</a> (std::ostream &amp;os, const <a class="el" href="backend_2metal_2kernels_2metal__3__0_2bf16_8h.html#a7782de82393104dd4ad754ce3b316e82">bfloat16_t</a> &amp;v)</td></tr>
 <tr class="separator:a7db909d54cf07375e89424c32c07a29c"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:adacbc4526e8964b267a8ec3eb1bc1a32" id="r_adacbc4526e8964b267a8ec3eb1bc1a32"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#adacbc4526e8964b267a8ec3eb1bc1a32">mlx::core::is_power_of_2</a> (int n)</td></tr>
 <tr class="separator:adacbc4526e8964b267a8ec3eb1bc1a32"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:a685c0530e338aabc622325685846ce93" id="r_a685c0530e338aabc622325685846ce93"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core.html#a685c0530e338aabc622325685846ce93">mlx::core::next_power_of_2</a> (int n)</td></tr>
 <tr class="separator:a685c0530e338aabc622325685846ce93"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a0efecbf9efe695adafad12b5a4945df3" id="r_a0efecbf9efe695adafad12b5a4945df3"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">mlx::core::env::get_var</a> (const char *name, int default_value)</td></tr>
+<tr class="separator:a0efecbf9efe695adafad12b5a4945df3"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ac3266e1259a64c8b56bdc6c7029179f2" id="r_ac3266e1259a64c8b56bdc6c7029179f2"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2">mlx::core::env::bfs_max_width</a> ()</td></tr>
+<tr class="separator:ac3266e1259a64c8b56bdc6c7029179f2"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:aedbf4e739553024c33dd0094dd9107aa" id="r_aedbf4e739553024c33dd0094dd9107aa"><td class="memItemLeft" align="right" valign="top">int&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa">mlx::core::env::max_ops_per_buffer</a> ()</td></tr>
+<tr class="separator:aedbf4e739553024c33dd0094dd9107aa"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table><table class="memberdecls">
 <tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="var-members" name="var-members"></a>
 Variables</h2></td></tr>
diff --git a/docs/build/html/utils_8h_source.html b/docs/build/html/utils_8h_source.html
index edd6e2180..22d36ebfd 100644
--- a/docs/build/html/utils_8h_source.html
+++ b/docs/build/html/utils_8h_source.html
@@ -230,7 +230,29 @@ $(function(){ initResizable(false); });
 <div class="line"><a id="l00121" name="l00121"></a><span class="lineno">  121</span>}</div>
 </div>
 <div class="line"><a id="l00122" name="l00122"></a><span class="lineno">  122</span> </div>
-<div class="line"><a id="l00123" name="l00123"></a><span class="lineno">  123</span>} <span class="comment">// namespace mlx::core</span></div>
+<div class="foldopen" id="foldopen00123" data-start="{" data-end="}">
+<div class="line"><a id="l00123" name="l00123"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1env.html">  123</a></span><span class="keyword">namespace </span>env {</div>
+<div class="line"><a id="l00124" name="l00124"></a><span class="lineno">  124</span> </div>
+<div class="line"><a id="l00125" name="l00125"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">  125</a></span><span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">get_var</a>(<span class="keyword">const</span> <span class="keywordtype">char</span>* name, <span class="keywordtype">int</span> default_value);</div>
+<div class="line"><a id="l00126" name="l00126"></a><span class="lineno">  126</span> </div>
+<div class="foldopen" id="foldopen00127" data-start="{" data-end="}">
+<div class="line"><a id="l00127" name="l00127"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2">  127</a></span><span class="keyword">inline</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2">bfs_max_width</a>() {</div>
+<div class="line"><a id="l00128" name="l00128"></a><span class="lineno">  128</span>  <span class="keyword">static</span> <span class="keywordtype">int</span> bfs_max_width_ = <a class="code hl_function" href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">get_var</a>(<span class="stringliteral">&quot;MLX_BFS_MAX_WIDTH&quot;</span>, 20);</div>
+<div class="line"><a id="l00129" name="l00129"></a><span class="lineno">  129</span>  <span class="keywordflow">return</span> bfs_max_width_;</div>
+<div class="line"><a id="l00130" name="l00130"></a><span class="lineno">  130</span>}</div>
+</div>
+<div class="line"><a id="l00131" name="l00131"></a><span class="lineno">  131</span> </div>
+<div class="foldopen" id="foldopen00132" data-start="{" data-end="}">
+<div class="line"><a id="l00132" name="l00132"></a><span class="lineno"><a class="line" href="namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa">  132</a></span><span class="keyword">inline</span> <span class="keywordtype">int</span> <a class="code hl_function" href="namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa">max_ops_per_buffer</a>() {</div>
+<div class="line"><a id="l00133" name="l00133"></a><span class="lineno">  133</span>  <span class="keyword">static</span> <span class="keywordtype">int</span> max_ops_per_buffer_ = <a class="code hl_function" href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">get_var</a>(<span class="stringliteral">&quot;MLX_MAX_OPS_PER_BUFFER&quot;</span>, 10);</div>
+<div class="line"><a id="l00134" name="l00134"></a><span class="lineno">  134</span>  <span class="keywordflow">return</span> max_ops_per_buffer_;</div>
+<div class="line"><a id="l00135" name="l00135"></a><span class="lineno">  135</span>}</div>
+</div>
+<div class="line"><a id="l00136" name="l00136"></a><span class="lineno">  136</span> </div>
+<div class="line"><a id="l00137" name="l00137"></a><span class="lineno">  137</span>} <span class="comment">// namespace env</span></div>
+</div>
+<div class="line"><a id="l00138" name="l00138"></a><span class="lineno">  138</span> </div>
+<div class="line"><a id="l00139" name="l00139"></a><span class="lineno">  139</span>} <span class="comment">// namespace mlx::core</span></div>
 <div class="ttc" id="aarray_8h_html"><div class="ttname"><a href="array_8h.html">array.h</a></div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html"><div class="ttname"><a href="classmlx_1_1core_1_1array.html">mlx::core::array</a></div><div class="ttdef"><b>Definition</b> array.h:20</div></div>
 <div class="ttc" id="aclassmlx_1_1core_1_1array_html_ae29e7d6fbfbea1e5e321a8d1ea3cfacd"><div class="ttname"><a href="classmlx_1_1core_1_1array.html#ae29e7d6fbfbea1e5e321a8d1ea3cfacd">mlx::core::array::dtype</a></div><div class="ttdeci">Dtype dtype() const</div><div class="ttdoc">Get the arrays data type.</div><div class="ttdef"><b>Definition</b> array.h:127</div></div>
@@ -239,6 +261,9 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="agroup__ops_html_ga7fed87d96cc7741d8267f4eac83f5fe7"><div class="ttname"><a href="group__ops.html#ga7fed87d96cc7741d8267f4eac83f5fe7">mlx::core::max</a></div><div class="ttdeci">array max(const array &amp;a, bool keepdims, StreamOrDevice s={})</div><div class="ttdoc">The maximum of all elements of the array.</div></div>
 <div class="ttc" id="agroup__ops_html_gab27599802617a4c8f9964ab5f4ffee12"><div class="ttname"><a href="group__ops.html#gab27599802617a4c8f9964ab5f4ffee12">mlx::core::min</a></div><div class="ttdeci">array min(const array &amp;a, bool keepdims, StreamOrDevice s={})</div><div class="ttdoc">The minimum of all elements of the array.</div></div>
 <div class="ttc" id="agroup__ops_html_gad656c30f9fd7d9467e405657b325aa7e"><div class="ttname"><a href="group__ops.html#gad656c30f9fd7d9467e405657b325aa7e">mlx::core::operator&lt;&lt;</a></div><div class="ttdeci">array operator&lt;&lt;(const array &amp;a, const array &amp;b)</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_1_1env_html_a0efecbf9efe695adafad12b5a4945df3"><div class="ttname"><a href="namespacemlx_1_1core_1_1env.html#a0efecbf9efe695adafad12b5a4945df3">mlx::core::env::get_var</a></div><div class="ttdeci">int get_var(const char *name, int default_value)</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_1_1env_html_ac3266e1259a64c8b56bdc6c7029179f2"><div class="ttname"><a href="namespacemlx_1_1core_1_1env.html#ac3266e1259a64c8b56bdc6c7029179f2">mlx::core::env::bfs_max_width</a></div><div class="ttdeci">int bfs_max_width()</div><div class="ttdef"><b>Definition</b> utils.h:127</div></div>
+<div class="ttc" id="anamespacemlx_1_1core_1_1env_html_aedbf4e739553024c33dd0094dd9107aa"><div class="ttname"><a href="namespacemlx_1_1core_1_1env.html#aedbf4e739553024c33dd0094dd9107aa">mlx::core::env::max_ops_per_buffer</a></div><div class="ttdeci">int max_ops_per_buffer()</div><div class="ttdef"><b>Definition</b> utils.h:132</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html"><div class="ttname"><a href="namespacemlx_1_1core.html">mlx::core</a></div><div class="ttdef"><b>Definition</b> allocator.h:7</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a0181b5d72bf3d34448dabc70f7ff858d"><div class="ttname"><a href="namespacemlx_1_1core.html#a0181b5d72bf3d34448dabc70f7ff858d">mlx::core::normalize_axis</a></div><div class="ttdeci">int normalize_axis(int axis, int ndim)</div><div class="ttdoc">Returns the axis normalized to be in the range [0, ndim).</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_a0196171cfe6ee2953113abce597dc815"><div class="ttname"><a href="namespacemlx_1_1core.html#a0196171cfe6ee2953113abce597dc815">mlx::core::default_device</a></div><div class="ttdeci">const Device &amp; default_device()</div></div>
@@ -256,7 +281,7 @@ $(function(){ initResizable(false); });
 <div class="ttc" id="anamespacemlx_1_1core_html_af35a2b06517d8bb7dbb469692b4f841c"><div class="ttname"><a href="namespacemlx_1_1core.html#af35a2b06517d8bb7dbb469692b4f841c">mlx::core::set_default_stream</a></div><div class="ttdeci">void set_default_stream(Stream s)</div><div class="ttdoc">Make the stream the default for its device.</div></div>
 <div class="ttc" id="anamespacemlx_1_1core_html_af5a408a78cc934717dd711ddfda58ea6"><div class="ttname"><a href="namespacemlx_1_1core.html#af5a408a78cc934717dd711ddfda58ea6">mlx::core::global_formatter</a></div><div class="ttdeci">PrintFormatter global_formatter</div></div>
 <div class="ttc" id="astream_8h_html"><div class="ttname"><a href="stream_8h.html">stream.h</a></div></div>
-<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:54</div></div>
+<div class="ttc" id="astruct___m_l_x___b_float16_html"><div class="ttname"><a href="struct___m_l_x___b_float16.html">_MLX_BFloat16</a></div><div class="ttdef"><b>Definition</b> bf16.h:48</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1___m_l_x___float16_html"><div class="ttname"><a href="structmlx_1_1core_1_1___m_l_x___float16.html">mlx::core::_MLX_Float16</a></div><div class="ttdef"><b>Definition</b> fp16.h:21</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1_device_html"><div class="ttname"><a href="structmlx_1_1core_1_1_device.html">mlx::core::Device</a></div><div class="ttdef"><b>Definition</b> device.h:7</div></div>
 <div class="ttc" id="astructmlx_1_1core_1_1_dtype_html"><div class="ttname"><a href="structmlx_1_1core_1_1_dtype.html">mlx::core::Dtype</a></div><div class="ttdef"><b>Definition</b> dtype.h:13</div></div>